Compare commits

4576 Commits

Author SHA1 Message Date
98af96a156 Release: v4.2.2 2021-01-21 09:06:41 +01:00
0ed8bccc9c Fix GPT conversion script (#9676) 2021-01-21 09:00:32 +01:00
c5f6719040 Fix imports in conversion scripts (#9674) 2021-01-21 09:00:26 +01:00
21d45958af [TF Led] Fix wrong decoder attention mask behavior (#9601)
* fix tf led

* remove loop file
2021-01-21 08:57:35 +01:00
4d4d2ce135 Remove branch overload in conda yml 2021-01-14 14:27:36 +01:00
1528b1007f Upload v4.2.1 to anaconda 2021-01-14 14:24:32 +01:00
236cc365af Release: v4.2.1 2021-01-14 14:17:56 +01:00
5b05321b56 BatchEncoding.to with device with tests (#9584) 2021-01-14 14:08:40 +01:00
412d878c5e Compliancy with tf-nightly (#9570)
* Compliancy with tf-nightly

* Add more version + restore min version check
2021-01-14 14:08:30 +01:00
59fbd64b1c Fix Trainer with a parallel model (#9578)
* Fix Trainer with a parallel model

* More clean up
2021-01-14 14:08:19 +01:00
7d9a9d0c72 Release: v4.2.0 2021-01-13 16:01:51 +01:00
c949516695 Fix slow tests v4.2.0 (#9561)
* Fix conversational pipeline test

* LayoutLM

* ProphetNet

* BART

* Blenderbot & small

* Marian

* mBART

* Pegasus

* Tapas tokenizer

* BERT2BERT test

* Style

* Example requirements

* TF BERT2BERT test
2021-01-13 09:55:48 -05:00
04dc65e5c6 Fix data parallelism in Trainer (#9566)
* Fix data parallelism in Trainer

* Update src/transformers/training_args.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-13 09:54:41 -05:00
b2dfcc567b use correct deps for torchhub (#9552) 2021-01-13 08:02:53 -05:00
eabad8fd9c Update run_glue for do_predict with local test data (#9442) (#9486)
* Update run_glue for do_predict with local test data (#9442)

* Update run_glue (#9442): fix comments ('files' to 'a file')

* Update run_glue (#9442): reflect the code review

* Update run_glue (#9442): auto format

* Update run_glue (#9442): reflect the code review
2021-01-13 07:48:35 -05:00
0c9f01a8e5 Speed up TopKLogitsWarper and TopPLogitsWarper (pytorch) (#9557)
* make TopKLogitsWarper faster

* make TopPLogitsWarper faster
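
For illustration, a minimal sketch of what these warpers do, assuming the `TopKLogitsWarper`/`TopPLogitsWarper` classes exported by the library:

```
# Hedged sketch: filter fake next-token logits before sampling.
import torch
from transformers import TopKLogitsWarper, TopPLogitsWarper

input_ids = torch.tensor([[0]])  # dummy prefix; these warpers ignore it
scores = torch.randn(1, 50257)   # fake next-token logits
scores = TopKLogitsWarper(top_k=50)(input_ids, scores)   # keep the 50 best tokens
scores = TopPLogitsWarper(top_p=0.9)(input_ids, scores)  # nucleus (top-p) filtering
```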
2021-01-13 07:47:47 -05:00
27d0e01d75 Fix classification script: enable dynamic padding with truncation (#9554)
Co-authored-by: Pavel Tarashkevich <Pavel.Tarashkievich@orange.com>
2021-01-13 07:46:48 -05:00
245cdb469d Fix barthez tokenizer (#9562) 2021-01-13 06:24:10 -05:00
247a7b2029 Doc: Update pretrained_models wording (#9545)
* Update pretrained_models.rst

To clarify things cf. this tweet for instance https://twitter.com/RTomMcCoy/status/1349094111505211395

* format
2021-01-13 05:58:05 -05:00
69ed36063a fix BlenderbotSmallTokenizer (#9538)
* add model_input_names

* fix test
2021-01-13 10:53:43 +05:30
2df34f4aba [trainer] deepspeed integration (#9211)
* deepspeed integration

* style

* add test

* ds wants to do its own backward

* fp16 assert

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* style

* for clarity extract what args are being passed to deepspeed

* introduce the concept of self.wrapped_model

* s/self.wrapped_model/self.model_wrapped/

* complete transition to self.wrapped_model / self.model

* fix

* doc

* give ds its own init

* add custom overrides, handle bs correctly

* fix test

* clean up model_init logic, fix small bug

* complete fix

* collapse --deepspeed_config into --deepspeed

* style

* start adding doc notes

* style

* implement hf2ds optimizer and scheduler configuration remapping

* oops

* call get_num_training_steps absolutely when needed

* workaround broken auto-formatter

* deepspeed_config arg is no longer needed - fixed in deepspeed master

* use hf's fp16 args in config

* clean

* start on the docs

* rebase cleanup

* finish up --fp16

* clarify the supported stages

* big refactor thanks to discovering deepspeed.init_distributed

* cleanup

* revert fp16 part

* add checkpoint-support

* more init ds into integrations

* extend docs

* cleanup

* unfix docs

* clean up old code

* imports

* move docs

* fix logic

* make it clear which file it's referring to

* document nodes/gpus

* style

* wrong format

* style

* deepspeed handles gradient clipping

* easier to read

* major doc rewrite

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* docs

* switch to AdamW optimizer

* style

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* clarify doc

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
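
A hedged sketch of how the collapsed flag is used from Python (the config file name is illustrative):

```
# Hedged sketch: DeepSpeed is enabled via a single `deepspeed` argument that
# points at a DeepSpeed JSON config ("ds_config.json" is illustrative);
# training is then launched with the `deepspeed` launcher instead of `python`.
from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", deepspeed="ds_config.json", fp16=True)
```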
2021-01-12 19:05:18 -08:00
5f6721032a Use the right version of tokenizers (#9550)
* Use the right version of tokenizers

* Try another way

* Try another way

* Deps are installed from there...

* Deps are installed from there...

* Revert last

* remove needless comment
2021-01-12 18:55:45 -05:00
063d8d27f4 Refactor prepare_seq2seq_batch (#9524)
* Add target contextmanager and rework prepare_seq2seq_batch

* Fix tests, treat BART and Barthez

* Add last tokenizers

* Fix test

* Set src token before calling the superclass

* Remove special behavior for T5

* Remove needless imports

* Remove needless asserts
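
A hedged usage sketch of the reworked API (the Marian checkpoint is only an example):

```
# Hedged sketch: tokenize source and target texts in one call.
from transformers import MarianTokenizer

tok = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
batch = tok.prepare_seq2seq_batch(
    src_texts=["I love coffee."],
    tgt_texts=["Ich liebe Kaffee."],
    return_tensors="pt",
)
```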
2021-01-12 18:19:38 -05:00
e6ecef711e Revert, it was not the issue. 2021-01-12 18:00:22 -05:00
250f27f207 Fix tokenizers install for now 2021-01-12 17:50:27 -05:00
dfbf0f5598 topk -> top_k (#9541) 2021-01-12 16:21:29 -05:00
a1100fac67 LayoutLM Config (#9539) 2021-01-12 10:03:50 -05:00
e45eba3b1c Improve LayoutLM (#9476)
* Add LayoutLMForSequenceClassification and integration tests

Improve docs

Add LayoutLM notebook to list of community notebooks

* Make style & quality

* Address comments by @sgugger, @patrickvonplaten and @LysandreJik

* Fix rebase with master

* Reformat in one line

* Improve code examples as requested by @patrickvonplaten

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-12 09:26:32 -05:00
ccd1923f46 [T5] enable T5 fp16 (#9487)
* fix t5 fp16
2021-01-12 17:12:33 +05:30
2aa9c2f204 fix blenderbot tok (#9532) 2021-01-12 05:53:32 -05:00
406cbf58b2 Shouldn't stale issues/PRs with feature request label (#9511) 2021-01-12 04:49:15 -05:00
3b67c5abb0 Update 'Develop on Windows' guidelines (#9519) 2021-01-12 04:15:16 -05:00
a051d8928a [ProphetNet] Fix naming and wrong config (#9514)
* fix naming issues

* better names
2021-01-12 04:10:05 -05:00
7f28613213 [TFBart] Split TF-Bart (#9497)
* make templates ready

* make add_new_model_command_ready

* finish tf bart

* prepare tf mbart

* finish tf bart

* add tf mbart

* add marian

* prep pegasus

* add tf pegasus

* push blenderbot tf

* add blenderbot

* add blenderbot small

* clean-up

* make fix copy

* define blend bot tok

* fix

* up

* make style

* add to docs

* add copy statements

* overwrite changes

* improve

* fix docs

* finish

* fix last slow test

* fix missing git conflict line

* fix blenderbot

* up

* fix blenderbot small

* load changes

* finish copied from

* upload fix
2021-01-12 02:06:32 +01:00
0ecbb69806 [make docs] parallel build (#9522)
After experimenting with different numbers of workers (https://github.com/huggingface/transformers/issues/9496#issuecomment-758145868), 4-5 workers seem to be optimal - let's go with 4, since we are unlikely to find a CPU with fewer cores these days.

Fixes part of https://github.com/huggingface/transformers/issues/9496

@sgugger
2021-01-11 13:00:08 -08:00
e6f211cade [trainer] round numbers in trainer state (#9491)
* round numbers

* style

* round only on logging
2021-01-11 10:17:49 -08:00
01a1684078 Make doc styler behave properly on Windows (#9516) 2021-01-11 10:25:24 -05:00
6009668c63 Add link to forums thread 2021-01-11 10:00:59 -05:00
ba702966ba Fix cardinality (#9505) 2021-01-11 09:42:19 -05:00
33b7422839 [trainer] remove --model_parallel (#9451)
* fix bad merge - dropped code

* remove --model_parallel

* Deal with TrainingArguments

* Use a private attr and fix batch sizes

* fix _n_gpu

* add is_parallel helper wrapper

* fix attribute

* introduce a new attribute is_model_parallel

* docs

* docs

* Put back init False and rearrange doc

* Ignore non-init args in HFArgumentParser

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2021-01-11 09:39:28 -05:00
6f63501383 [doc] How To Request Support document stab (#9288)
* How To Request Support document stab

* integrate suggestions

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* small corrections

* expand on how to search for issues with examples

* address issues

* Update ISSUES.md

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* patrick's suggestion

* patrick's suggestion

* small fix

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-11 09:23:51 -05:00
d20e9c7299 Enable TruncationStrategy override for pipelines (#9432)
* Enable TruncationStrategy override for pipelines

* Update isort.

* Fixing test

* Fixing text_generation pipeline.

* Using same DummyTok as other PR for easier merge later.

* Some more import guards.

* Remove bogus file.

* Do not pass `generate_kwargs` to `_parse_and_tokenize`.
@patrickvonplaten

* Removed DummyTok.

* Doc quality.
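
A hedged sketch of the override, assuming the `truncation` parameter on text2text pipelines:

```
# Hedged sketch: pass an explicit truncation strategy to a pipeline call.
from transformers import pipeline
from transformers.tokenization_utils_base import TruncationStrategy

summarizer = pipeline("summarization")
out = summarizer("A very long article. " * 200, truncation=TruncationStrategy.ONLY_FIRST)
```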
2021-01-11 09:23:28 -05:00
8d25df2c7a Make doc styler detect lists on rst (#9488) 2021-01-11 08:53:41 -05:00
5a442a8db1 New Updated DistilGPT-2 Finetuning and Generation (#9494)
https://github.com/huggingface/transformers/pull/3177
2021-01-11 14:34:39 +01:00
6c8ec2a931 fix tf led pt test (#9513) 2021-01-11 14:14:48 +01:00
1e3c362235 Fix template (#9512) 2021-01-11 08:03:28 -05:00
d415882b41 Remove tolerance + drop_rows_to_fit by default (#9507)
* Remove tolerance + drop_rows_to_fit by default

* remove drop_rows_to_fit
2021-01-11 08:02:41 -05:00
1243ee7d0c Full rework of the TF input/output embeddings and bias resizing (#9193)
* Start rework resizing

* Rework bias/decoder resizing

* Full resizing rework

* Full resizing rework

* Start to update the models with the new approach

* Finish to update the models

* Update all the tests

* Update the template

* Fix tests

* Fix tests

* Test a new approach

* Refactoring

* Refactoring

* Refactoring

* New rework

* Rework BART

* Rework bert+blenderbot

* Rework CTRL

* Rework Distilbert

* Rework DPR

* Rework Electra

* Rework Flaubert

* Rework Funnel

* Rework GPT2

* Rework Longformer

* Rework Lxmert

* Rework marian+mbart

* Rework mobilebert

* Rework mpnet

* Rework openai

* Rework pegasus

* Rework Roberta

* Rework T5

* Rework xlm+xlnet

* Rework template

* Fix TFT5EncoderOnly + DPRs

* Restore previous methods

* Fix Funnel

* Fix CTRL and TransfoXL

* Apply style

* Apply Sylvain's comments

* Restore a test in DPR

* Address the comments

* Fix bug

* Apply style

* remove unused import

* Fix test

* Forgot a method

* missing test

* Trigger CI

* naming update

* Rebase

* Trigger CI
2021-01-11 06:27:28 -05:00
cf416764f4 Fix template (#9504) 2021-01-11 05:21:25 -05:00
09926c8e86 fix-template (#9499)
Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
2021-01-10 20:34:17 -05:00
4f7022d68d Reformat (#9482) 2021-01-10 15:10:15 +01:00
96f1f74aaf Fixing tests. It seems master changed something in the warnings. (#9483)
Trying to keep warning tests for now. Should be discarded if it becomes
too hard to maintain.
2021-01-10 15:08:20 +01:00
1c19b423bf fix(wandb): fix config (#9489) 2021-01-08 14:32:02 -05:00
02e05fb0a5 Making Conversation possible to create directly a full conversation (#9434)
* Cleaning up conversation tests.

* Adding tests that don't require downloading models + conversation can be fully created from static state.

* Making tests non-flaky (by fixing generation length)

* Bumping isort version.

* Doc cleanup.

* Remove unused test in this PR.

* Torch import guard for TF.

* Missing torch guard.

* Small mistake in doc.

* Actually uses `_history` and `_index` cache.

+ remove dead enumerate
+ improve warning message.

* Update src/transformers/pipelines/conversational.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/pipelines/conversational.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/pipelines/conversational.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Adding comments and cleaner code to address history copy.

* Improving pipeline name in tests.

* Change tokenizer to a real one (still created at runtime with no external dependency)

* Simplify DummyTok, reverse changes on tokenization.

* Removing DummyTok.

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
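
A hedged sketch of the new construction path (the turns are made up):

```
# Hedged sketch: a Conversation built directly with pre-existing history.
from transformers import Conversation

conv = Conversation(
    "Can you recommend another one?",
    past_user_inputs=["What is the best movie?"],
    generated_responses=["I like The Big Lebowski."],
)
```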
2021-01-08 14:33:25 +01:00
4fbcf8ea49 Fix TF input for np.ndarray (#9294)
* Fix input for np.ndarray

* add a test

* add a test

* Add a test

* Apply style

* Fix test
2021-01-08 08:23:29 -05:00
e34e45536f Makes HfArgumentParser compatible with Python 3.9 (#9479)
Python 3.9 changed the format of the string serialization of `typing.Optional`.
For example, `str(typing.Optional[str])` is
`typing.Union[str, NoneType]` in Python 3.8 and
`typing.Optional[str]` in Python 3.9.
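
A minimal illustration of the difference described above:

```
import typing

print(str(typing.Optional[str]))
# Python 3.8: typing.Union[str, NoneType]
# Python 3.9: typing.Optional[str]
```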
2021-01-08 08:10:44 -05:00
1bdf42409c Fast imports part 3 (#9474)
* New intermediate inits

* Update template

* Avoid importing torch/tf/flax in tokenization unless necessary

* Styling

* Shutup flake8

* Better python version check
2021-01-08 07:40:59 -05:00
79bbcc5260 [Generation] Fix bug for manual decoder_input_ids + warning message (#9472)
* up

* improve style
2021-01-08 05:50:39 -05:00
9e1ea846bc [README] Add new models (#9465)
* add new models

* make fix-copies
2021-01-08 05:49:43 -05:00
bf9056442a Removing duplicated code for Translation,Summarization and Text2TextGeneration pipelines (#9433)
* Merging all duplicated code for Text2TextPipeline while preserving
backward compat.

* Fixing TranslationPipeline Hierarchy + return_name

* torch import guard.

* Update isort version.

* Remove code from other PR disentanglement.

* Removed named example to something more agnostic.
2021-01-07 23:10:16 +01:00
f33a6f3446 [TFGPT2] - Fix flaky past_key_values test (#9460)
* fix tf flaky test

* remove test files
2021-01-07 16:12:08 +01:00
758ed3332b Transformers fast import part 2 (#9446)
* Main init work

* Add version

* Change from absolute to relative imports

* Fix imports

* One more typo

* More typos

* Styling

* Make quality script pass

* Add necessary replace in template

* Fix typos

* Spaces are ignored in replace for some reason

* Forgot one model.

* Fixes for import

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>

* Add documentation

* Styling

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2021-01-07 09:36:14 -05:00
a400fe8931 [LED Test] fix common inputs pt for flaky pt-tf led test (#9459)
* fix common inputs pt for flaky led

* fix other tests correspondingly
2021-01-07 12:29:03 +01:00
ae5a32bb0d up (#9454) 2021-01-07 11:51:02 +01:00
812045adcc New serving (#9419)
* Add a serving method

* Add albert

* Add serving for BERT and BART

* Add more models

* Finish the serving addition

* Temp fix

* Restore DPR

* Fix funnel attribute

* Fix attributes GPT2

* Fix OpenAIGPT attribute

* Fix T5 attributes

* Fix Bart attributes

* Fix TransfoXL attributes

* Add versioning

* better test

* Update template

* Fix Flaubert

* Fix T5

* Apply style

* Remove unused imports

* Deactivate extra parameters

* Remove too long test + saved_model default to False

* Ignore the saved model test for some models

* Fix some inputs

* Fix mpnet serving

* Trigger CI

* Address all comments
2021-01-07 11:48:49 +01:00
390cf16bc8 Prophetnet optimization (#9453)
* Vectorized `ngram_attention_bias` calculation

* updated formatting with black

* Further optimization

* one (last) optimization
2021-01-07 11:41:58 +01:00
28d74872cc a more reliable version of branching point discovery (#9449) 2021-01-07 04:47:50 -05:00
3ec40299c1 Remove nested lxmert (#9440) 2021-01-07 04:10:41 -05:00
b8462b5b2a [GenerationOutputs] Fix GenerationOutputs Tests (#9443)
* fix generation models

* fix led

* fix docs

* add is_decoder

* fix last docstrings

* make style

* fix t5 cross attentions

* correct t5
2021-01-06 19:37:02 +01:00
0c96262f7d Fast transformers import part 1 (#9441)
* Don't import libs to check they are available

* Don't import integrations at init

* Add importlib_metadata to deps

* Remove old vars references

* Avoid syntax error

* Adapt testing utils

* Try to appease torchhub

* Add dependency

* Remove more private variables

* Fix typo

* Another typo

* Refine the tf availability test
2021-01-06 12:17:24 -05:00
c89f1bc92e Add flags to return scores, hidden states and / or attention weights in GenerationMixin (#9150)
* Define new output dataclasses for greedy generation

* Add output_[...] flags in greedy generation methods

Added output_attentions, output_hidden_states, output_scores flags in
generate and greedy_search methods in GenerationMixin.

* [WIP] Implement logic and tests for output flags in generation

* Update GreedySearchOutput classes & docstring

* Implement greedy search output accumulation logic

Update greedy_search unittests

Fix generate method return value docstring

Properly init flags with the default config

* Update configuration to add output_scores flag

* Fix test_generation_utils

Sort imports and fix isinstance tests for GreedySearchOutputs

* Fix typo in generation_utils

* Add return_dict_in_generate for backwards compatibility

* Add return_dict_in_generate flag in config

* Fix typo in configuration

* Fix handling of attentions and hidden_states flags

* Make style & quality

* first attempt attentions

* some corrections

* improve tests

* special models require special tests

* disable xlm test for now

* clean tests

* fix for tf

* isort

* Add output dataclasses for other generation methods

* Add logic to return dict in sample generation

* Complete test for sample generation

- Pass output_attentions and output_hidden_states flags to encoder in
encoder-decoder models
- Fix import statements order in test_generation_utils file

* Add logic to return dict in sample generation

- Refactor tests to avoid using self.assertTrue, which provides
scarce information when the test fails
- Add tests for the three beam_search methods: vanilla, sample and
grouped

* Style doc

* Fix copy-paste error in generation tests

* Rename logits to scores and refactor

* Refactor group_beam_search for consistency

* make style

* add sequences_scores

* fix all tests

* add docs

* fix beam search finalize test

* correct docstring

* clean some files

* Made suggested changes to the documentation

* Style doc ?

* Style doc using the Python util

* Update src/transformers/generation_utils.py

* fix empty lines

* fix all test

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
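
A hedged usage sketch of the new flags:

```
# Hedged sketch: ask generate() for a structured output with scores,
# attentions, and hidden states.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
inputs = tokenizer("Hello", return_tensors="pt")
out = model.generate(
    inputs.input_ids,
    return_dict_in_generate=True,
    output_scores=True,
    output_attentions=True,
    output_hidden_states=True,
)
print(out.sequences.shape, len(out.scores))
```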
2021-01-06 17:11:42 +01:00
7a9f1b5c99 Store transformers version info when saving the model (#9421)
* Store transformers version info when saving the model

* Store transformers version info when saving the model

* fix format

* fix format

* fix format

* Update src/transformers/configuration_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update configuration_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
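
A hedged sketch of the effect:

```
# Hedged sketch: the saved config.json should now carry a
# "transformers_version" entry.
from transformers import BertConfig

BertConfig().save_pretrained("tmp")  # tmp/config.json includes the version
```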
2021-01-06 23:34:48 +08:00
ecfcac223c Improve documentation coverage for Phobert (#9427)
* first commit

* change phobert to phoBERT as per author in overview

* v3 and v4 both run on the same code, hence there is no need to differentiate them

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-01-06 10:04:32 -05:00
be898998bb Improve documentation coverage for Herbert (#9428)
* first commit

* changed XLMTokenizer to HerbertTokenizer in code example
2021-01-06 09:13:43 -05:00
b972c1bfb0 finalize (#9431) 2021-01-06 14:36:55 +01:00
bcb55d33ce Upgrade styler to better handle lists (#9423)
* Add missing lines before a new list.

* Update doc styler and restyle some files.

* Fix docstrings of LED and Longformer
2021-01-06 07:46:17 -05:00
b7e548976f Fix URLs to TAPAS notebooks (#9435) 2021-01-06 07:20:41 -05:00
9f675b05d4 [trainer] self.model_wrapped + _model_unwrap (#9390)
* model wrapped + model_unwrap

* cleanup

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* style

* deprecation warning

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-01-06 06:50:11 -05:00
453a70d4cb Allow example to use a revision and work with private models (#9407)
* Allow example to use a revision and work with private models

* Copy to other examples and template

* Styling
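
A hedged sketch of the two from_pretrained options the examples now pass through (repo name and revision are illustrative):

```
# Hedged sketch: pin a revision and authenticate for private models.
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "bert-base-uncased",
    revision="main",      # a tag, branch, or commit hash
    use_auth_token=True,  # needed for private repos, after `transformers-cli login`
)
```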
2021-01-06 06:49:23 -05:00
7988edc031 Fix link to Notebook to fine-tune TAPAS (#9413)
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-01-06 03:44:52 -05:00
c9553c0352 Fix link to Evaluate TAPAS Notebook (#9414) 2021-01-06 03:42:50 -05:00
090d28e32d [Refactor] Splitting pipelines.py into its own module. (#9279)
* Splitting pipelines into its own module.

* Moving everything into base.py

* Moving FeatureExtractionPipeline into its own file.

* TextGenerationPipeline.

* TextClassifictionPipeline

* ZeroShot + get_framework import.

* FillMaskPipeline

* NerPipeline + TokenClassificationPipeline

* QuestionAnsweringPipeline

* TableQuestionAnsweringPipeline

* ConversationnalPipeline

* Text2TextGenerationPipeline, TranslationPipeline, SummarizationPipeline

* Typo import fix.

* Relative imports.
2021-01-06 09:33:50 +01:00
d64372fdfc [docs] outline sharded ddp doc (#9208)
* outline sharded ddp doc

* fix link

* add example

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* narrow the command and remove non-essentials

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-01-05 17:34:15 -08:00
eef66035a2 [PyTorch Bart] Split Bart into different models (#9343)
* first try

* remove old template

* finish bart

* finish mbart

* delete unnecessary line

* init pegasus

* save intermediate

* correct pegasus

* finish pegasus

* remove cookie cutter leftover

* add marian

* finish blenderbot

* replace in file

* correctly split blenderbot

* delete "old" folder

* correct "add statement"

* adapt config for tf comp

* correct configs for tf

* remove ipdb

* fix more stuff

* fix mbart

* push pegasus fix

* fix mbart

* more fixes

* fix research projects code

* finish docs for bart, mbart, and marian

* delete unnecessary file

* correct attn typo

* correct configs

* remove pegasus for seq class

* correct peg docs

* correct peg docs

* finish configs

* further improve docs

* add copied from statements to mbart

* fix copied from in mbart

* add copy statements to marian

* add copied from to marian

* add pegasus copied from

* finish pegasus

* finish copied from

* Apply suggestions from code review

* make style

* backward comp blenderbot

* apply Lysandre's and Sylvain's suggestions

* apply suggestions

* push last fixes

* fix docs

* fix tok tests

* fix imports code style

* fix doc
2021-01-05 22:00:05 +01:00
4eec5d0cf6 improve readme text to private models/versioning/api (#9424) 2021-01-05 15:02:46 -05:00
d9e848c1d6 add experimental warning (#9412) 2021-01-05 10:05:32 -05:00
29acabd886 [trainer] group fp16 args together (#9409)
* [t5 doc] typos

a few runaway backticks

@sgugger

* style

* [trainer] put fp16 args together

this PR proposes a purely cosmetic change that puts all the fp16 args together - so they are easier to manage/read

@sgugger

* style
2021-01-05 09:39:38 -05:00
57a6626929 [examples/text-classification] Fix a bug for using one's own dataset of a regression task (#9411) 2021-01-05 08:15:06 -05:00
189387e9b2 LED (#9278)
* create model

* add integration

* save current state

* make integration tests pass

* add one more test

* add explanation to tests

* remove from bart

* add padding

* remove unnecessary test

* make all tests pass

* re-add cookie cutter tests

* finish PyTorch

* fix attention test

* Update tests/test_modeling_common.py

* revert change

* remove unused file

* add string to doc

* save intermediate

* make tf integration tests pass

* finish tf

* fix doc

* fix docs again

* add led to doctree

* add to auto tokenizer

* added tips for led

* make style

* apply jplus statements

* correct tf longformer

* apply Lysandre's suggestions

* apply Sylvain's suggestions

* Apply suggestions from code review
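
A hedged usage sketch for the new model (the checkpoint is the one referenced in the LED docs):

```
# Hedged sketch: summarize a long document with LED.
from transformers import LEDForConditionalGeneration, LEDTokenizer

tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")
inputs = tokenizer("A very long document. " * 500, return_tensors="pt", truncation=True)
summary_ids = model.generate(inputs.input_ids, num_beams=2, max_length=64)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```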
2021-01-05 13:14:30 +01:00
314cca2842 Fix documentation links always pointing to master. (#9217)
* Use extlinks to point hyperlink with the version of code

* Point to version on release and master until then

* Apply style

* Correct links

* Add missing backtick

* Simple missing backtick after all.

Co-authored-by: Raghavendra Sugeeth P S <raghav-5305@raghav-5305.csez.zohocorpin.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2021-01-05 06:18:48 -05:00
52d62e686c Fix TF Funnel (#9300)
* Fix Funnel

* Apply Patrick's comment

* Remove comment

* Fix dummy value

* Apply style
2021-01-05 05:54:50 -05:00
748006c0b3 [trainer] --model_parallel hasn't been implemented for most models (#9347)
* --model_parallel hasn't been implemented for most models

* make the help clear as well

* implement is_parallelizable; use it

* oops

* remove property
2021-01-05 04:01:30 -05:00
4225740a7b Use stable functions (#9369) 2021-01-05 03:58:26 -05:00
4aa8f6ad99 [logging] autoflush (#9385)
This PR proposes to:

* auto-flush `transformers` logging 

When logging is used to trace signals from different parts of the code, and those traces can be mixed with print debugging, auto-flushing helps keep all the logging events synchronized.

I don't think this change will introduce any performance impact.

If it helps someone, here is the code I used to sync `transformers` logging with various other debug prints.

I was porting Bart to MP and needed to verify that the device switching happens correctly, so I added a bunch of logger.info calls inside `modeling_bart.py`; I also had some other helpers `print` debug messages which weren't logger based:

```

# auto flush std streams
from sys import stdout, stderr
def stdout_write_flush(args, w=stdout.write): w(args); stdout.flush()
def stderr_write_flush(args, w=stderr.write): w(args); stderr.flush()
stdout.write = stdout_write_flush
stderr.write = stderr_write_flush

from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

import logging
import transformers.utils.logging
import transformers.models.bart.modeling_bart

# I wanted a shorter simpler format
handlers = transformers.utils.logging._get_library_root_logger().handlers
for handler in handlers:
    formatter = logging.Formatter("[%(funcName)s] %(message)s")
    handler.setFormatter(formatter)

transformers.models.bart.modeling_bart.logger.setLevel(transformers.logging.INFO)
```

@LysandreJik, @sgugger, @patrickvonplaten
2021-01-05 03:57:57 -05:00
83eec97ec6 Fix TF Longformer (#9348)
* Fix longformer

* Apply style

* Remove serving content

* Forgot a condition

* Apply style

* Address Patrick's comments

* Fix dtype
2021-01-05 03:49:54 -05:00
30fa0b780f feat(wandb): save model as artifact (#8119)
* feat(wandb): log artifacts

* fix: typo

* feat(wandb): ensure name is allowed

* feat(wandb): log artifact

* feat(wandb): saving logic

* style: improve formatting

* fix: unrelated typo

* feat: use a fake trainer

* fix: simplify

* feat(wandb): log model files as artifact

* style: fix style

* docs(wandb): correct description

* feat: unpack model + allow env Truthy values

* feat: TrainerCallback can access tokenizer

* style: fix style

* feat(wandb): log more interesting metadata

* feat: unpack tokenizer

* feat(wandb): metadata with load_best_model_at_end

* feat(wandb): more robust metadata

* style(wandb): fix formatting
2021-01-05 03:30:46 -05:00
143289dcf7 [test_model_parallelization] multiple fixes (#9354) 2021-01-04 12:09:12 -08:00
086718ac6e Improve documentation coverage for Bertweet (#9379)
* bertweet docs coverage

* style doc max len 119

* maxlen style rst

* run main() from style_doc

* changed according to comments
2021-01-04 13:12:59 -05:00
47ca0eaaac replace apex.normalization.FusedLayerNorm with torch.nn.LayerNorm (#9386) 2021-01-04 19:00:08 +01:00
75ff530551 correct docs (#9378) 2021-01-04 17:27:29 +01:00
ec54d70e16 Fix TF DPR (#9283)
* Fix DPR

* Keep usual models

* Apply style

* Address Sylvain's comments
2021-01-04 17:26:56 +01:00
de29ff9bd2 Fix open (#9368) 2021-01-04 10:22:15 -05:00
d018afced0 [trainer] parametrize default output_dir (#9352)
This PR:

* fixes trainer to have the logger agree with the actual default `output_dir`, by setting it in one place and passing it as an argument to both places

@sgugger
2021-01-04 10:14:32 -05:00
d735b074d7 Fix Flaubert (#9292) 2021-01-04 16:06:28 +01:00
5dd389d1c7 Bump notebook from 6.1.4 to 6.1.5 in /examples/research_projects/lxmert (#9402)
Bumps [notebook](https://github.com/jupyter/jupyterhub) from 6.1.4 to 6.1.5.
- [Release notes](https://github.com/jupyter/jupyterhub/releases)
- [Changelog](https://github.com/jupyterhub/jupyterhub/blob/master/CHECKLIST-Release.md)
- [Commits](https://github.com/jupyter/jupyterhub/commits)

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2021-01-04 10:02:07 -05:00
23a71449c0 Put back LXMert example (#9401) 2021-01-04 09:59:07 -05:00
6c03d4ac70 Fix CTRL (#9291) 2021-01-04 09:56:51 -05:00
c581d8af7a Add utility function for retrieving locally cached models (#8836)
* add get_cached_models function

* add List type to import

* fix code quality

* Update src/transformers/file_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/file_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/file_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/file_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/file_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
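
A hedged sketch, assuming the helper yields (url, etag, size) tuples for cached weights as described in the PR:

```
from transformers.file_utils import get_cached_models

for url, etag, size in get_cached_models():
    print(url, size)
```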
2021-01-04 09:53:54 -05:00
8eb7f26d5d simplify marian distillation script (#9394) 2021-01-04 11:21:24 +05:30
d944966b19 Fix typos in README and bugs in RAG example code for end-to-end evaluation and finetuning (#9355)
* fix a bug in eval_batch_retrieval

* should return parser as well as other staticmethod

* remove duplicate argument

* these kwargs are no longer accepted (cause TypeError in self.generator.generate of modeling_rag.py)

* fixed file paths in README

* moved an arg to add_ray_specific_args
2021-01-03 16:00:30 +01:00
c4fd609afb file_utils.py: TF examples outputs.last_hidden_states -> state (#9382) 2021-01-02 17:58:16 +01:00
b01f451ca3 [Docs] past_key_values return a tuple of tuple as a default (#9381)
* push

* make style
2021-01-02 15:55:07 +01:00
5f7a07c0c8 use return dict for rag encoder (#9363) 2021-01-02 12:39:14 +01:00
ae333d04b2 torch.cuda.is_available() is redundant as apex handles that internally (#9350) 2020-12-30 10:09:51 +01:00
8217d4e37f [prophetnet] wrong import (#9349)
```
python -c "from apex.normalization import FusedProphetNetLayerNorm"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
ImportError: cannot import name 'FusedProphetNetLayerNorm' from 'apex.normalization' (/home/stas/anaconda3/envs/main-38/lib/python3.8/site-packages/apex/normalization/__init__.py)
```
It looks like this code has never been tested, so it silently fails inside try/except.

Discovered this by accident in https://github.com/huggingface/transformers/issues/9338#issuecomment-752217708
2020-12-29 22:32:07 +01:00
912f6881d2 add import math (#9346) 2020-12-29 19:35:06 +01:00
785e52cd30 improve templates (#9342) 2020-12-29 16:48:44 +01:00
64103fb6be Fix TransfoXL (#9302) 2020-12-28 20:52:18 +01:00
d97d06d05f Fix TF T5 (#9301)
* Fix T5

* Fix test

* Fix test
2020-12-28 20:51:40 +01:00
83fdd252f6 [Seq2Seq Templates] Correct some TF-serving errors and add gradient checkpointing to PT by default. (#9334)
* correct tests

* correct shape and get_tf_activation

* more correction tf

* add gradient checkpointing to templates

* correct typo
2020-12-28 17:51:04 +01:00
8e74eca7f2 push (#9320) 2020-12-27 21:57:50 +01:00
61443cd7d9 [GPT2] Correct gradient checkpointing (#9308)
* correct gpt2

* fix gpt2

* fix use_cache ordering

* correct past tolerance

* fix for all cases

* style
2020-12-25 23:28:12 +01:00
21fc676645 add translation example (#9303)
* Created using Colaboratory

* mbart-training examples add

* link add

* Update description

Co-authored-by: Suraj Patil <surajp815@gmail.com>
2020-12-25 14:47:49 +05:30
52b3a05e83 [Bart doc] Fix outdated statement (#9299)
* fix bart doc

* fix docs
2020-12-24 14:47:53 +01:00
7777db159f Update tokenization_utils_base.py (#9293)
Missing "s" typo
2020-12-24 14:43:14 +01:00
71963a6633 fix typo in modeling_encoder_decoder.py (#9297)
* Update modeling_encoder_decoder.py

Fixed typo.

* typo

Co-authored-by: Suraj Patil <surajp815@gmail.com>
2020-12-24 14:38:08 +01:00
f3a3b91d6f Proposed Fix : [RagSequenceForGeneration] generate "without" input_ids (#9220)
* Create modeling_tf_dpr.py

* Add TFDPR

* Add back TFPegasus, TFMarian, TFMBart, TFBlenderBot

last commit accidentally deleted these 4 lines, so I restored them

* Add TFDPR

* Add TFDPR

* clean up some comments, add TF input-style doc string

* Add TFDPR

* Make return_dict=False the default

* Fix return_dict bug (in .from_pretrained)

* Add get_input_embeddings()

* Create test_modeling_tf_dpr.py

The current version already passes all 27 tests!
Please see the test run at:
https://colab.research.google.com/drive/1czS_m9zy5k-iSJbzA_DP1k1xAAC_sdkf?usp=sharing

* fix quality

* delete init weights

* run fix copies

* fix repo consis

* del config_class, load_tf_weights

They should be 'pytorch only'

* add config_class back

after removing it, a test failed ... so in the end only removing "use_tf_weights = None", per Lysandre's suggestion

* newline after .. note::

* import tf, np (Necessary for ModelIntegrationTest)

* slow_test from_pretrained with from_pt=True

At the moment we don't have TF weights (since we don't have an official TF model)
Previously, I did not run slow test, so I missed this bug

* Add simple TFDPRModelIntegrationTest

Note that this is just a test that TF and PyTorch give approx. the same output.
However, I could not test with the official DPR repo's output yet

* upload correct tf model

* remove position_ids as missing keys

* fix RagSeq generate with context_input_ids

fix RagSeq generate with context_input_ids

* apply style

* delete unused lines

* Add test_rag_sequence_generate_batch_from_context_input_ids

* Readability improved

* stylying

* Stylize

* typos

* add check_model_generate_from_context_input_ids

* make style

* Apply suggestions from code review

* make style2

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: patrickvonplaten <patrick@huggingface.co>
2020-12-24 13:38:00 +01:00
2a18b70998 enable cache by default (#9296) 2020-12-24 17:47:36 +05:30
6189ae9960 Fix typo in file_utils.py (#9289) 2020-12-24 13:48:33 +05:30
222dbdb203 allow integer device for BatchEncoding (#9271)
Fixes #9244

Co-authored-by: Jethro Kuan <jethro.kuan@bytedance.com>
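
A hedged sketch of the change (requires a CUDA device):

```
# Hedged sketch: BatchEncoding.to now also accepts an integer device index.
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
batch = tokenizer(["hello world"], return_tensors="pt")
batch = batch.to(0)  # equivalent to "cuda:0"
```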
2020-12-24 09:01:56 +01:00
6c091abef2 [Templates] Adapt Bert (#9284)
* adapt templates

* adapt config

* add test as well

* fix output type

* fix cache false naming

* finish tests

* last fix
2020-12-24 01:44:33 +01:00
88ef8893cd Add caching mechanism to BERT, RoBERTa (#9183)
* add past_key_values

* add use_cache option

* make mask before cutting ids

* adjust position_ids according to past_key_values

* flatten past_key_values

* fix positional embeds

* fix _reorder_cache

* set use_cache to false when not decoder, fix attention mask init

* add test for caching

* add past_key_values for Roberta

* fix position embeds

* add caching test for roberta

* add doc

* make style

* doc, fix attention mask, test

* small fixes

* address Patrick's comments

* input_ids shouldn't start with pad token

* use_cache only when decoder

* make consistent with bert

* make copies consistent

* add use_cache to encoder

* add past_key_values to tapas attention

* apply suggestions from code review

* make copies consistent

* add attn mask in tests

* remove copied from longformer

* apply suggestions from code review

* fix bart test

* nit

* simplify model outputs

* fix doc

* fix output ordering
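
A hedged usage sketch of the new cache (decoder-style BERT):

```
# Hedged sketch: reuse past_key_values across decoding steps.
from transformers import BertConfig, BertLMHeadModel, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased", is_decoder=True)
model = BertLMHeadModel.from_pretrained("bert-base-uncased", config=config)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

ids = tokenizer("The cat", return_tensors="pt").input_ids
out = model(ids, use_cache=True)
# next step: feed only the last token together with the cache
next_out = model(ids[:, -1:], past_key_values=out.past_key_values, use_cache=True)
```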
2020-12-23 23:01:32 +05:30
a1cb6e9866 Adapt to new name of label_smoothing_factor training arg (#9282) 2020-12-23 11:05:21 -05:00
bcc87c639f Minor documentation revisions from copyediting (#9266)
* typo: Revise "checkout" to "check out"

* typo: Change "seemlessly" to "seamlessly"

* typo: Close parentheses in "Using the tokenizer"

* typo: Add closing parenthesis to supported models aside

* docs: Treat ``position_ids`` as plural

Alternatively, the word "argument" could be added to make the subject singular.

* docs: Remove comma, making subordinate clause

* docs: Remove comma separating verb and direct object

* docs: Fix typo ("next" -> "text")

* docs: Reverse phrase order to simplify sentence

* docs: "quicktour" -> "quick tour"

* docs: "to throw" -> "from throwing"

* docs: Remove disruptive newline in padding/truncation section

* docs: "show exemplary" -> "show examples of"

* docs: "much harder as" -> "much harder than"

* docs: Fix typo "seach" -> "search"

* docs: Fix subject-verb disagreement in WordPiece description

* docs: Fix style in preprocessing.rst
2020-12-23 10:15:49 -05:00
d5db6c37d4 [Seq2Seq Templates] Fix check_repo.py templates file (#9277)
* add enc dec pt model to check repo

* fix indent
2020-12-23 11:40:20 +01:00
4bafc43b0e Fix param error (#9273)
TypeError: forward() got an unexpected keyword argument 'token_type_ids'
2020-12-23 11:34:57 +01:00
58e8a7611f Fix gpt2 document (#9272) 2020-12-23 11:34:15 +01:00
cbe63949d7 Model Templates for Seq2Seq (#9251)
* adapt cookie cutter

* fix copy past statement

* delete copy statements for now

* remove unused import from template

* make doc rst

* correct config docstring

* correct training

* correct inputs processing tf enc dec

* make style

* adapt templates

* clean tabs

* correct tensor -> Tensor naming

* correct indent

* correct templates

* fix the test

* break lines to avoid > 119

* Apply suggestions from code review
2020-12-22 23:41:20 +01:00
e6c1f1cad8 Revert renaming in finetune_trainer (#9262) 2020-12-22 15:42:34 -05:00
ab17758874 Add speed metrics to all example scripts + template (#9260) 2020-12-22 14:02:26 -05:00
5b5f7dd09c [hf_api] Fix incorrect typing 2020-12-22 19:52:47 +01:00
1558d191e6 Fix TF BART for saved model creation (#9252)
* Fix TF BART for saved model creation

* Apply style

* Update src/transformers/models/bart/modeling_tf_bart.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/bart/modeling_tf_bart.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Rework the fix

* Fix condition

* Apply style

* Fix condition

* Fix shape_list

* Apply Patrick's solution

* Apply Patrick's solution

* Rebase

* make tests pass

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-12-22 18:07:04 +01:00
37d6fb5d04 Fix link to bertabs/README.md (#9255) 2020-12-22 11:41:23 -05:00
189c1b91a6 Fix link to old language modeling script (#9254) 2020-12-22 11:40:47 -05:00
490b39e614 Seq2seq trainer (#9241)
* Add label smoothing in Trainer

* Add options for scheduler and Adafactor in Trainer

* Put Seq2SeqTrainer in the main lib

* Apply suggestions from code review

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Address review comments and adapt scripts

* Documentation

* Move test not using script to tests folder

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
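
A hedged sketch, assuming the `Seq2SeqTrainingArguments` added alongside the trainer:

```
# Hedged sketch: configure the now-built-in seq2seq training options.
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="out",
    predict_with_generate=True,  # use generate() for eval metrics
    label_smoothing_factor=0.1,  # the new label-smoothing option
)
```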
2020-12-22 11:33:44 -05:00
1fc7119181 Fix script that check objects are documented (#9259) 2020-12-22 11:12:58 -05:00
e9d77ccd5a [EncoderDecoder] Make tests more aggressive (#9256)
* add tests

* make style and fix bart bug

* fix bart past key value edge case

* correct tf bart test

* fix gpt2 tf

* fix t5 test
2020-12-22 17:00:04 +01:00
ec07da65e2 Update the README of the text classification example (#9237)
* Update the README of the text classification example

* Update examples/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Adapt comment from review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-12-21 15:23:40 -05:00
4eef5889ac Adding performer fine-tuning research example (#9239)
* added run_mlm_performer.py research example

* make style

* make style

* Added a README !
2020-12-21 21:19:41 +01:00
9a12b9696f [MPNet] Add slow to fast tokenizer converter (#9233)
* add converter

* delete unnecessary comments
2020-12-21 15:41:34 +01:00
f4432b7e01 add base model classes to bart subclassed models (#9230)
* add base model classes to  bart subclassed models

* add doc
2020-12-21 19:56:46 +05:30
08abdabda1 Fixed beam search generation for GPT2 and T5 (#9219) 2020-12-21 08:05:23 -05:00
161a6461db Fix TF template (#9234) 2020-12-21 13:52:16 +01:00
5a8a4eb187 Improve BERT-like models performance with better self attention (#9124)
* Improve BERT-like models attention layers

* Apply style

* Put back error raising instead of assert

* Update template

* Fix copies

* Apply raising ValueError in MPNet

* Restore the copy check for the Intermediate layer in Longformer

* Update longformer
2020-12-21 13:10:15 +01:00
6b034309ca fix warning (#9231) 2020-12-21 10:41:34 +01:00
a4b21cdd20 [RAG] Add Ray implementation for distributed retrieval (#9197)
* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* uncomment

* uncomment

* wip

* updates

* add docstring

* updates

* fix arg

* fixes

* add unit tests

* update readme

* update readme

* update finetune script

* update test

* add test

* add ray to test dependencies

* separate ray and ray tune

* formatting

* shutdown ray at end of test

* fix tests

* formatting

* formatting

* even more formatting

* address comments

* formatting

* add files

* Update examples/research_projects/rag/test_distributed_retriever.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* address comments

* addressing comments

Co-authored-by: Ubuntu <ubuntu@ip-172-31-21-208.us-west-2.compute.internal>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-12-21 10:39:30 +01:00
f38c4ad302 better logging and help (#9203) 2020-12-20 10:28:28 -08:00
e0e255be1f Added TF TransfoXL Sequence Classification (#9169)
* TF TransfoXL seq classification

* Update test_modeling_tf_transfo_xl.py

Added num_labels to config level

* TF TransfoXL seq classification

* Update test_modeling_tf_transfo_xl.py

Added num_labels to config level

* code refactor

* code refactor

* code refactor
2020-12-19 14:44:04 +01:00
6b850b671d [run_glue] add speed metrics (#9198)
* add speed metrics

* suggestions
2020-12-18 17:09:30 -08:00
3ff5e8955a [t5 doc] typos (#9199)
* [t5 doc] typos

a few runaway backticks

@sgugger

* style
2020-12-18 16:03:26 -08:00
291974c65c GPT-model attention heads pruning example (#9189)
* Pruning for GPT attn heads

* The code is formatted according to the transformers requirements

* Update run_prune_gpt.py

* Update run_prune_gpt.py
2020-12-18 16:32:10 -05:00
1198ba8fba Add timing inside Trainer (#9196)
* Add timing inside Trainer

* Fix tests

* Add n_objs for train

* Sort logs
2020-12-18 15:10:39 -05:00
9a25c5bd3a Add new run_swag example (#9175)
* Add new run_swag example

* Add check

* Add sample

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Very important change to make Lysandre happy

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-12-18 14:19:24 -05:00
3e56e2ce04 Fix typo 2020-12-18 10:11:07 -05:00
077a5dce32 Fix link to old SQUAD fine-tuning script (#9181) 2020-12-18 09:12:10 -05:00
84d5879eaf [setup] correct transformers version format (#9176)
setuptools has a pretty fixed expectation of version numbers.

This PR fixes the dev version number and adds a comment with the correct formats for future editors.

This fix removes the following warning on `make fixup|style|etc` or any other time `setup.py` is run:
```
setuptools/dist.py:452: UserWarning: Normalizing '4.2.0dev0' to '4.2.0.dev0'
  warnings.warn(tmpl.format(**locals()))
```
and the alternative:
```
/setuptools/dist.py:452: UserWarning: Normalizing '4.0.0-rc-1' to '4.0.0rc1'
```

Fixes: #8749

@LysandreJik, @sgugger
2020-12-18 08:55:55 -05:00
fd7b6a5274 fixed JSON error in run_qa with fp16 (#9186) 2020-12-18 07:53:23 -05:00
66a14a2f6f Fix link to old NER fine-tuning script (#9182) 2020-12-17 19:50:01 -05:00
f06d0fadc9 [trainer] apex fixes and tests (#9180) 2020-12-17 16:49:11 -08:00
467e9158b4 Added TF CTRL Sequence Classification (#9151)
* Added TF CTRL Sequence Classification

* code refactor
2020-12-17 18:10:57 -05:00
63841c559b add tests for the new sharded ddp fairscale integration (#9177) 2020-12-17 14:24:03 -08:00
bf713cdec7 setup.py development version 2020-12-17 11:29:31 -05:00
bd40345d3e v4.1.1 docs 2020-12-17 11:28:38 -05:00
bfa4ccf77d Release: v4.1.1 2020-12-17 11:25:49 -05:00
e0790cca78 Fix TAPAS doc 2020-12-17 11:25:05 -05:00
6d2e864db7 Put all models in the constants (#9170)
* Put all models in the constants

* Add Google AI mention in the main README
2020-12-17 11:23:21 -05:00
f83d9c8da7 v4.1.0 docs 2020-12-17 10:16:07 -05:00
f5438ab8a2 Release: v4.1.0 2020-12-17 10:04:55 -05:00
ac2c7e398f Remove erroneous character 2020-12-17 09:47:19 -05:00
77d6941e64 Fix gradient clipping for Sharded DDP (#9168)
* Fix gradient clipping for Sharded DDP

* Fix typos in comments
2020-12-17 09:44:24 -05:00
1aca3d6afa Add disclaimer to TAPAS rst file (#9167)
Co-authored-by: sgugger <sylvain.gugger@gmail.com>

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
2020-12-17 09:34:06 -05:00
dc9f245442 Torch scatter with torch 1.7.0 2020-12-16 13:48:57 -05:00
9a67185344 Experimental support for fairscale ShardedDDP (#9139)
* Experimental support for fairscale ShardedDDP

* Add import error if fairscale not available

* Address review comments

* Fix seq2seq trainer
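
A hedged sketch of the switch (a boolean flag at the time of this PR, with fairscale installed):

```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", sharded_ddp=True, fp16=True)
```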
2020-12-16 13:47:48 -05:00
1c1a2ffbff TableQuestionAnsweringPipeline (#9145)
* AutoModelForTableQuestionAnswering

* TableQuestionAnsweringPipeline

* Apply suggestions from Patrick's code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Sylvain and Patrick comments

* Better PyTorch/TF error message

* Add integration tests

* Argument Handler naming

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>

* Fix docs to appease the documentation gods

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
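
A hedged usage sketch of the new pipeline (the table contents are made up; requires the TAPAS extra dependencies such as torch-scatter):

```
from transformers import pipeline

tqa = pipeline("table-question-answering")
table = {
    "Repository": ["transformers", "datasets"],
    "Stars": ["36542", "4512"],
}
print(tqa(table=table, query="How many stars does the transformers repository have?"))
```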
2020-12-16 12:31:50 -05:00
07384baf7a AutoModelForTableQuestionAnswering (#9154)
* AutoModelForTableQuestionAnswering

* Update src/transformers/models/auto/modeling_auto.py

* Style
2020-12-16 12:14:33 -05:00
34334662df Add message to documentation that longformer doesn't support token_type_ids (#9152)
* Add message to documentation that longformer doesn't support token_type_ids

* Format changes
2020-12-16 11:06:14 -05:00
2f918defa8 hotfix torch scatter version 2020-12-16 10:26:13 -05:00
4d48973523 Update notebook table and transformers intro notebook (#9136) 2020-12-16 10:24:31 -05:00
fb650df859 Support for private models from huggingface.co (#9141)
* minor wording tweaks

* Create private model repo + exist_ok flag

* file_utils: `use_auth_token`

* Update src/transformers/file_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Propagate doc from @sgugger

Co-Authored-By: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
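
A hedged sketch of loading a private model after `transformers-cli login` (the repo name is illustrative):

```
from transformers import AutoModel

model = AutoModel.from_pretrained("my-org/my-private-model", use_auth_token=True)
```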
2020-12-16 10:09:57 -05:00
c69d19faa8 DistilBertForSequenceClassification (#9148)
fix small shape error in comments
2020-12-16 09:21:42 -05:00
640e6fe190 [Flax] Align FlaxBertForMaskedLM with BertForMaskedLM, implement from_pretrained, init (#9054)
* save intermediate

* save intermediate

* save intermediate

* correct flax bert model file

* new module / model naming

* make style

* almost finish BERT

* finish roberta

* make fix-copies

* delete keys file

* last refactor

* fixes in run_mlm_flax.py

* remove pooled from run_mlm_flax.py

* fix gelu | gelu_new

* remove Module from inits

* splits

* dirty print

* preventing warmup_steps == 0

* smaller splits

* make fix-copies

* dirty print

* dirty print

* initial_evaluation argument

* declaration order fix

* proper model initialization/loading

* proper initialization

* run_mlm_flax improvements: improper model inputs bugfix + automatic dataset splitting + tokenizers parallelism warning + avoiding warmup_steps=0 bug

* removed tokenizers warning hack, fixed model re-initialization

* reverted training_args.py changes

* fix flax from pretrained

* improve test in flax

* apply Sylvain's tips

* update init

* make 0.3.0 compatible

* revert tevens changes

* revert tevens changes 2

* finalize revert

* fix bug

* add docs

* add pretrained to init

* Update src/transformers/modeling_flax_utils.py

* fix copies

* final improvements

Co-authored-by: TevenLeScao <teven.lescao@gmail.com>
2020-12-16 13:03:32 +01:00
51adb97cd6 Fix fp16_backend field 2020-12-15 17:14:37 -05:00
1551e2dc6d [WIP] Tapas v4 (tres) (#9117)
* First commit: adding all files from tapas_v3

* Fix multiple bugs including soft dependency and new structure of the library

* Improve testing by adding torch_device to inputs and adding dependency on scatter

* Use Python 3 inheritance rather than Python 2

* First draft model cards of base sized models

* Remove model cards as they are already on the hub

* Fix multiple bugs with integration tests

* All model integration tests pass

* Remove print statement

* Add test for convert_logits_to_predictions method of TapasTokenizer

* Incorporate suggestions by Google authors

* Fix remaining tests

* Change position embeddings sizes to 512 instead of 1024

* Comment out positional embedding sizes

* Update PRETRAINED_VOCAB_FILES_MAP and PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

* Added more model names

* Fix truncation when no max length is specified

* Disable torchscript test

* Make style & make quality

* Quality

* Address CI needs

* Test the Masked LM model

* Fix the masked LM model

* Truncate when overflowing

* More much needed docs improvements

* Fix some URLs

* Some more docs improvements

* Test PyTorch scatter

* Set to slow + minify

* Calm flake8 down

* Add add_pooling_layer argument to TapasModel

Fix comments by @sgugger and @patrickvonplaten

* Fix issue in docs + fix style and quality

* Clean up conversion script and add task parameter to TapasConfig

* Revert the task parameter of TapasConfig

Some minor fixes

* Improve conversion script and add test for absolute position embeddings

* Improve conversion script and add test for absolute position embeddings

* Fix bug with reset_position_index_per_cell arg of the conversion cli

* Add notebooks to the examples directory and fix style and quality

* Apply suggestions from code review

* Move from `nielsr/` to `google/` namespace

* Apply Sylvain's comments

Co-authored-by: sgugger <sylvain.gugger@gmail.com>

Co-authored-by: Rogge Niels <niels.rogge@howest.be>
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: sgugger <sylvain.gugger@gmail.com>
2020-12-15 17:08:49 -05:00
ad895af98d Add possibility to switch between APEX and AMP in Trainer (#9137)
* Add possibility to switch between APEX and AMP in Trainer

* Update src/transformers/training_args.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Address review comments

* Update src/transformers/training_args.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
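
A hedged sketch of the new switch:

```
from transformers import TrainingArguments

args = TrainingArguments(output_dir="out", fp16=True, fp16_backend="amp")  # or "apex"
```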
2020-12-15 16:38:10 -05:00
0b2f46fa9e Add large model config (#9140) 2020-12-15 16:03:59 -05:00
2a7e8e1608 [Examples] Add automatic dataset splitting in language-modeling examples (#9133)
* replaced jnp.split + removing textual model inputs + ensuring warmup_steps > 0

* Add automatic dataset splitting in language-modeling examples
2020-12-15 16:02:43 -05:00
e771749777 Fix add order (#9129) 2020-12-15 15:16:56 -05:00
18ecd36f65 Fix Bart Shift (#9135)
* correct mistake in order

* fix tensor copy

* clone tensor correctly
2020-12-15 19:04:31 +01:00
d018622d8e correct mistake in order (#9134) 2020-12-15 23:08:31 +05:30
80bdb9c31a fix bart loss masking (#9131) 2020-12-15 18:17:17 +01:00
3caba8d35f Fix typo in trainer_tf.py (#9132) 2020-12-15 12:12:28 -05:00
abc573f51a [TF Bart] Refactor TFBart (#9029)
* reorder file

* delete unnecessary function

* make style

* save intermediate

* fix attention masks

* correct tf bart past key values

* solve merge conflict bug

* correct tensor dims

* save intermediate tf

* change attn layer

* fix typo re-order past

* inputs_embeds

* make fix copies

* finish tests

* fix graph mode

* apply Lysandre's suggestions
2020-12-15 17:31:28 +01:00
389aba34bf Added TF OpenAi GPT1 Sequence Classification (#9105)
* TF OpenAI GPT Sequence Classification

* Update src/transformers/models/openai/modeling_tf_openai.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-12-15 11:27:08 -05:00
ef2d4cd445 Fix tf2.4 (#9120)
* Fix tests for TF 2.4

* Remove <2.4 limitation

* Add version condition

* Update tests/test_optimization_tf.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/test_optimization_tf.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/test_optimization_tf.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-12-15 10:10:46 -05:00
6ccea0486f Fix T5 model parallel test (#9107)
2020-12-15 09:51:12 -05:00
59da3f2700 Fix stack overflow (#9114) 2020-12-15 09:15:49 -05:00
14c79c3e31 native amp leak fix landed in 1.7.1 (#9115)
update README with good news that the leak fix has been applied to pytorch-1.7.1.
2020-12-15 09:10:41 -05:00
ed1845ef4c Clarify use of TrainingArguments.disable_tqdm in Jupyter Notebooks (#9076)
* Clarify impact of disable_tqdm on Jupyter Notebooks

* Add weblink to argparse

* Replace "dev set" with more common "validation set" in do_eval

* Tweak prediction_loss_only

* Tweak description of Adam hyperparameters

* Add weblink to TensorBoard

* Capitalise apex

* Tweak local_rank description

* Add weblink for wandb

* Replace nlp with datasets

* Tweak grammar in model_parallel

* Capitalise apex

* Update TensorFlow training args to match PyTorch ones

* Fix style

* Fix underscore in weblink

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix underscore in weblink

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix underscore in weblink

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix underscore in weblink

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Add obj to datasets.Dataset

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-12-15 09:00:19 -05:00
44c340f45f fix a bug in eval_batch_retrieval (#9089) 2020-12-15 14:46:55 +01:00
c19d04623e [finetune_trainer] enhancements and fixes (#9042)
* trainer and finetune_trainer enhancements and fixes

* add fallback default

* move the fixing of incorrect keys back into finetune trainer

* s/eval/val/ to match the split

* trainer can now use a different prefix than eval_ for metrics

* document new arg

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* use 'eval' as the default for metric_key_prefix

* complete adjust var names + disambiguate

* fix logger

* add clarifying comment

* add clarifying comment

* style

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/trainer.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* complete removal of optional for metric_key_prefix

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-12-14 17:45:33 -08:00
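A short usage sketch of the `metric_key_prefix` argument introduced above; `trainer` and `val_dataset` are placeholders for an existing `Trainer` and validation split:

```python
# metrics now come back as val_loss, val_runtime, ... instead of eval_*
metrics = trainer.evaluate(eval_dataset=val_dataset, metric_key_prefix="val")
print(metrics["val_loss"])
```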
251eb70c97 Also pin TF CPU 2020-12-14 16:17:04 -05:00
e4ef57a9bb Pin TF to < 2.4 2020-12-14 16:06:30 -05:00
df3f4d2aef Fix T5 and BART for TF (#9063)
* Fix T5 for graph compilation+execution

* Fix BART

* Fix import

* Fix naming

* fix attribute name

* Oops

* fix import

* fix tests

* fix tests

* Update test

* Add missing import

* Address Patrick's comments

* Style

* Address Patrick's comment
2020-12-14 18:47:00 +01:00
a9c8bff724 Add parallelization support for T5EncoderModel (#9082)
* add model parallelism to T5EncoderModel

add model parallelism to T5EncoderModel

* remove decoder from T5EncoderModel parallelize

* update T5EncoderModel docs

* Extend T5ModelTest for T5EncoderModel

* fix T5Stack using range for get_device_map

* fix style

Co-authored-by: Ahmed Elnaggar <elnaggar@rostlab.informatik.tu-muenchen.de>
2020-12-14 12:00:45 -05:00
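A minimal sketch of what the new support enables, assuming the same `parallelize`/`deparallelize` API the other parallelizable models expose:

```python
from transformers import T5EncoderModel

model = T5EncoderModel.from_pretrained("t5-small")
model.parallelize()    # spread the encoder blocks over the visible GPUs
# ... run encoder forward passes ...
model.deparallelize()  # move everything back to the CPU
```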
b00eb4fb02 Testing Experimental CI Features (#9070) 2020-12-14 10:34:59 -05:00
74daf1f954 Fixed a broken link in documentation (#9101) 2020-12-14 09:12:27 -05:00
d6af344c9e correct var name in TrainingArguments docstring (#9096) 2020-12-14 09:02:54 -05:00
fa1ddced9e [RAG, Bart] Align RAG, Bart cache with T5 and other models of transformers (#9098)
* fix rag

* fix slow test

* fix past in bart
2020-12-14 12:32:26 +01:00
6587cf9f84 Patch *ForCausalLM model (#9092) 2020-12-14 00:39:55 -05:00
51d9c569fa Fix embeddings resizing in TF models (#8657)
* Resize the biases at the same time as the embeddings

* Trigger CI

* Biases are not reset anymore

* Remove get_output_embeddings + better LM model detection in generation utils

* Apply style

* First test on BERT

* Update docstring + new name

* Apply the new resizing logic to all the models

* fix tests

* Apply style

* Update the template

* Fix naming

* Fix naming

* Apply style

* Apply style

* Remove unused import

* Revert get_output_embeddings

* Trigger CI

* Update num parameters

* Restore get_output_embeddings in TFPretrainedModel and add comments

* Style

* Add decoder resizing

* Style

* Fix tests

* Separate bias and decoder resize

* Fix tests

* Fix tests

* Apply style

* Add bias resizing in MPNet

* Trigger CI

* Apply style
2020-12-13 23:05:24 -05:00
3552d0e0d8 [model_cards] Migrate cards from this repo to model repos on huggingface.co (#9013)
* rm all model cards

* Update the .rst

@sgugger it is still not super crystal clear/streamlined, so let me know if you have any ideas to make it simpler

* Add a rootlevel README.md with simple instructions/context

* Update docs/source/model_sharing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* make style

* rm all model cards

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-12-11 18:24:42 -05:00
29e4597950 Fix min_null_pred in the run_qa script (#9067) 2020-12-11 16:26:05 -05:00
9cc9f4122e Make ProphetNetModel really compatible with EncoderDecoder (#9033)
* improve

* finish

* upload model

* fix lm head

* fix test
2020-12-11 16:59:54 +01:00
24f6cdeab6 Bump notebook in /examples/research_projects/movement-pruning/lxmert (#9062)
Bumps [notebook](https://github.com/jupyter/jupyterhub) from 6.1.4 to 6.1.5.
- [Release notes](https://github.com/jupyter/jupyterhub/releases)
- [Changelog](https://github.com/jupyterhub/jupyterhub/blob/master/CHECKLIST-Release.md)
- [Commits](https://github.com/jupyter/jupyterhub/commits)

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2020-12-11 10:32:43 -05:00
91fa707217 Remove docs only check (#9065) 2020-12-11 10:27:31 -05:00
70527ba694 Fix PreTrainedTokenizer.pad when first inputs are empty (#9018)
* Fix PreTrainedTokenizer.pad when first inputs are empty

* Handle empty inputs case
2020-12-11 10:25:00 -05:00
783d7d2629 Reorganize examples (#9010)
* Reorganize example folder

* Continue reorganization

* Change requirements for tests

* Final cleanup

* Finish regroup with tests all passing

* Copyright

* Requirements and readme

* Make a full link for the documentation

* Address review comments

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Add symlink

* Reorg again

* Apply suggestions from code review

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Adapt title

* Update to new structure

* Remove test

* Update READMEs

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-12-11 10:07:02 -05:00
86896de064 update tatoeba workflow (#9051) 2020-12-11 20:29:15 +05:30
7c8f5f6487 Create README.md (#8096)
* Create README.md

* Fix model card

Co-authored-by: Julien Chaumond <julien@huggingface.co>
2020-12-11 09:45:12 -05:00
5527f78721 Create README.md (#8281)
* Create README.md

* Update model_cards/kiri-ai/distiluse-base-multilingual-cased-et/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-12-11 09:41:29 -05:00
c615df7422 Create README.md (#8751)
* Create README.md

* Update model_cards/Cinnamon/electra-small-japanese-generator/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-12-11 09:40:14 -05:00
76df559383 QARiB Arabic and dialects models (#8796)
* Add QARiB models

* fix README.md

* Fix README.md

* Fix README.md

* Fix README.md

* Fix QARiB files

* add models card for QARiB models 860k, 1790k, and 1970k

* try to fix PR

* re-add files

* links aren't allowed here :)

Co-authored-by: Ahmed Abdelali <aabdelali@hbku.edu.qa>
Co-authored-by: Julien Chaumond <julien@huggingface.co>
2020-12-11 09:38:38 -05:00
b161f1ae54 Update README.md (#8820) 2020-12-11 09:24:21 -05:00
649d389dab Initial README for t5-base-indonesian-summarization-cased model (#9028)
* Create README.md

Initial README for `t5-base-indonesian-summarization-cased` model

* Update README for t5-base-indonesian-summarization-cased

Typo in README, change from `small` to `base`
2020-12-11 09:18:16 -05:00
5e794b6628 Create README.md (#9030)
Initial README for `t5-small-indonesian-summarization-cased` model
2020-12-11 09:17:29 -05:00
935e346959 🎨 Change nn.dropout to layer.Dropout (#9047) 2020-12-11 10:40:25 +01:00
b01ddc9577 Remove value error (#8985)
* Remove value error

* Try a fix for parameter ordering

* Restore previous behavior

* Add documentation

* Review the comment
2020-12-10 17:17:19 -05:00
91ab02af28 Fix typo #9012 (#1) (#9038)
There is a tiny typo in the code "transformers/examples/language-modeling/run_mlm_wwm.py" at line 284. [Details.](https://github.com/huggingface/transformers/issues/9012)
2020-12-10 16:41:00 -05:00
8d4bb02056 Refactor FLAX tests (#9034) 2020-12-10 15:57:39 -05:00
1310e1a758 Enforce all objects in the main init are documented (#9014) 2020-12-10 11:57:12 -05:00
51e81e5895 MPNet copyright files (#9015) 2020-12-10 09:29:38 -05:00
35bffd70e2 Fix documentation of book in LayoutLM (#9017) 2020-12-10 09:28:49 -05:00
c95de29e31 ✏️ Fix typo (#9020) 2020-12-10 08:22:52 +01:00
5e637e6c69 [wip] [ci] doc-job-skip take #4 dry-run (#8980)
* ci-doc-job-skip-take-4

* wip

* wip

* wip

* wip

* skip yaml

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* ready to test

* yet another way

* trying with HEAD

* trying with head.sha

* trying with head.sha fix

* trying with head.sha fix wip

* undo

* try to switch to sha

* current branch

* current branch

* PR number check

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride

* joy ride
2020-12-09 15:36:36 -05:00
06971ac4f9 [Bart] Refactor - fix issues, consistency with the library, naming (#8900)
* remove make on the fly linear embedding

* start refactor

* big first refactor

* save intermediate

* save intermediate

* correct mask issue

* save tests

* refactor padding masks

* make all tests pass

* further refactor

* make pegasus test pass

* fix bool if

* fix leftover tests

* continue

* bart renaming

* delete torchscript test hack

* fix imports in tests

* correct shift

* fix docs and repo consistency

* re-add fix for FSMT

* typo in test

* fix typo

* fix another typo

* continue

* hot fix 2 for tf

* small fixes

* refactor types linting

* continue

* finish refactor

* fix import in tests

* better bart names

* further refactor and add test

* delete hack

* apply Sylvain's and Lysandre's comments

* small perf improv

* further perf improv

* improv perf

* fix typo

* make style

* small perf improv
2020-12-09 20:55:24 +01:00
75627148ee Flax Masked Language Modeling training example (#8728)
* Remove "Model" suffix from Flax models to look more 🤗

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Initial working (forward + backward) for Flax MLM training example.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Simplify code

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing comments, using module and moving to LM task.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Restore parameter name "module" that was wrongly renamed to "model".

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Restore correct output ordering...

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Actually commit the example 😅

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add FlaxBertModelForMaskedLM after rebasing.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make it possible to initialize the training from scratch

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Reuse flax linen example of cross entropy loss

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added specific data collator for flax

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove todo for data collator

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added evaluation step

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added ability to provide dtype to support bfloat16 on TPU

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Enable flax tensorboard output

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Enable jax.pmap support.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Ensure batches are correctly sized to be dispatched with jax.pmap

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Enable bfloat16 with --fp16 cmdline args

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correctly export metrics to tensorboard

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added dropout and ability to use it.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Effectively enable & disable during training and evaluation steps.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Oops.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Enable specifying kernel initializer scale

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added warmup step to the learning rate scheduler.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix typo.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Print training loss

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix linter issue (flake8)

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix model matching

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix dummies

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix non default dtype on Flax models

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use the same create_position_ids_from_input_ids for FlaxRoberta

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make Roberta attention as Bert

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix copy

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Wording.

Co-authored-by: Marc van Zee <marcvanzee@gmail.com>

Co-authored-by: Marc van Zee <marcvanzee@gmail.com>
2020-12-09 17:13:56 +01:00
df2af6d8b8 Add MP Net 2 (#9004) 2020-12-09 10:32:43 -05:00
8729109855 fixes #8968 (#9009) 2020-12-09 16:21:41 +01:00
e977ed2142 Add the code_search_net dataset tag to CodeBERTa model cards (#9005) 2020-12-09 15:43:19 +01:00
da37a21c89 push (#9008) 2020-12-09 15:14:33 +01:00
61abd50b98 Remove use of deprected method in Trainer HP search (#8996) 2020-12-09 09:13:41 -05:00
7e1d709e2a Fix link to stable version in the doc navbar (#9007) 2020-12-09 09:11:39 -05:00
02d0e0355c Diverse beam search 2 (#9006)
* diverse beam search

* bug fixes

* bug fixes

* bug fix

* separate out diverse_beam_search function

* separate out diverse_beam_search function

* bug fix

* improve code quality

* bug fix

* bug fix

* separate out diverse beam search scorer

* code format

* code format

* code format

* code format

* add test

* code format

* documentation changes

* code quality

* add slow integration tests

* more general name

* refactor into logits processor

* add test

* avoid too much copy paste

* refactor

* add to docs

* fix-copies

* bug fix

* Revert "bug fix"

This reverts commit c99eb5a8dc57a7b0d33a8ac06d8c6a32a7812ad4.

* improve comment

* implement sylvains feedback

Co-authored-by: Ayush Jain <a.jain@sprinklr.com>
Co-authored-by: ayushtiku5 <40797286+ayushtiku5@users.noreply.github.com>
2020-12-09 15:00:37 +01:00
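A hedged usage sketch of diverse beam search as a `generate` option; the checkpoint is only an example and the `num_beam_groups`/`diversity_penalty` names follow the PR description:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
input_ids = tokenizer("The quick brown fox jumps over the lazy dog.", return_tensors="pt").input_ids

outputs = model.generate(
    input_ids,
    num_beams=6,
    num_beam_groups=3,      # split the 6 beams into 3 diverse groups
    diversity_penalty=1.0,  # penalize tokens already chosen by other groups
    num_return_sequences=3,
    max_length=20,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```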
67ff1c314a Templates overhaul 1 (#8993) 2020-12-08 18:00:07 -05:00
447808c85f New squad example (#8992)
* Add new SQUAD example

* Same with a task-specific Trainer

* Address review comment.

* Small fixes

* Initial work for XLNet

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Final clean up and working XLNet script

* Test and debug

* Final working version

* Add new SQUAD example

* Same with a task-specific Trainer

* Address review comment.

* Small fixes

* Initial work for XLNet

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Final clean up and working XLNet script

* Test and debug

* Final working version

* Add tick

* Update README

* Address review comments

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-12-08 14:39:29 -05:00
7809eb82ae Removed unused encoder_hidden_states and encoder_attention_mask (#8972)
* Removed unused `encoder_hidden_states` and `encoder_attention_mask` from MobileBert

* Removed decoder tests for MobileBert

* Removed now unnecessary import
2020-12-08 12:04:34 -05:00
b7cdd00f15 Fix interaction of return_token_type_ids and add_special_tokens (#8854) 2020-12-08 12:04:01 -05:00
04c446f764 Make ModelOutput pickle-able (#8989) 2020-12-08 11:59:40 -05:00
0d9e6ca9ed [model_card] remove bogus testing changes 2020-12-08 09:58:45 -05:00
bf7f79cd57 Optional layers (#8961)
* Apply on BERT and ALBERT

* Update TF Bart

* Add input processing to TF BART

* Add input processing for TF CTRL

* Add input processing to TF Distilbert

* Add input processing to TF DPR

* Add input processing to TF Electra

* Add deprecated arguments

* Add input processing to TF XLM

* remove unused imports

* Add input processing to TF Funnel

* Add input processing to TF GPT2

* Add input processing to TF Longformer

* Add input processing to TF Lxmert

* Apply style

* Add input processing to TF Mobilebert

* Add input processing to TF GPT

* Add input processing to TF Roberta

* Add input processing to TF T5

* Add input processing to TF TransfoXL

* Apply style

* Rebase on master

* Fix wrong model name

* Fix BART

* Apply style

* Put the deprecated warnings in the input processing function

* Remove the unused imports

* Raise an error when len(kwargs)>0

* test ModelOutput instead of TFBaseModelOutput

* Address Patrick's comments

* Address Patrick's comments

* Add boolean processing for the inputs

* Take into account the optional layers

* Add missing/unexpected weights in the other models

* Apply style

* rename parameters

* Apply style

* Remove useless

* Remove useless

* Remove useless

* Update num parameters

* Fix tests

* Address Patrick's comment

* Remove useless attribute
2020-12-08 09:14:09 -05:00
9d7d0005b0 [training] SAVE_STATE_WARNING was removed in pytorch (#8979)
* [training] SAVE_STATE_WARNING was removed in pytorch

FYI `SAVE_STATE_WARNING` has been removed 3 days ago: pytorch/pytorch#46813

Fixes: #8232

@sgugger

* style, but add () to prevent autoformatters from botching it

* switch to try/except

* cleanup
2020-12-07 21:59:55 -08:00
2ae7388eee Check table as independent script (#8976) 2020-12-07 19:55:12 -05:00
00aa9dbca2 Copyright (#8970)
* Add copyright everywhere missing

* Style
2020-12-07 18:36:34 -05:00
c108d0b5a4 add max_length to showcase the use of truncation (#8975) 2020-12-07 18:35:39 -05:00
62d30e0583 Small fix to the run clm script (#8973) 2020-12-07 17:32:09 -05:00
28fa014a1f transformers-cli: LFS multipart uploads (> 5GB) (#8663)
* initial commit

* [cli] lfs commands

* Fix FileSlice

* Tweak to FileSlice

* [hf_api] Backport filetype arg from `datasets`

cc @lhoestq

* Slim down the CI while I'm working

* Ok let's try this in CI

* Update config.yml

* Do not try this at home

* one more try

* Update lfs.py

* Revert "Tweak to FileSlice"

This reverts commit d7e32c4b3500400486411e85a2b74e57fb6b52f5.

* Update test_hf_api.py

* Update test_hf_api.py

* Update test_hf_api.py

* CI still green?

* make CI green again?

* Update test_hf_api.py

* make CI red again?

* Update test_hf_api.py

* add CI style back

* Fix CI?

* oh my

* doc + switch back to real staging endpoint

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>

* Fix docblock + f-strings

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
2020-12-07 16:38:39 -05:00
0bce7c5508 Create README.md (#8964) 2020-12-07 16:04:14 -05:00
7ccd973ea1 Update README.txt (#8957) 2020-12-07 16:01:49 -05:00
37f4c24f10 > 30 files leads to hanging on --More--
cancel debug printing for now. As can be seen, it led to a failing test here:
https://app.circleci.com/pipelines/github/huggingface/transformers/16894/workflows/cc86f7a9-4020-45af-8ab3-c22f79b427cf/jobs/131924
2020-12-07 12:18:05 -08:00
7f9ccffc5b Use word_ids to get labels in run_ner (#8962)
* Use word_ids to get labels in run_ner

* Add sanity check
2020-12-07 14:26:36 -05:00
de6befd41f Remove sourcerer (#8965) 2020-12-07 11:15:29 -05:00
483e13273f Add TFGPT2ForSequenceClassification based on DialogRPT (#8714)
* Add TFGPT2ForSequenceClassification based on DialogRPT

* Add TFGPT2ForSequenceClassification based on DialogRPT

* TFGPT2ForSequenceClassification based on DialogRPT: refactored code, implemented review comments and added input processing

* Add TFGPT2ForSequenceClassification based on DialogRPT

* TFGPT2ForSequenceClassification based on DialogRPT: refactored code, implemented review comments and added input processing

* code refactor for latest other TF PR

* code refactor

* code refactor

* Update modeling_tf_gpt2.py
2020-12-07 16:58:37 +01:00
28c77ddf3b Fix QA pipeline on Windows (#8947) 2020-12-07 09:50:32 -05:00
72d6c9c68b Add model card (#8948)
* add model card

* lowercase identifier

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-12-06 11:16:32 -05:00
ef93a25427 Fix typo for modeling_bert import resulting in ImportError (#8931)
Self-explanatory ;) - Hope it helps!
2020-12-05 09:57:37 -05:00
8dfc8c7221 Don't pass in token_type_ids to BART for GLUE (#8929)
Without this fix, training a `BARTForSequenceClassification` model with `run_pl_glue.py` gives `TypeError: forward() got an unexpected keyword argument 'token_type_ids'`, because BART does not have token_type_ids. I've solved this issue in the same way as it's solved for the "distilbert" model, and I can train BART models on SNLI without errors now.
2020-12-05 09:52:16 -05:00
df311a5ccf [seq2seq] document the caveat of leaky native amp (#8930)
* document the caveat of leaky native amp

* Update examples/seq2seq/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-12-04 15:43:35 -08:00
73c51f7fcd [ci] skip doc jobs - circleCI is not reliable - disable skip for now (#8926)
* disable skipping, but leave logging for the future
2020-12-04 10:13:42 -08:00
71688a8889 Fix TF T5 only encoder model with booleans (#8925) 2020-12-04 12:28:47 -05:00
dcd3046f98 Better booleans handling in the TF models (#8777)
* Apply on BERT and ALBERT

* Update TF Bart

* Add input processing to TF BART

* Add input processing for TF CTRL

* Add input processing to TF Distilbert

* Add input processing to TF DPR

* Add input processing to TF Electra

* Add deprecated arguments

* Add input processing to TF XLM

* Add input processing to TF Funnel

* Add input processing to TF GPT2

* Add input processing to TF Longformer

* Add input processing to TF Lxmert

* Apply style

* Add input processing to TF Mobilebert

* Add input processing to TF GPT

* Add input processing to TF Roberta

* Add input processing to TF T5

* Add input processing to TF TransfoXL

* Apply style

* Rebase on master

* Bug fix

* Retry to bugfix

* Retry bug fix

* Fix wrong model name

* Try another fix

* Fix BART

* Fix input processing

* Apply style

* Put the deprecated warnings in the input processing function

* Remove the unused imports

* Raise an error when len(kwargs)>0

* test ModelOutput instead of TFBaseModelOutput

* Bug fix

* Address Patrick's comments

* Address Patrick's comments

* Address Sylvain's comments

* Add boolean processing for the inputs

* Apply style

* Missing optional

* Fix missing some input proc

* Update the template

* Fix missing inputs

* Missing input

* Fix args parameter

* Trigger CI

* Trigger CI

* Trigger CI

* Address Patrick's and Sylvain's comments

* Replace warn by warning

* Trigger CI

* Fix XLNET

* Fix detection
2020-12-04 09:08:29 -05:00
4c3d98dddc [s2s finetune_trainer] add instructions for distributed training (#8884) 2020-12-03 16:05:55 -08:00
aa60b230ec Patch model parallel test (#8920)
* Patch model parallel test

* Remove line

* Remove `ci_*` from scheduled branches
2020-12-03 17:15:47 -05:00
0c5615af66 Put Transformers on Conda (#8918)
* conda

* Guide

* correct tag

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/installation.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Sylvain's comments

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-12-03 14:28:49 -05:00
9ad6194318 Tweak wording + Add badge w/ number of models on the hub (#8914)
* Add badge w/ number of models on the hub

* try to appease @sgugger 😇

* not sure what this `c` was about [ci skip]

* Fix script and move stuff around

* Fix doc styling error

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2020-12-03 10:56:55 -05:00
6ed7e32f7c Fix move when the two cache folders exist (#8917) 2020-12-03 10:50:13 -05:00
8453201cfe Avoid erasing the attention mask when double padding (#8915) 2020-12-03 10:45:07 -05:00
0deece9c53 Don't warn that models aren't available if Flax is available. (#8841) 2020-12-03 10:33:12 -05:00
2b7fc9a0fd [model_cards] lm-head was deprecated
(and wasn't needed here anyways as it was added automatically)
2020-12-03 15:05:01 +01:00
443f67e887 [PyTorch] Refactor Resize Token Embeddings (#8880)
* fix resize tokens

* correct mobile_bert

* move embedding fix into modeling_utils.py

* refactor

* fix lm head resize

* refactor

* break lines to make sylvain happy

* add new tests

* fix typo

* improve test

* skip bart-like for now

* check if base_model = get(...) is necessary

* clean files

* improve test

* fix tests

* revert style templates

* Update templates/adding_a_new_model/cookiecutter-template-{{cookiecutter.modelname}}/modeling_{{cookiecutter.lowercase_modelname}}.py
2020-12-02 19:19:50 +01:00
e52f9c0ade Update README.md (#8906) 2020-12-02 09:28:44 -08:00
801b2cb36f Fix typo in docstring (#8905) 2020-12-02 12:08:31 -05:00
7e1cb00c37 [trainer] improve code readability (#8903)
* [trainer] improve code

This PR:
- removes redundant code, since
```
self.model = model if model is not None else None
```
and
```
self.model = model
```
are the same.

* separate attribute assignment from code logic - which simplifies things further.

* whitespace
2020-12-02 09:07:42 -08:00
a8c3f9aa76 Warning about too long input for fast tokenizers too (#8799)
* Warning about too long input for fast tokenizers too

If truncation is not set in tokenizers, but the tokenization is too long
for the model (`model_max_length`), we used to trigger a warning that
the input would probably fail (which it most likely will).

This PR re-enables the warning for fast tokenizers too and uses common
code for the trigger to make sure it's consistent across tokenizers.

* Checking for pair of inputs too.

* Making the function private and adding its doc.

* Remove formatting ?? in odd place.

* Missed uppercase.
2020-12-02 10:18:28 -05:00
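A small illustration of the behavior described above; the warning text in the comment is paraphrased:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # fast tokenizer by default
long_text = "word " * 1000

enc = tokenizer(long_text)                   # warns: sequence longer than model_max_length (512)
enc = tokenizer(long_text, truncation=True)  # explicit truncation keeps it quiet
```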
f6b44e6190 TransfoXL sequence classification (#8868)
* TransfoXL sequence classification

* TransfoXL sequence classification
2020-12-02 10:08:32 -05:00
24f0c2fe33 [ci] skip doc jobs take #3 (#8885)
* check that we get any match first

* docs only

* 2 docs only

* add code

* restore
2020-12-02 10:06:45 -05:00
693ac3594b disable job skip - need more work
reference: https://github.com/huggingface/transformers/pull/8853#issuecomment-736779863
2020-12-01 12:03:29 -08:00
379005c9d2 start using training_args.parallel_mode (#8882) 2020-12-01 11:40:36 -08:00
b08843cf4d Add a parallel_mode property to TrainingArguments (#8877)
* Add a `distributed_env` property to TrainingArguments

* Change name

* Address comment
2020-12-01 13:46:09 -05:00
7c10dd22ae Better support for resuming training (#8878) 2020-12-01 13:45:21 -05:00
21db560df3 [CI] skip docs-only jobs take #2 (#8853)
* restore skip

* Revert "Remove deprecated `evalutate_during_training` (#8852)"

This reverts commit 553029909620455e040a49032a9c45f6a5f0cd52.

* check that pipeline.git.base_revision is defined before proceeding

* Revert "Revert "Remove deprecated `evalutate_during_training` (#8852)""

This reverts commit dfec84db3fdce1079f01f1bc8dfaf21db2ccaba1.

* check that pipeline.git.base_revision is defined before proceeding

* doc only

* doc + code

* restore

* restore

* typo
2020-12-01 13:15:25 -05:00
a947386cee Better warning when loading a tokenizer with AutoTokenizer w/o SentencePiece (#8881) 2020-12-01 13:13:11 -05:00
9c18f15685 Prevent BatchEncoding from blindly passing casts down to the tensors it contains. Fixes #6582. (#8860)
Update src/transformers/tokenization_utils_base.py with review fix

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-12-01 13:01:52 -05:00
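A brief sketch of the guarded behavior the title describes; the comment on casts is an interpretation of the fix, not the exact patch:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(["hello world"], return_tensors="pt")

batch = batch.to("cpu")  # device moves are still passed through to the tensors
# after this fix, a non-device argument such as torch.float16 is no longer
# blindly forwarded as a cast to integer tensors like input_ids
```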
c0df963ee1 Make the big table creation/check platform independent (#8856) 2020-12-01 11:45:57 -05:00
d366228df1 2 typos in modeling_rag.py (#8676)
* 2 typos - from_question_encoder_generator_configs

fix 2 typos
from_encoder_generator_configs --> from_question_encoder_generator_configs

* apply make style
2020-12-01 16:16:48 +01:00
814b9550d7 Fix doc for language code (#8848) 2020-12-01 10:44:37 +01:00
4a9e502a36 Ctrl for sequence classification (#8812)
* add CTRLForSequenceClassification

* pass local test

* merge with master

* fix modeling test for sequence classification

* fix deco

* fix assert
2020-12-01 09:49:27 +01:00
7f34d75780 [s2s trainer] fix DP mode (#8823)
* fix DP case on multi-gpu

* make executable

* test all 3 modes

* use the correct check for distributed

* dp doesn't need a special case

* restore original name

* cleanup
2020-11-30 12:55:56 -08:00
d8fc26e919 NerPipeline (TokenClassification) now outputs offsets of words (#8781)
* NerPipeline (TokenClassification) now outputs offsets of words

- Currently the offsets are missing, which forces the user to pattern-match
the "word" against their input; that is not always feasible. For instance,
if a sentence contains the same word twice, there is no way to know which
occurrence is which.
- This PR proposes to fix that by outputting 2 new keys in this pipeline's
outputs, "start" and "end", which correspond to the string offsets of the
word. That means we should always have the invariant:

```python
input[entity["start"] : entity["end"]] == entity["word"]
```

* Fixing doc style
2020-11-30 14:05:08 -05:00
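A usage sketch of the new keys, assuming the `grouped_entities` flag of the NER pipeline at the time:

```python
from transformers import pipeline

ner = pipeline("ner", grouped_entities=True)
sentence = "My name is Clara and I live in Berkeley."
for entity in ner(sentence):
    # start/end disambiguate repeated words in the input
    print(entity["entity_group"], sentence[entity["start"] : entity["end"]])
```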
5fd3d81ec9 fix pypi complaint on version naming 2020-11-30 13:54:52 -05:00
51b071313b Attempt to fix Flax CI error(s) (#8829)
* Slightly increase tolerance between pytorch and flax output

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* test_multiple_sentences doesn't require torch

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Simplify parameterization on "jit" to use boolean rather than str

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use `require_torch` on `test_multiple_sentences` because we pull the weight from the hub.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Rename "jit" parameter to "use_jit" for (hopefully) making it self-documenting.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove pytest.mark.parametrize which seems to fail in some circumstances

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix unused imports.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Give default parameters values for traced model.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Review comment: Change sentences to sequences

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-30 13:43:17 -05:00
9995a341c9 Update docs 2020-11-30 12:07:52 -05:00
22b0ff757a Release: v4.0.0 2020-11-30 12:07:43 -05:00
5530299096 Remove deprecated evaluate_during_training (#8852)
* Remove deprecated `evaluate_during_training`

* Update src/transformers/training_args_tf.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-30 11:12:15 -05:00
773849415a Use model.from_pretrained for DataParallel also (#8795)
* Use model.from_pretrained for DataParallel also

When training on multiple GPUs, the code wraps a model with torch.nn.DataParallel. However if the model has custom from_pretrained logic, it does not get applied during load_best_model_at_end.

This commit uses the underlying model during load_best_model_at_end, and re-wraps the loaded model with DataParallel.

If you choose to reject this change, then could you please move this logic to a function, e.g. def load_best_model_checkpoint(best_model_checkpoint) or something, so that it can be overridden?

* Fix silly bug

* Address review comments

Thanks for the feedback. I made the change that you proposed, but I also think we should update L811 to check if `self.model` is an instance of `PreTrainedModel`, otherwise we would still not get into that `if` section, right?
2020-11-30 11:11:10 -05:00
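A hypothetical helper sketching the idea in the message above (the name `load_best_model_checkpoint` comes from the author's own suggestion; this is not the merged code):

```python
import torch

def load_best_model_checkpoint(model, checkpoint_dir):
    # reload through the underlying model so custom from_pretrained logic
    # applies, then re-wrap if training used DataParallel
    wrapped = isinstance(model, torch.nn.DataParallel)
    base = model.module if wrapped else model
    reloaded = type(base).from_pretrained(checkpoint_dir)
    return torch.nn.DataParallel(reloaded) if wrapped else reloaded
```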
4062c75e44 Merge remote-tracking branch 'origin/master' 2020-11-30 10:51:35 -05:00
08e707633c Comment the skip job on doc line 2020-11-30 10:51:25 -05:00
75f8100fc7 Add a direct link to the big table (#8850) 2020-11-30 10:29:23 -05:00
cc983cd9cd Correct docstring. (#8845)
Related issue: https://github.com/huggingface/transformers/issues/8837
2020-11-30 09:33:30 -05:00
19fa01ce2a token-classification: use is_world_process_zero instead of deprecated is_world_master() (#8828) 2020-11-30 09:21:56 -05:00
40ecaf0c2b Add T5 Encoder for Feature Extraction (#8717)
* Add T5 Encoder class for feature extraction

* fix T5 encoder add_start_docstrings indent

* update init with T5 encoder

* update init with TFT5ModelEncoder

* remove TFT5ModelEncoder

* change T5ModelEncoder order in init

* add T5ModelEncoder to transformers init

* clean T5ModelEncoder

* update init with TFT5ModelEncoder

* add TFModelEncoder for Tensorflow

* update init with TFT5ModelEncoder

* Update src/transformers/models/t5/modeling_t5.py

change output from Seq2SeqModelOutput to BaseModelOutput

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* remove encoder_outputs

1. remove encoder_outputs from the function call.
2. remove the encoder_outputs if statement.
3. remove isinstance from return_dict.

* Authorize missing decoder keys

* remove unnecessary input parameters

remove pask_key_values and use_cache

* remove use_cache

remove use_cache from the forward method

* add docstring for T5 encoder

add docstring for T5 encoder with T5_ENCODER_INPUTS_DOCSTRING

* change return_dict to dot access

* add T5_ENCODER_INPUTS_DOCSTRING for TF T5

* change TFT5Encoder output type to BaseModelOutput

* remove unnecessary parameters for TFT5Encoder

* remove unnecessary if statement

* add import BaseModelOutput

* fix BaseModelOutput typo to TFBaseModelOutput

* update T5 doc with T5ModelEncoder

* add T5ModelEncoder to tests

* finish pytorch

* finish docs and mt5

* add mt5 to init

* fix init

* remove n_positions

* finish PR

* Update src/transformers/models/mt5/modeling_mt5.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/models/t5/modeling_t5.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/models/t5/modeling_tf_t5.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/models/mt5/modeling_tf_mt5.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* make style

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-30 08:34:40 +01:00
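A minimal feature-extraction sketch with the new class:

```python
from transformers import T5EncoderModel, T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5EncoderModel.from_pretrained("t5-small")

inputs = tokenizer("Studies have shown that owning a dog is good for you", return_tensors="pt")
outputs = model(**inputs)
features = outputs.last_hidden_state  # (batch, seq_len, d_model) encoder states
```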
610cb106a2 Migration guide from v3.x to v4.x (#8763)
* Migration guide from v3.x to v4.x

* Better wording

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Sylvain's comments

* Better wording.

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-29 20:13:07 -05:00
c239dcda83 [CI] implement job skipping for doc-only PRs (#8826)
* implement job skipping for doc-only PRs

* silent grep is crucial

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* let's add doc

* let's add code

* revert test commits

* restore

* Better name

* Better name

* Better name

* some more testing

* some more testing

* some more testing

* finish testing
2020-11-29 11:31:30 -05:00
3a08cc1ce7 Minor docs typo fixes (#8797)
* Fix minor typos

* Additional typos

* Style fix

Co-authored-by: guyrosin <guyrosin@assist-561.cs.technion.ac.il>
2020-11-29 11:27:00 -05:00
5ced23dc84 [Pegasus] Refactor Tokenizer (#8731)
* refactor

* further refactor

* fix the rest tomorrow

* save intermediate

* finish slow tokenizer

* make more tests pass

* finish refactor

* fix comment

* clean further

* fix name

* fix naming

* Update src/transformers/models/reformer/tokenization_reformer.py

* Apply suggestions from code review

* Apply suggestions from code review

* refactor

* fix init tokenizers

* refactor

* improve convert

* refactor

* correct convert slow tokenizer

* final fix for Pegasus Tok

* remove ipdb

* improve links
2020-11-29 16:57:43 +01:00
36b60ce9e8 fix mt5 config (#8832) 2020-11-28 19:50:49 +01:00
18c32eeb21 Model parallel tests should return, not pass in non model parallel settings. (#8825) 2020-11-27 16:41:29 -05:00
edbff1fd00 Temporarily deactivate model generation 2020-11-27 16:15:00 -05:00
00ea45659f suggest a numerical limit of 50MB for determining @slow (#8824) 2020-11-27 16:04:54 -05:00
0a921b6459 BART & FSMT: fix decoder not returning hidden states from the last layer (#8597)
* Fix decoder not returning hidden states from the last layer

* Resolve conflict

* Change the way to gather hidden states

* Add decoder hidden states test

* Make pytest and black happy

* Remove redundant line

* remove new line

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
2020-11-27 18:35:34 +01:00
81fe0bf085 Add barthez model (#8393)
* Add init barthez

* Add barthez model, tokenizer and docs

BARThez is a pre-trained French seq2seq model that uses the BART objective.

* Apply suggestions from code review docs typos

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Add license

* Change URLs scheme

* Remove barthez model keep tokenizer

* Fix style

* Fix quality

* Update tokenizer

* Add fast tokenizer

* Add fast tokenizer test

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-27 12:31:42 -05:00
b0f2dbc594 Fix setup.py (#8798)
enforce unix newline encoding regardless of OS creating the file
2020-11-27 09:25:20 -08:00
03bddc375b Create README.md (#8729)
* Create README.md

* Fix model path
2020-11-27 18:19:15 +01:00
f9a2a9e32b Extend typing to path-like objects in PretrainedConfig and PreTrainedModel (#8770)
* update configuration_utils.py typing to allow pathlike objects when sensible

* update modeling_utils.py typing to allow pathlike objects when sensible

* black

* update tokenization_utils_base.py typing to allow pathlike objects when sensible

* update tokenization_utils_fast.py typing to allow pathlike objects when sensible

* update configuration_auto.py typing to allow pathlike objects when sensible

* update configuration_auto.py docstring to allow pathlike objects when sensible

* update tokenization_auto.py docstring to allow pathlike objects when sensible

* black
2020-11-27 10:52:58 -05:00
a7d46a0609 Fix dpr<>bart config for RAG (#8808)
* correct dpr test and bert pos fault

* fix dpr bert config problem

* fix layoutlm

* add config to dpr as well
2020-11-27 16:26:45 +01:00
a2cf37595e [Flax test] Add require pytorch to fix flax test (#8816)
* try flax fix

* same for roberta
2020-11-27 14:40:42 +01:00
e3ef62bce1 Update README.md (#8815)
The tokenizer called on the input_ids of example 2 is currently encoding text_1. I think this should be changed to text_2.
2020-11-27 08:34:57 -05:00
f8eda599bd [FlaxBert] Fix non-broadcastable attention mask for batched forward-passes (#8791)
* [FlaxBert] Fix non-broadcastable attention mask for batched forward-passes

* [FlaxRoberta] Fix non-broadcastable attention mask

* Use jax.numpy instead of ordinary numpy (otherwise not jit-able)

* Partially revert "Use jax.numpy ..."

* Add tests for batched forward passes

* Avoid unnecessary OOMs due to preallocation of GPU memory by XLA

* Auto-fix style

* Re-enable GPU memory preallocation but with mem fraction < 1/parallelism
2020-11-27 13:21:19 +01:00
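For intuition, a minimal sketch of the usual fix for a non-broadcastable mask; the shape convention is an assumption, not the exact patch:

```python
import jax.numpy as jnp

batch_size, seq_len = 2, 16
attention_mask = jnp.ones((batch_size, seq_len))
# add singleton head/query axes so the mask broadcasts against
# (batch, num_heads, seq_len, seq_len) attention scores
extended_mask = attention_mask[:, None, None, :]
```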
cb7602b38d typo (#8810) 2020-11-26 14:47:36 -08:00
ddf3c64654 potpourri of small fixes (#8807) 2020-11-26 14:06:27 -08:00
52708d2637 Fix PPLM (#8779)
* Fix pplm

* fix style

* make style

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-26 22:23:36 +01:00
8f07f5c44b Revert "finetune.py: specifying generation min_length (#8478)" (#8805)
This reverts commit 5aa361f3e56de0f65720f291bb3975bfc98f2837.
2020-11-26 20:12:01 +01:00
66e9608bae Create README.md (#8760) 2020-11-26 12:43:43 -05:00
5aa361f3e5 finetune.py: specifying generation min_length (#8478) 2020-11-26 12:33:02 +05:30
30e7f7e5da Create README.md (#8752) 2020-11-25 17:38:21 -05:00
2a6fbe6a40 [XLNet] Fix mems behavior (#8567)
* fix mems in xlnet

* fix use_mems

* fix use_mem_len

* fix use mems

* clean docs

* fix tf typo

* make xlnet tf for generation work

* fix tf test

* refactor use cache

* add use cache for missing models

* correct use_cache in generate

* correct use cache in tf generate

* fix tf

* correct getattr typo

* make sylvain happy

* change in docs as well

* do not apply to cookie cutter statements

* fix tf test

* make pytorch model fully backward compatible
2020-11-25 16:54:59 -05:00
369f1d77b4 Return correct Bart hidden state tensors (#8747)
* bart output hidden states upstream

* same w/ decoder

* add tests

* fix prophetnet

* fix gpt2 and ctrl

* fix fsmt and skip test for reformer and longformer

* fix all models

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-25 22:06:04 +01:00
138f45c184 Fix QA argument handler (#8765)
* Fix QA argument handler

* Attempt to get a better fix for QA (#8768)

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2020-11-25 14:02:15 -05:00
4821ea5aeb Big model table (#8774)
* First draft

* Styling

* With all changes staged

* Update docs/source/index.rst

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Styling

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-25 12:02:15 -05:00
90d5ab3bfe Create README.md (#8761) 2020-11-24 17:51:24 -05:00
29d4992453 New TF model inputs (#8602)
* Apply on BERT and ALBERT

* Update TF Bart

* Add input processing to TF BART

* Add input processing for TF CTRL

* Add input processing to TF Distilbert

* Add input processing to TF DPR

* Add input processing to TF Electra

* Add input processing for TF Flaubert

* Add deprecated arguments

* Add input processing to TF XLM

* remove unused imports

* Add input processing to TF Funnel

* Add input processing to TF GPT2

* Add input processing to TF Longformer

* Add input processing to TF Lxmert

* Apply style

* Add input processing to TF Mobilebert

* Add input processing to TF GPT

* Add input processing to TF Roberta

* Add input processing to TF T5

* Add input processing to TF TransfoXL

* Apply style

* Rebase on master

* Bug fix

* Retry to bugfix

* Retry bug fix

* Fix wrong model name

* Try another fix

* Fix BART

* Fix input processing

* Apply style

* Put the deprecated warnings in the input processing function

* Remove the unused imports

* Raise an error when len(kwargs)>0

* test ModelOutput instead of TFBaseModelOutput

* Bug fix

* Address Patrick's comments

* Address Patrick's comments

* Address Sylvain's comments

* Add the new inputs in new Longformer models

* Update the template with the new input processing

* Remove useless assert

* Apply style

* Trigger CI
2020-11-24 13:55:00 -05:00
82d443a7fd [core] implement support for run-time dependency version checking (#8645)
* implement support for run-time dependency version checking

* try not escaping !

* use findall that works on py36

* small tweaks

* autoformatter worship

* simplify

* shorter names

* add support for non-versioned checks

* add deps

* revert

* tokenizers not required, check version only if installed

* make a proper distutils cmd and add make target

* tqdm must be checked before tokenizers

* workaround the DistributionNotFound peculiar setup

* handle the rest of packages in setup.py

* fully sync setup.py's install_requires - to check them all

* nit

* make install_requires more readable

* typo

* Update setup.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* restyle

* add types

* simplify

* simplify2

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-24 13:22:25 -05:00
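A usage sketch, assuming the checker landed as `transformers.utils.versions.require_version`:

```python
from transformers.utils.versions import require_version

require_version("tokenizers>=0.9.4")  # raises at run time if the pin is not met
require_version("datasets")           # non-versioned check: only verifies presence
```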
a7d73cfdd4 fix rag index names in eval_rag.py example (#8730) 2020-11-24 17:04:47 +01:00
8d4ed7e953 added instructions for syncing upstream master with forked master via PR (#8745)
* added instructions for syncing upstream master with forked master via PR

* expand to add a note on why this is requested

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
2020-11-24 10:11:46 -05:00
e09e54fd9d MT5 should have an autotokenizer (#8743)
* MT5 should have an autotokenizer

* Different configurations should be able to point to same tokenizers
2020-11-24 09:50:25 -05:00
6fdd0bb231 Fix slow tests v2 (#8746)
* Fix BART test

* Fix MBART tests

* Remove erroneous line from yaml

* Update tests/test_modeling_bart.py

* Quality
2020-11-24 09:35:12 -05:00
2c83b3c38d Support various BERT relative position embeddings (2nd) (#8276)
* Support BERT relative position embeddings

* Fix typo in README.md

* Address review comment

* Fix failing tests

* [tiny] Fix style_doc.py check by adding an empty line to configuration_bert.py

* make fix copies

* fix configs of electra and albert and fix longformer

* remove copy statement from longformer

* fix albert

* fix electra

* Add bert variants forward tests for various position embeddings

* [tiny] Fix style for test_modeling_bert.py

* improve docstring

* [tiny] improve docstring and remove unnecessary dependency

* [tiny] Remove unused import

* re-add to ALBERT

* make embeddings work for ALBERT

* add test for albert

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-24 14:40:53 +01:00
9e71aa2f8f [EsperBERTo] Fix URLs to assets 2020-11-24 14:15:30 +01:00
02f48b9bfc Model parallel documentation (#8741)
* Add parallelize methods to the .rst files

* Correct format
2020-11-23 20:14:48 -05:00
7f2c00913a TF BERT test update 2020-11-23 18:20:19 -05:00
e1b7e10d5f Update TF BERT test 2020-11-23 18:19:12 -05:00
8ffc01a76a Add early stopping callback to pytorch trainer (#8581)
* Add early stopping patience and minimum threshold metric must improve to prevent early stopping to pytorch trainer

* Add early stopping test

* Set patience counter to 0 if best metric not defined yet

* Make early stopping a callback. Add callback event for updating the best metric for early stopping callback to trigger on.

* Run make style

* make function name sensible

* Improve new argument docstring wording and hope that the flaky CI test passes.

* Use on_evaluation callback instead of custom. Remove some debug printing

* Move early stopping arguments and state into early stopping callback

* Run make style

* Remove old code

* Fix docs formatting. make style went rogue on me.

* Remove copied attributes and fix variable

* Add assertions on training arguments instead of mutating them. Move comment out of public docs.

* Make separate test for early stopping callback. Add test of invalid arguments.

* Run make style... I remembered before CI this time!

* appease flake8

* Add EarlyStoppingCallback to callback docs

* Make EarlyStoppingCallback docstring match other callbacks.

* Fix typo in docs
2020-11-23 17:25:35 -05:00
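A minimal sketch of wiring the new callback into a `Trainer`; `model`, `train_ds` and `eval_ds` are placeholders:

```python
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="epoch",       # early stopping needs periodic evaluation
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
```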
367f497dec Fix max length in run_plm script (#8738) 2020-11-23 16:02:31 -05:00
e84786aaa6 consistent ignore keys + make private (#8737)
* consistent ignore keys + make private

* style

* - authorized_missing_keys    => _keys_to_ignore_on_load_missing
  - authorized_unexpected_keys => _keys_to_ignore_on_load_unexpected

* move public doc of private attributes to private comment
2020-11-23 12:33:13 -08:00
49759c0cda Document new training argument 2020-11-23 15:02:59 -05:00
1cd9be2aeb gpt2 and t5 parallel modeling (#8696)
* gpt2 and t5 parallel modeling

* model_parallel utils update

* adding missing model_parallel_utils

Adds missing model_parallel_utils and reverses the changes to code in modeling_gpt2 and modeling_t5

* training_args reformat

Reformatted training_args

* style formatting

Style formatting doc string length on training_args and model_parallel_utils

* style changes

make style && make quality for training_args and model_parallel_utils.

* adding tests

* minor change in trainer

reverts loss calculation

* Update training_args.py

* Update training_args.py

added back docstring language for adam_beta1 and adam_beta2

* Update trainer.py

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix style & rebase

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-11-23 14:41:23 -05:00
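A hedged sketch of the resulting API on GPT-2; the two-GPU `device_map` split is hypothetical:

```python
from transformers import GPT2LMHeadModel

model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
# hypothetical split of gpt2-xl's 48 transformer blocks over two GPUs
device_map = {0: list(range(0, 24)), 1: list(range(24, 48))}
model.parallelize(device_map)
# ... generate or train ...
model.deparallelize()
```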
1e45bef0a7 [trainer] make generate work with multigpu (#8716)
* make generate work with multigpu

* better fix - thanks @sgugger
2020-11-23 10:57:27 -08:00
900024273b Change default cache path (#8734)
* Change default cache path

* Document changes

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-23 13:56:45 -05:00
0cc5ab1333 Improve bert-japanese tokenizer handling (#8659)
* Make ci fail

* Try to make tests actually run?

* CI finally failing?

* Fix CI

* Revert "Fix CI"

This reverts commit ca7923be7334d4e571b023478ebdd6b33dfd0ebb.

* Ooops wrong one

* one more try

* Ok ok let's move this elsewhere

* Alternative to globals() (#8667)

* Alternative to globals()

* Error is raised later so return None

* Sentencepiece not installed make some tokenizers None

* Apply Lysandre wisdom

* Slightly clearer comment?

cc @sgugger

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-23 11:15:02 -05:00
eec76615f6 [model_cards]: control input examples of Geotrend models (#8727)
* [model_cards]: control Arabic model examples

* [model_cards]: control input examples of Geotrend models

* [model_cards]: add link to generation script
2020-11-23 11:09:50 -05:00
143b564e59 Add pip install update to resolve import error in transformers notebook (#8616)
* Add pip install update to resolve import error

Add `pip install --upgrade tensorflow-gpu` to remove the error below:
```
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-2-094fadb93f3f> in <module>()
      1 import torch
----> 2 from transformers import AutoModel, AutoTokenizer, BertTokenizer
      3 
      4 torch.set_grad_enabled(False)

4 frames
/usr/local/lib/python3.6/dist-packages/transformers/__init__.py in <module>()
    133 
    134 # Pipelines
--> 135 from .pipelines import (
    136     Conversation,
    137     ConversationalPipeline,

/usr/local/lib/python3.6/dist-packages/transformers/pipelines.py in <module>()
     46     import tensorflow as tf
     47 
---> 48     from .modeling_tf_auto import (
     49         TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
     50         TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_auto.py in <module>()
     49 from .configuration_utils import PretrainedConfig
     50 from .file_utils import add_start_docstrings
---> 51 from .modeling_tf_albert import (
     52     TFAlbertForMaskedLM,
     53     TFAlbertForMultipleChoice,

/usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_albert.py in <module>()
     22 import tensorflow as tf
     23 
---> 24 from .activations_tf import get_tf_activation
     25 from .configuration_albert import AlbertConfig
     26 from .file_utils import (

/usr/local/lib/python3.6/dist-packages/transformers/activations_tf.py in <module>()
     52     "gelu": tf.keras.layers.Activation(gelu),
     53     "relu": tf.keras.activations.relu,
---> 54     "swish": tf.keras.activations.swish,
     55     "silu": tf.keras.activations.swish,
     56     "gelu_new": tf.keras.layers.Activation(gelu_new),

AttributeError: module 'tensorflow_core.python.keras.api._v2.keras.activations' has no attribute 'swish'
```
I have tried running the colab after this change and it seems to work fine (all the cells run with no errors).

* Update notebooks/02-transformers.ipynb

only need to upgrade tensorflow, not tensorflow-gpu.

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-23 09:58:52 -05:00
18c8cf000b Fix bug in x-attentions output for roberta and harden test to catch it (#8660) 2020-11-23 13:28:29 +01:00
48cc224703 [model_cards] Add card for gpt2-rnm (#8673) 2020-11-23 05:52:29 -05:00
52585e40af create README.md (#8682)
* create README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-23 05:51:54 -05:00
b5187e317f added bangla-bert-sentiment model card (#8687) 2020-11-23 05:51:16 -05:00
b6d864e2f0 Create README.md (#8630)
* Create README.md

* correct metrics id

cc @lhoestq

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-23 04:48:10 -05:00
e1f3156b21 Fix many typos (#8708) 2020-11-21 22:58:10 -05:00
9c0afdaf7b fix flaky ci (#8694) 2020-11-20 22:07:21 +01:00
29bdb88368 Vectorize RepetitionPenaltyLogitsProcessor to improve performance (#8598)
* refactored existing nested loops into a vectorized implementation

* replaced explicit indexing with torch.where

* modifying score for previous input_ids only
2020-11-20 19:59:06 +01:00
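A minimal sketch of the vectorized idea the bullets describe, using `torch.gather`/`torch.where`/`scatter` instead of nested loops:

```python
import torch

def apply_repetition_penalty(scores, input_ids, penalty):
    # gather the scores of tokens that already appeared, rescale them in
    # one shot, and scatter them back; no Python loops needed
    score = torch.gather(scores, 1, input_ids)
    score = torch.where(score < 0, score * penalty, score / penalty)
    return scores.scatter(1, input_ids, score)
```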
2594bd8b73 moved temperature wrapper before topP/topK (#8686) 2020-11-20 19:33:54 +01:00
8062fa63c5 Fix rag finetuning + add finetuning test (#8585)
* replace init_ddp_connection for index init

* style

* add finetune test

* add test data

* move generate tensors to device

* add test on EM metric

* style

* allow multi process test

* keep gloo process group for retrieval

* add multi-gpu test

* use custom accelerator

* clean test finetune

* minor

* style

* style

* typo

* use python call instead of imported main function

* return_dict fix in modeling_rag

* use float32 in retrieval

* store as float32 as well in the custom knowledge dataset example

* style

* rename to finetune_rag

* style

* update readme

* rename utils and callbacks to utils_rag and callbacks_rag

* fix test

* patrick's comments

* generate dummy data in the finetue test script

* remove dummy data files

* style
2020-11-20 19:05:03 +01:00
63e91f5fde Document adam betas TrainingArguments (#8688) 2020-11-20 09:27:25 -05:00
94caaa93c2 Update the bibtex with EMNLP demo (#8678)
* Update the bibtex with EMNLP demo

* Update README.md

* Update README.md
2020-11-20 13:26:33 +08:00
6494910f27 Add sentencepiece to the CI and fix tests (#8672)
* Fix the CI and tests

* Fix quality

* Remove that m from nowhere
2020-11-19 16:44:20 -05:00
0ad45e108d [examples/seq2seq] fix PL deprecation warning (#8577)
* fix deprecation warning

* fix
2020-11-19 21:46:04 +01:00
0e19a4c2d6 Update bert-base-multilingual-cased-README.md (#8668)
The heading was originally uncased, which did not reflect the contents of this README. Changed it to cased.
2020-11-19 15:45:06 -05:00
06518404cb revert 2020-11-19 12:12:46 -08:00
297a29382f Please fix your software not to ping master
You may be unaware but you're running some software that meddles with every commit on https://github.com/huggingface/transformers/

Something is wrong with the software you're using. It adds a reference to almost every PR in the master tree, which is very wrong. Please check your software and please don't do it again.

Example:
see the bottom of this PR and most other PRs:
https://github.com/huggingface/transformers/pull/8639
2020-11-19 12:11:35 -08:00
42111f1d56 [tokenizers] convert_to_tensors: don't reconvert when the type is already right (#8283)
* don't reconvert when the type is already right

* better name

* adjust logic as suggested

* merge
2020-11-19 12:06:01 -08:00
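The idea behind the commit above, as a hypothetical helper (names are illustrative, not the library's actual code):

```python
import torch

def to_tensor(value):
    # Skip the conversion entirely when the value is already a tensor;
    # re-wrapping costs a copy and can drop device placement.
    if isinstance(value, torch.Tensor):
        return value
    return torch.tensor(value)
```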
20b658607e Fix run_ner script (#8664)
* Fix run_ner script

* Pin datasets
2020-11-19 13:59:30 -05:00
ca0109bd68 disable_ngram_loss fix for prophetnet (#8554)
* `disable_ngram_loss` fix for prophetnet

* add changes documentation

* fix _compute_loss to use mean reduction and -100 to masked tokens & remove unnecessary arguments

* mean label smoothing loss

* small refactor

* fix test

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-11-19 19:18:07 +01:00
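The `-100` convention the fix above relies on, in a self-contained sketch (shapes are arbitrary): with `ignore_index=-100` and mean reduction, masked positions contribute nothing to the loss.

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10, 32)            # (batch, seq_len, vocab)
labels = torch.randint(0, 32, (4, 10))
labels[:, 5:] = -100                        # mask out e.g. padding positions
# cross_entropy expects (batch, vocab, seq_len), hence the transpose.
loss = F.cross_entropy(logits.transpose(1, 2), labels, ignore_index=-100)
```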
0603564e93 Merge remote-tracking branch 'origin/master' 2020-11-19 12:18:57 -05:00
1e08af383a Forgot to save... 2020-11-19 12:18:50 -05:00
d86b5ffc6f Release: v4.0.0-rc-1 2020-11-19 12:00:07 -05:00
cb3e5c33f7 Fix a few last paths for the new repo org (#8666) 2020-11-19 11:56:42 -05:00
a79a96ddaa fix small typo (#8644)
Fixed a small typo on the XLNet and permutation language modelling section
2020-11-19 11:24:11 -05:00
4208f496ee Better filtering of the model outputs in Trainer (#8633)
* Better filtering of the model outputs in Trainer

* Fix examples tests

* Add test for Lysandre
2020-11-19 10:43:15 -05:00
f2e07e7272 Fix a bunch of slow tests (#8634)
* CI should install `sentencepiece`

* Requiring TF

* Fixing some TFDPR bugs

* remove return_dict=False/True hack

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-11-19 10:41:41 -05:00
5362bb8a6b Tf longformer for sequence classification (#8231)
* working on LongformerForSequenceClassification

* add TFLongformerForMultipleChoice

* add TFLongformerForTokenClassification

* use add_start_docstrings_to_model_forward

* test TFLongformerForSequenceClassification

* test TFLongformerForMultipleChoice

* test TFLongformerForTokenClassification

* remove test from repo

* add test and doc for TFLongformerForSequenceClassification, TFLongformerForTokenClassification, TFLongformerForMultipleChoice

* add requested classes to modeling_tf_auto.py
update dummy_tf_objects
fix tests
fix bugs in requested classes

* pass all tests except test_inputs_embeds

* sync with master

* pass all tests except test_inputs_embeds

* pass all tests

* pass all tests

* work on test_inputs_embeds

* fix style and quality

* make multi choice work

* fix TFLongformerForTokenClassification signature

* fix TFLongformerForMultipleChoice, TFLongformerForSequenceClassification signature

* fix mult choice

* fix mc hint

* fix input embeds

* fix input embeds

* refactor input embeds

* fix copy issue

* apply sylvains changes and clean more

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-19 10:37:27 -05:00
62cd9ce9f8 fix missing return dict (#8653) 2020-11-19 15:17:18 +01:00
0c2677f529 [model card] : fix bert-base-15lang-cased (#8655)
the table was badly formatted because of a single line break
2020-11-19 05:41:02 -05:00
0a80959bdd Add cards for all Geotrend models (#8617)
* docs(bert-base-15lang-cased): add model card

* add cards for all Geotrend models

* [model cards] fix language tag for all Geotrend models
2020-11-19 04:47:24 -05:00
dcc9c64299 Updated the Extractive Question Answering code snippets (#8636)
* Updated the Extractive Question Answering code snippets

The Extractive Question Answering code snippets do not work anymore since the models return task-specific output objects. This commit fixes the pytorch and tensorflow examples by adding `.values()` to the model call.

* Update task_summary.rst
2020-11-18 18:56:47 -05:00
28d16e7ac5 Update README.md (#8635) 2020-11-18 18:35:23 -05:00
b290195ac7 grammar (#8639) 2020-11-18 18:04:25 -05:00
d86d57faa3 [s2s] distillation apex breaks return_dict obj (#8631)
* apex breaks return_dict obj

* style
2020-11-18 12:51:29 -08:00
bf3611b2ab Created ModelCard for Hel-ach-en MT model (#8496)
* Updated ModelCard

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-18 14:42:13 -05:00
c95b26a719 Create README.md (#8362) 2020-11-18 13:37:14 -05:00
fdbbb6c17a Model card: T5-base fine-tuned on QuaRTz (#8369)
* Model card: T5-base fine-tuned on QuaRTz

* Update model_cards/mrm8488/t5-base-finetuned-quartz/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-18 13:34:27 -05:00
6e6d24c5d8 Create README.md (#8363) 2020-11-18 13:33:04 -05:00
35fd3d64e3 Add model card for ai4bharat/indic-bert (#8464) 2020-11-18 13:28:49 -05:00
38f01dfe03 Update README.md (#8405)
* Update README.md

* Update README.md
2020-11-18 13:23:08 -05:00
2d8fbf012a Model Card for abhilash1910/financial_roberta (#8625)
* Model Card for abhilash1910/financial_roberta

* Update model_cards/abhilash1910/financial_roberta/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-18 13:22:28 -05:00
26dc6593f3 Update README.md (#8544)
Modified the Model in Action section. The class `AutoModelWithLMHead` is deprecated, so it was changed to `AutoModelForSeq2SeqLM` for encoder-decoder models. Removed a duplicate eos token.
2020-11-18 13:19:32 -05:00
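A minimal example of the replacement the card now uses (the checkpoint name is chosen purely for illustration):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# AutoModelWithLMHead is deprecated; encoder-decoder checkpoints load via
# AutoModelForSeq2SeqLM instead.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")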
6c8fad4f0d replace performance table with markdown (#8565)
* replace performance table with markdown

* Update model_cards/smanjil/German-MedBERT/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-18 13:17:46 -05:00
e7f77fc52a model_cards for Chinese Couplet and Poem GPT2 models (#8620) 2020-11-18 13:06:30 -05:00
a0c62d2493 Fix training from scratch in new scripts (#8623) 2020-11-18 12:15:26 -05:00
1e62e999e8 Fixes the training resuming with gradient accumulation (#8624) 2020-11-18 12:00:11 -05:00
cdfa56afe0 [Tokenizer Doc] Improve tokenizer summary (#8622)
* improve summary

* small fixes

* cleaned line length

* correct "" formatting

* apply sylvains suggestions
2020-11-18 17:14:15 +01:00
2f9d49b389 Adding PrefixConstrainedLogitsProcessor (#8529)
* Adding PrefixConstrainedLogitsProcessor

* fixing RAG and style_doc

* fixing black (v20 instead of v19)

* Improving doc in generation_logits_process.py

* Improving docs and typing in generation_utils.py

* docs improvement

* adding test and fixing doc typo

* fixing doc_len

* isort on test

* fixed test

* improve docstring a bit

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-18 17:06:25 +01:00
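A sketch of how the new processor is driven through `generate()` via its `prefix_allowed_tokens_fn` argument (the allowed-token list here is an arbitrary example):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# The callback receives the batch id and the tokens generated so far, and
# returns the token ids allowed at the next step.
allowed_ids = tokenizer.encode(" the a an and of to")

def prefix_allowed_tokens_fn(batch_id, input_ids):
    return allowed_ids

inputs = tokenizer("Hello", return_tensors="pt")
output = model.generate(
    **inputs, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn, max_length=12
)
```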
3bc1540070 New TF loading weights (#8490)
* New TF loading weights

* apply style

* Better naming

* Largely comment the loading method

* Apply style

* Address Patrick's comments

* Remove useless line of code

* Update Docstring

* Address Sylvain's and Lysandre's comments

* Simplify the names computation

* Typos
2020-11-18 10:48:31 -05:00
0df91ee4f7 self.self.activation_dropout -> self.activation_dropout (#8611)
(one line typo)
2020-11-18 10:30:29 -05:00
cdf1b7ae82 fix to adjust for #8530 changes (#8612) 2020-11-18 10:25:00 -05:00
2819da02f7 [s2s] broken test (#8613) 2020-11-18 10:15:53 -05:00
9fa3ed1a7f Fix missing space in multiline warning (#8593)
Multiline string informing about missing PyTorch/TensorFlow had missing space.
2020-11-18 10:09:26 -05:00
8fcb6935a1 Fix DataCollatorForLanguageModeling (#8621) 2020-11-18 10:02:50 -05:00
f6fe41c96b Reset loss to zero on logging in Trainer to avoid bfloat16 issues (#8561)
* make tr_loss regular float

* Revert "make tr_loss regular float"

This reverts commit c9d7ccfaf0c4387187b0841694f01ec0ffd5f4ba.

* reset loss at each logging step

* keep track of total loss with _total_loss_scalar

* add remaining tr_loss at the end
2020-11-18 09:58:08 -05:00
b592728eff Fixed link to the wrong paper. (#8607) 2020-11-17 19:00:44 -05:00
0512444ee5 Remove old doc 2020-11-17 17:34:25 -05:00
5cf9c79665 Add Harry Potter Model Card (#8605)
* Add Harry Potter Model

* Update model_cards/ceostroff/harry-potter-gpt2-fanfiction/README.md

* Update model_cards/ceostroff/harry-potter-gpt2-fanfiction/README.md

* Update model_cards/ceostroff/harry-potter-gpt2-fanfiction/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-17 16:50:58 -05:00
dd52804f5f Remove deprecated (#8604)
* Remove old deprecated arguments

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>

* Remove needless imports

* Fix tests

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-11-17 15:11:29 -05:00
3095ee9dab Tokenizers should be framework agnostic (#8599)
* Tokenizers should be framework agnostic

* Run the slow tests

* Not testing

* Fix documentation

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-17 14:03:03 -05:00
7f3b41a306 Fix check repo utils (#8600) 2020-11-17 14:01:46 -05:00
f0435f5a61 these should run fine on multi-gpu (#8582) 2020-11-17 14:00:41 -05:00
36a19915ea Fix model templates (#8595)
* First fixes

* Fix imports and add init

* Fix typo

* Move init to final dest

* Fix tokenization import

* More fixes

* Styling
2020-11-17 10:35:38 -05:00
042a6aa777 Tokenizers: ability to load from model subfolder (#8586)
* <small>tiny typo</small>

* Tokenizers: ability to load from model subfolder

* use subfolder for local files as well

* Uniformize model shortcut name => model id

* from s3 => from huggingface.co

Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
2020-11-17 08:58:45 -05:00
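A minimal sketch of the new loading path (the repo name and subfolder are hypothetical):

```python
from transformers import AutoTokenizer

# The tokenizer files live in a subdirectory of the model repository
# rather than at its root.
tokenizer = AutoTokenizer.from_pretrained("some-org/some-model", subfolder="tokenizer")
```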
48395d6b8e Fix init for MT5 (#8591) 2020-11-17 08:52:13 -05:00
a6cf9ca00b Add __init__ to the models folder 2020-11-17 07:39:37 -05:00
5104223552 [MT5] More docs (#8589)
* add docs

* make style
2020-11-17 12:47:57 +01:00
86822a358b T5 & mT5 (#8552)
* add mt5 and t5v1_1 model

* fix tests

* correct some imports

* add tf model

* finish tf t5

* improve examples

* fix copies

* clean doc
2020-11-17 12:23:09 +01:00
9e01f988dd model_card for indolem/indobert-base-uncased (#8579) 2020-11-17 03:36:50 -05:00
c89bdfbe72 Reorganize repo (#8580)
* Put models in subfolders

* Styling

* Fix imports in tests

* More fixes in test imports

* Sneaky hidden imports

* Fix imports in doc files

* More sneaky imports

* Finish fixing tests

* Fix examples

* Fix path for copies

* More fixes for examples

* Fix dummy files

* More fixes for example

* More model import fixes

* Is this why you're unhappy GitHub?

* Fix imports in convert command
2020-11-16 21:43:42 -05:00
901507335f Fix mixed precision issue for GPT2 (#8572)
* Fix mixed precision issue for GPT2

* Forgot one cast

* oops

* Forgotten casts
2020-11-16 14:44:19 -05:00
1073a2bde5 Switch return_dict to True by default. (#8530)
* Use the CI to identify failing tests

* Remove from all examples and tests

* More default switch

* Fixes

* More test fixes

* More fixes

* Last fixes hopefully

* Use the CI to identify failing tests

* Remove from all examples and tests

* More default switch

* Fixes

* More test fixes

* More fixes

* Last fixes hopefully

* Run on the real suite

* Fix slow tests
2020-11-16 11:43:00 -05:00
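What the new default means in practice (model name chosen for illustration): outputs are typed objects whose fields are accessed by name instead of by tuple position.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# With return_dict=True (now the default), use named attributes; indexing
# like outputs[0] still works for backward compatibility.
print(outputs.logits.shape)
```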
0d0a0785fd Update version to v4.0.0-dev (#8568) 2020-11-16 10:21:19 -05:00
afb50c663a Fix GPT2DoubleHeadsModel to work with model.generate() (#6601)
* Fix passing token_type_ids during GPT2DoubleHeadsModel.generate() if used

and for GPT2LMHeadModel too

* Update tests to check token_type_ids usage in GPT2 models
2020-11-16 14:35:44 +01:00
04d8136bde Adding the prepare_seq2seq_batch function to ProphetNet (#8515)
* Simply insert T5Tokenizer's prepare_seq2seq_batch

* Update/Add some 'import'

* fix RunTimeError caused by '.view'

* Moves .view related error avoidance from seq2seq_trainer to inside prophetnet

* Update test_tokenization_prophetnet.py

* Format the test code with black

* Re-format the test code

* Update test_tokenization_prophetnet.py

* Add importing require_torch in the test code

* Add importing BatchEncoding in the test code

* Re-format the test code on Colab
2020-11-16 14:18:25 +01:00
931b10978e [doc] typo fix (#8535)
* [doc] typo fix

@sgugger

* Update src/transformers/modeling_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-16 08:05:30 -05:00
6db21a06ae Clearer Model Versioning Example (#8562) 2020-11-16 06:59:10 -05:00
daaa68451e Readme for Wiki Summary [Persian] bert2bert (#8558) 2020-11-16 05:04:46 -05:00
06d468d3f0 Readme for News Headline Generation (bert2bert) (#8557) 2020-11-16 05:04:38 -05:00
9b7fb8a368 Create README.md for Chinese RoBERTa Miniatures (#8550)
* Create README.md

* Update model_cards/uer/chinese_roberta_L-2_H-128/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-16 05:01:28 -05:00
f4e04cd2c6 [breaking|pipelines|tokenizers] Adding slow-fast tokenizers equivalence tests pipelines - Removing sentencepiece as a required dependency (#8073)
* Fixing roberta for slow-fast tests

* WIP getting equivalence on pipelines

* slow-to-fast equivalence - working on question-answering pipeline

* optional FAISS tests

* Pipeline Q&A

* Move pipeline tests to their own test job again

* update tokenizer to add sequence id methods

* update to tokenizers 0.9.4

* set sentencepiece as optional

* clean up squad

* clean up pipelines to use sequence_ids

* style/quality

* wording

* Switch to use_fast = True by default

* update tests for use_fast at True by default

* fix rag tokenizer test

* removing protobuf from required dependencies

* fix NER test for use_fast = True by default

* fixing example tests (Q&A examples use slow tokenizers for now)

* protobuf in main deps extras["sentencepiece"] and example deps

* fix protobuf install test

* try to fix seq2seq by switching to slow tokenizers for now

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-15 22:50:59 +01:00
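The `use_fast = True` default from the PR above, in a short sketch (model name for illustration):

```python
from transformers import AutoTokenizer

# use_fast now defaults to True; pass use_fast=False to keep the slow,
# pure-Python tokenizer where its exact behaviour is needed.
fast_tok = AutoTokenizer.from_pretrained("bert-base-uncased")
slow_tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=False)
print(type(fast_tok).__name__, type(slow_tok).__name__)
```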
24184e73c4 Rework some TF tests (#8492)
* Update some tests

* Small update

* Apply style

* Use max_position_embeddings

* Create a fake attribute

* Create a fake attribute

* Update wrong name

* Wrong TransfoXL model file

* Keep the common tests agnostic
2020-11-13 17:07:17 -05:00
f6cdafdec7 fix load weights (#8528)
* fix load weights

* delete line
2020-11-13 20:31:40 +01:00
f6f4da8dd4 Add bart-large-mnli model card (#8527) 2020-11-13 14:07:25 -05:00
725269746b Model sharing doc: more tweaks (#8520)
* More doc tweaks

* Update model_sharing.rst

* make style

* missing newline

* Add email tip

Co-authored-by: Pierric Cistac <pierric@huggingface.co>
2020-11-13 12:10:26 -05:00
9d519dabb7 Fix paths in github YAML 2020-11-13 12:04:17 -05:00
826f04576f Model templates encoder only (#8509)
* Model templates

* TensorFlow

* Remove pooler

* CI

* Tokenizer + Refactoring

* Encoder-Decoder

* Let's go testing

* Encoder-Decoder in TF

* Let's go testing in TF

* Documentation

* README

* Fixes

* Better names

* Style

* Update docs

* Choose to skip either TF or PT

* Code quality fixes

* Add to testing suite

* Update file path

* Cookiecutter path

* Update `transformers` path

* Handle rebasing

* Remove seq2seq from model templates

* Remove s2s config

* Apply Sylvain and Patrick comments

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Last fixes from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-13 11:59:30 -05:00
42e2d02e44 [T5] Bug correction & Refactor (#8518)
* fix bug

* T5 refactor

* refactor tf

* apply sylvains suggestions
2020-11-13 16:57:31 +01:00
42f63e3871 Merge remote-tracking branch 'origin/master' 2020-11-13 10:30:04 -05:00
bb03a14edd Update doc for v3.5.1 2020-11-13 10:29:58 -05:00
4df6b59318 Update deepset/roberta-base-squad2 model card (#8522)
* Update README.md

* Update README.md
2020-11-13 09:58:27 -05:00
0c9bae0934 Remove typo 2020-11-12 22:39:57 -05:00
5d80539488 Add pretraining loss computation for TF Bert pretraining (#8470)
* Add pretraining loss computation for TF Bert pretraining

* Fix labels creation

* Fix T5 model

* restore T5 kwargs

* try a generic fix for pretraining models

* Apply style

* Override the prepare method for the BERT tests
2020-11-12 14:08:26 -05:00
91a67b7506 Use LF instead of os.linesep (#8491) 2020-11-12 13:52:40 -05:00
27b3ff316a Try to understand and apply Sylvain's comments (#8458) 2020-11-12 13:43:00 -05:00
0fa0349883 fix SqueezeBertForMaskedLM (#8479) 2020-11-12 12:19:37 -05:00
7933054638 Model sharing doc (#8498)
* Model sharing doc

* Style
2020-11-12 11:53:23 -05:00
d65e0bfea3 Fix doc bug (#8500)
* fix doc bug

Signed-off-by: mymusise <mymusise1@gmail.com>

* fix example bug

Signed-off-by: mymusise <mymusise1@gmail.com>
2020-11-12 11:47:23 -05:00
924c624a46 quick fix on concatenating text to support more datasets (#8474) 2020-11-12 09:47:08 -05:00
17b1fd804f Fix typo in roberta-base-squad2-v2 model card (#8489) 2020-11-12 05:29:37 -05:00
c6c08ebf61 [model_cards] other chars than [\w\-_] not allowed anymore in model names
cc @Pierrci
2020-11-12 10:45:29 +01:00
121c24efa4 Update deploy-docs dependencies on CI to enable Flax (#8475)
* Update deploy-docs dependencies on CI to enable Flax

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added pair of ""

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-11-11 18:31:41 -05:00
81ebd70671 [s2s] distill t5-large -> t5-small (#8376)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-11 17:58:45 -05:00
a5b682329c Flax/Jax documentation (#8331)
* First addition of Flax/Jax documentation

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* make style

* Ensure input order match between Bert & Roberta

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Install dependencies "all" when building doc

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* wraps build_doc deps with ""

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing @sgugger comments.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use list to highlight JAX features.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Make style.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Let's not look to much into the future for now.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-11-11 14:53:36 -05:00
c7b6bbec5c Skip test until investigation 2020-11-11 12:59:40 -05:00
aa2a2c6579 Replaced some iadd operations on lists with proper list methods. (#8433) 2020-11-11 12:29:57 -05:00
026a2ff225 Add TFDPR (#8203)
* Create modeling_tf_dpr.py

* Add TFDPR

* Add back TFPegasus, TFMarian, TFMBart, TFBlenderBot

the last commit accidentally deleted these 4 lines, so I restored them

* Add TFDPR

* Add TFDPR

* clean up some comments, add TF input-style doc string

* Add TFDPR

* Make return_dict=False as default

* Fix return_dict bug (in .from_pretrained)

* Add get_input_embeddings()

* Create test_modeling_tf_dpr.py

The current version already passes all 27 tests!
Please see the test run at:
https://colab.research.google.com/drive/1czS_m9zy5k-iSJbzA_DP1k1xAAC_sdkf?usp=sharing

* fix quality

* delete init weights

* run fix copies

* fix repo consis

* del config_class, load_tf_weights

They should be 'pytorch only'

* add config_class back

after removing it, the test failed ... so, on Lysandre's suggestion, only "use_tf_weights = None" is removed

* newline after .. note::

* import tf, np (Necessary for ModelIntegrationTest)

* slow_test from_pretrained with from_pt=True

At the moment we don't have TF weights (since we don't have an official TF model).
Previously, I did not run the slow tests, so I missed this bug

* Add simple TFDPRModelIntegrationTest

Note that this is just a test that TF and PyTorch give approximately the same output.
However, I could not test with the official DPR repo's output yet

* upload correct tf model

* remove position_ids as missing keys

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: patrickvonplaten <patrick@huggingface.co>
2020-11-11 12:28:09 -05:00
a38d1c7c31 Example NER script predicts on tokenized dataset (#8468)
The new run_ner.py script tries to run prediction on the input
test set `datasets["test"]`, but it should be the tokenized set
`tokenized_datasets["test"]`
2020-11-11 10:28:23 -05:00
069b63844c Fix next sentence output (#8466) 2020-11-11 15:41:39 +01:00
da842e4e72 Add next sentence prediction loss computation (#8462)
* Add next sentence prediction loss computation

* Apply style

* Fix tests

* Add forgotten import

* Add forgotten import

* Use a new parameter

* Remove kwargs and use positional arguments
2020-11-11 15:02:06 +01:00
23290836c3 Fix TF Longformer (#8460) 2020-11-11 12:54:15 +01:00
8dda9167de [model_cards] harmonization 2020-11-11 12:42:50 +01:00
eb3bd73ce3 Bug fix for modeling utilities function: apply_chunking_to_forward, chunking should be in the chunking dimension, an exception was raised if the complete shape of the inputs was not the same rather than only the chunking dimension (#8391)
Co-authored-by: pedro <pe25171@mit.edu>
2020-11-10 21:33:11 +01:00
70708cca1a fix t5 token type ids (#8437) 2020-11-10 14:21:54 -05:00
9fd1f56236 [No merge] TF integration testing (#7621)
* stash

* TF Integration testing for ELECTRA, BERT, Longformer

* Trigger slow tests

* Apply suggestions from code review
2020-11-10 14:02:33 -05:00
8fe6629bb4 Add missing tasks to pipeline docstring (#8428) 2020-11-10 13:44:25 -05:00
02bdfc0251 using multi_gpu consistently (#8446)
* s|multiple_gpu|multi_gpu|g; s|multigpu|multi_gpu|g'

* doc
2020-11-10 13:23:58 -05:00
b93569457f fix t5 special tokens (#8435) 2020-11-10 18:54:17 +01:00
cace39af97 Add missing import (#8444)
* Add missing import

* Fix dummy objects
2020-11-10 18:01:32 +01:00
e21340da7a [testing utils] get_auto_remove_tmp_dir more intuitive behavior (#8401)
* [testing utils] get_auto_remove_tmp_dir default change

Now that I have been using `get_auto_remove_tmp_dir` for a while, I realized that the defaults aren't optimal.

99% of the time we want the tmp dir to be empty at the beginning of the test - so the default changes to `before=True`. This shouldn't impact any tests since this feature is only used during debugging.

* simplify things

* update docs

* fix doc layout

* style

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* better 3-state doc

* style

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* s/tmp/temporary/ + style

* correct the statement

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-10 11:57:21 -05:00
e7e1549895 Windows dev section in the contributing file (#8436)
* Add a Windows dev section in the contributing file.

* Forgotten link

* Trigger CI

* Rework description

* Trigger CI
2020-11-10 11:19:16 -05:00
8551a99232 Add auto next sentence prediction (#8432)
* Add auto next sentence prediction

* Fix style

* Add mobilebert next sentence prediction
2020-11-10 11:11:48 -05:00
c314b1fd3b [docs] improve bart/marian/mBART/pegasus docs (#8421) 2020-11-10 10:18:34 -05:00
3213d3bfae Question template (#8440)
* Remove SO from question template

* Styling
2020-11-10 10:07:56 -05:00
5d4972e608 [examples] better PL version check (#8429) 2020-11-10 09:33:23 -05:00
ae1cb4ec22 [s2s/distill] hparams.tokenizer_name = hparams.teacher (#8382) 2020-11-10 09:32:01 -05:00
aec51e5696 v3.5.0 documentation 2020-11-10 08:58:47 -05:00
818878dc88 Release: v3.5.0 2020-11-10 08:50:43 -05:00
9cebee38ad Model sharing rst (#8439)
* Update RST

* Finer details

* Re-organize

* Style
2020-11-10 08:35:11 -05:00
ad2303a401 Fix style 2020-11-10 14:28:30 +01:00
55e8d0cea2 Update links from s3 to huggingface.co 2020-11-10 14:03:29 +01:00
850afb422d Patch token classification pipeline (#8364)
* Patch token classification pipeline

* Some added tests for TokenClassificationArgumentHandler (#8366)

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2020-11-10 07:29:34 -05:00
70f622fab4 Model versioning (#8324)
* fix typo

* rm use_cdn & references, and implement new hf_bucket_url

* I'm pretty sure we don't need to `read` this file

* same here

* [BIG] file_utils.networking: do not gobble up errors anymore

* Fix CI 😇

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Tiny doc tweak

* Add doc + pass kwarg everywhere

* Add more tests and explain

cc @sshleifer let me know if better

Co-Authored-By: Sam Shleifer <sshleifer@gmail.com>

* Also implement revision in pipelines

In the case where we're passing a task name or a string model identifier

* Fix CI 😇

* Fix CI

* [hf_api] new methods + command line implem

* make style

* Final endpoints post-migration

* Fix post-migration

* Py3.6 compat

cc @stefan-it

Thank you @stas00

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-10 07:11:02 -05:00
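The `revision` argument the PR above introduces, in a minimal sketch (checkpoint and revision chosen for illustration):

```python
from transformers import AutoModel

# Pin a checkpoint to an exact revision (branch, tag, or commit hash) so a
# run stays reproducible even if the repository's default branch moves on.
model = AutoModel.from_pretrained("bert-base-uncased", revision="main")
```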
4185b115d4 Changing XLNet default from not using memories to 512 context size following paper (#8417)
* Move XLNet memory length FutureWarning

* isort

* style

* Changed default XLNet memory length
2020-11-09 20:49:51 -05:00
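A short sketch of the default change described above (assuming the config attribute is `mem_len`, as in the PR title):

```python
from transformers import XLNetConfig

# After this change the default configuration follows the paper's 512-token
# memory; mem_len=0 restores the previous no-memory behaviour.
default_config = XLNetConfig()          # mem_len now defaults to 512
legacy_config = XLNetConfig(mem_len=0)
```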
190df58560 [github CI] add a multi-gpu job for all example tests (#8341)
* add a multi-gpu job for all example tests

* run only ported tests

* rename

* explain why env is re-activated on each step

* mark all unported/checked tests with @require_torch_non_multigpu_but_fix_me

* style

* Apply suggestions from code review

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-09 15:47:38 -05:00
a39218b75b Check all models are in an auto class (#8425) 2020-11-09 15:44:54 -05:00
ef032ddd1e [docs] [testing] gpu decorators table (#8422)
* gpu decorators table

* whitespace

* Update docs/source/testing.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* whitespace

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-09 14:27:42 -05:00
a8339b9ecc Fix bart shape comment (#8423) 2020-11-09 13:25:33 -05:00
46509d1c19 [docs] remove sshleifer from issue-template :( (#8418) 2020-11-09 12:51:38 -05:00
9c83b96e62 [Tests] Add Common Test for Training + Fix a couple of bugs (#8415)
* add training tests

* correct longformer

* fix docs

* fix some tests

* fix some more train tests

* remove ipdb

* fix multiple edge case model training

* fix funnel and prophetnet

* clean gpt models

* undo renaming of albert
2020-11-09 18:24:41 +01:00
52040517b8 Deprecate old data/metrics functions (#8420) 2020-11-09 12:10:09 -05:00
d4d1fbfc5a [fsmt convert script] fairseq broke chkpt data - fixing that (#8377)
* fairseq broke chkpt data - fixing that

* style

* support older bpecodes filenames - specifically "code" in iwslt14
2020-11-09 11:57:42 -05:00
5c766ecb50 Fix typo 2020-11-09 11:50:51 -05:00
908a28894c Add new token classification example (#8340)
* Add new token classification example

* Remove txt file

* Add test

* With actual testing done

* Less warmup is better

* Update examples/token-classification/run_ner_new.py

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Address review comments

* Fix test

* Make Lysandre happy

* Last touches and rename

* Rename in tests

* Address review comments

* More run_ner -> run_ner_old

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-11-09 11:39:55 -05:00
c7cb1aa26c Bump tokenizers (#8419) 2020-11-09 11:32:10 -05:00
78d706f3ae [fsmt tokenizer] support lowercase tokenizer (#8389)
* support lowercase tokenizer

* fix arg pos
2020-11-09 10:41:39 -05:00
1e2acd0dcf Bug fix for permutation language modelling (#8409) 2020-11-09 10:23:26 -05:00
bf8625e70b add evaluate doc - trainer.evaluate returns 'epoch' from training (#8273)
* add evaluate doc

* fix style with utils/style.doc

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-09 09:00:59 -05:00
ebde57acac examples/docs: caveat that PL examples don't work on TPU (#8309) 2020-11-09 08:55:22 -05:00
76e7a44dee Fix some tooling for windows (#8359)
* Fix some tooling for windows

* Fix conflict

* Trigger CI
2020-11-09 13:50:38 +01:00
507dfb40c3 Update README.md (#8406) 2020-11-09 16:44:43 +08:00
7247d0b4ea updating tag for exbert viz (#8408) 2020-11-09 16:43:55 +08:00
4ab5617b0b comet_ml temporary fix(#8410) 2020-11-09 16:36:06 +08:00
e6d9cdaafe [s2s/distill] remove run_distiller.sh, fix xsum script (#8412) 2020-11-08 16:57:43 -05:00
66582492d3 [s2s test_finetune_trainer] failing multigpu test (#8400) 2020-11-08 16:45:40 -05:00
f62755a600 [s2s examples test] fix data path (#8398) 2020-11-08 16:44:18 -05:00
4a53e8e9e4 Fix DataCollatorForWholeWordMask again (#8397) 2020-11-08 09:53:01 -05:00
610730998f fixed default labels for QA model (#8399) 2020-11-08 09:08:14 -05:00
0b02489b2c Add gpt2-medium-chinese model card (#8402)
* Create README.md

* Update model_cards/mymusise/gpt2-medium-chinese/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-08 05:00:19 -05:00
187554366f fix md table (#8395) 2020-11-08 04:25:14 -05:00
77a257fc21 Fix DataCollatorForWholeWordMask (#8379)
* Fix DataCollatorForWholeWordMask

* Replace all tensorize_batch in data_collator.py
2020-11-07 12:51:56 -05:00
517eaf460b [make] rewrite modified_py_files in python to be cross-platform (#8371)
* rewrite modified_py_files in python to be cross-platform

* try a different way to test for variable not being ""

* improve comment
2020-11-07 18:45:16 +01:00
07708793f2 fix encoder outputs (#8368) 2020-11-06 21:03:25 +01:00
bc0d26d1de [All Seq2Seq model + CLM models that can be used with EncoderDecoder] Add cross-attention weights to outputs (#8071)
* Output cross-attention with decoder attention output

* Update src/transformers/modeling_bert.py

* add cross-attention for t5 and bart as well

* fix tests

* correct typo in docs

* add sylvains and sams comments

* correct typo

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-11-06 19:34:48 +01:00
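How the new output field is reached, in a hedged sketch (model name for illustration; `cross_attentions` is the field the PR above adds to seq2seq outputs):

```python
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
inputs = tokenizer("Hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_attentions=True, return_dict=True)
# One decoder-over-encoder attention tensor per decoder layer.
print(len(outputs.cross_attentions), outputs.cross_attentions[0].shape)
```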
30f2507a07 Update README.md (#8360)
Fix website address
2020-11-06 11:45:46 -05:00
5807ba3fa9 Fix typo (#8351) 2020-11-06 11:19:41 -05:00
82146496b6 Update README.md (#8338)
fixes
2020-11-06 06:20:58 -05:00
9e5c4d39ab Create README.md (#8312)
* Create README.md

* Update model_cards/ktrapeznikov/gpt2-medium-topic-news/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-06 06:19:59 -05:00
06ebc37967 Create README.md (#8255)
* Create README.md

Initial commit

* Updated Read me

Updated

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-06 03:34:24 -05:00
41cd031cf2 Create README.md (#8169) 2020-11-06 03:26:07 -05:00
f932ddeff5 Create README.md (#8170) 2020-11-06 03:25:52 -05:00
08b92f78fa Create README.md (#8168)
* Create README.md

* Update README.md
2020-11-06 03:25:33 -05:00
77d62e78b0 Create README.md (#8167)
* Create README.md

Telugu BERTU Readme file

* Update model_cards/kuppuluri/telugu_bertu/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-06 03:24:31 -05:00
dd6bfcaefb Create README.md (#8327) 2020-11-06 03:22:52 -05:00
ddeecf08e6 german medbert model details (#8266)
* model details

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-06 03:21:13 -05:00
96baaafd34 Create README.md (#8258) 2020-11-06 03:19:12 -05:00
185259c261 [model_cards] Update Italian BERT models and introduce new Italian XXL ELECTRA model 🎉 (#8343) 2020-11-06 03:17:03 -05:00
34bbf60bf8 Model card: GPT-2 fine-tuned on CommonGen (#8248) 2020-11-06 03:15:11 -05:00
973218fd3b Model card: CodeBERT fine-tuned for Insecure Code Detection (#8247)
* Model card: CodeBERT fine-tuned for Insecure Code Detection

* Update model_cards/mrm8488/codebert-base-finetuned-detect-insecure-code/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-06 03:13:45 -05:00
f833ca418b Model card: T5-base fine-tuned on QuaRel (#8334) 2020-11-06 03:09:55 -05:00
9edafaebef [s2s] test_bash_script.py - actually learn something (#8318)
* use decorator

* remove hardcoded paths

* make the test use more data and do real quality tests

* shave off 10 secs

* add --eval_beams 2, reformat

* reduce train size, use smaller custom dataset
2020-11-05 23:15:14 -05:00
17450397a7 Docs bart training ref (#8330)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-05 17:20:57 -05:00
d787935a14 [s2s] test_distributed_eval (#8315)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-05 16:01:15 -05:00
04e442d575 Make Trainer evaluation handle dynamic seq_length (#8336)
* Make Trainer evaluation handle dynamic seq_length

* Document behavior.

* Fix test

* Better fix

* Fixes for realsies this time

* Address review comments

* Without forgetting to save...
2020-11-05 15:13:51 -05:00
27b402cab0 Output global_attentions in Longformer models (#7562)
* Output global_attentions in Longformer models

* make style

* small refactoring

* fix tests

* make fix-copies

* add for tf as well

* remove comments in test

* make fix-copies

* make style

* add docs

* make docstring pretty

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-11-05 21:10:43 +01:00
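A sketch of reading the new `global_attentions` output (checkpoint for illustration; only tokens marked in the global attention mask attend globally):

```python
import torch
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")
inputs = tokenizer("Hello world", return_tensors="pt")
global_mask = torch.zeros_like(inputs["input_ids"])
global_mask[:, 0] = 1                       # give the first token global attention
outputs = model(**inputs, global_attention_mask=global_mask, output_attentions=True)
print(outputs.global_attentions[0].shape)   # per-layer global attention weights
```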
7abc1d96d1 no warn (#8329) 2020-11-05 11:42:24 -05:00
52f44dd6d2 change TokenClassificationTask class methods to static methods (#7902)
* change TokenClassificationTask class methods to static methods

Since we do not require self in the class methods of TokenClassificationTask, we should switch to static methods. Also, since the class TokenClassificationTask does not contain a constructor, it is currently unusable as is. Switching to static methods fixes the issue of having to document the intent of the broken class.

Also, the get_labels and read_examples_from_file methods ought to be implemented. Static method definitions are unchanged even after inheritance, which means they can be overridden, similar to other class methods.

* Trigger Build

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-11-05 09:38:30 -05:00
77c8f6c627 Corrected typo in readme (#8320) 2020-11-05 07:48:36 -05:00
226b9debb7 Update PULL_REQUEST_TEMPLATE.md 2020-11-05 09:40:15 +01:00
6f35c61f93 Update bug-report.md 2020-11-05 09:39:05 +01:00
638c0b7c50 Create README.md (#8223)
* Create README.md

* Update README.md

* Apply suggestions from code review

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-05 03:03:19 -05:00
9c4aa4ac1a Clean up data collators and datasets (#8308)
* Clean up data collators and datasets

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Remove needless clone

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-04 17:24:49 -05:00
b1d3e95eb5 Fix path to old run_language_modeling.py script (#8302) 2020-11-04 13:17:57 -05:00
b6e58db277 Speedup doc build (#8301)
* Try -j option

* Try other thing

* Bigger machine

* Test lower sphinx version

* Remove trailing space
2020-11-04 11:51:21 -05:00
969ccac2e9 adding model cards for distilled models (#8300)
* adding model cards for distil models

* forgot the languages
2020-11-04 11:41:45 -05:00
7342d9a583 Improve QA pipeline error handling (#8286)
- The issue is that with previous code we would have the following:

```python
qa_pipeline = (...)
qa_pipeline(question="Where was he born ?", context="")
-> IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)
```

The goal here is to improve this to actually return a ValueError
wherever possible.

While at it, I tried to simplify QuestionArgumentHandler's code to
make it smaller and more compact while keeping backward compatibility.
2020-11-04 11:30:42 -05:00
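After this change an empty context should surface a readable error, roughly as sketched here (illustrative; the default QA model is downloaded on first use):

```python
from transformers import pipeline

qa_pipeline = pipeline("question-answering")
try:
    qa_pipeline(question="Where was he born ?", context="")
except ValueError as err:
    # An empty context now raises a clear ValueError instead of the
    # cryptic IndexError shown above.
    print(err)
```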
38630e7a87 Update model cards of deepset/roberta-base-squad2 v1 and v2 (#8241)
* update deepset/roberta-base-squad2 to v2

* Update model_cards/deepset/roberta-base-squad2/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-11-04 11:21:25 -05:00
04561ecbe6 Model card: T5-base fine-tuned on QASC (#8299) 2020-11-04 11:20:15 -05:00
854b44aa38 Revert size change as it doesn't change anything 2020-11-04 11:13:24 -05:00
414985c427 Upgrade resource for doc building 2020-11-04 10:44:19 -05:00
cf89724696 Fix validation file loading in scripts (#8298) 2020-11-04 10:42:18 -05:00
cb966e640b [Generate Test] fix greedy generate test (#8293)
* fix greedy generate test

* delete ipdb
2020-11-04 15:44:36 +01:00
734afa37f6 Fix typo in language-modeling README.md (#8287) 2020-11-04 09:38:02 -05:00
7a7e2c2606 [blenderbot] regex fix (#8282)
Fixing:

```
src/transformers/tokenization_blenderbot.py:163: DeprecationWarning: invalid escape sequence \s
    token = re.sub("\s{2,}", " ", token)
```
2020-11-04 09:02:28 -05:00
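The warning comes from `\s` inside a plain string literal; the usual fix, as a one-line sketch, is a raw string (or an escaped backslash):

```python
import re

token = re.sub(r"\s{2,}", " ", "too   many   spaces")  # raw string, no warning
```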
29b536a73a [WIP] Ner pipeline grouped_entities fixes (#5970)
* Bug fix: NER pipeline shouldn't group separate entities of same type

* style fix

* [Bug Fix] Shouldn't group entities that are both 'B' even if they are same type
	(B-type1 B-type1) != (B-type1 I-type1)
[Bug Fix] add an option `ignore_subwords` to ignore subsequent ##wordpieces in predictions, because some models train on only the first token of a word and not on the subsequent wordpieces (the BERT NER default), so it makes sense to do the same thing at inference time.
	The simplest fix is to just group the subwords with the first wordpiece.
	[TODO] how to handle ignored scores? just set them to 0 and calculate zero invariant mean ?
	[TODO] handle different wordpiece_prefix ## ? possible approaches:
		get it from the tokenizer? but currently most tokenizers don't have a wordpiece_prefix property?
		have an _is_subword(token)
[Feature add] added option `skip_special_tokens`, because it was harder to remove them after grouping.
[Additional Changes] remove B/I prefix on returned grouped_entities
[Feature Request/TODO] Return indexes?
[Bug TODO]  can't use fast tokenizer with grouped_entities ('BertTokenizerFast' object has no attribute 'convert_tokens_to_string')

* use offset_mapping to fix [UNK] token problem

* ignore score for subwords

* modify ner_pipeline test

* modify ner_pipeline test

* modify ner_pipeline test

* ner_pipeline change ignore_subwords default to true

* add ner_pipeline ignore_subword=False test case

* fix offset_mapping index

* fix style again duh

* change is_subword and convert_tokens_to_string logic

* merge tests with new test structure

* change test names

* remove old tests

* ner tests for fast tokenizer

* fast tokenizers have convert_tokens_to_string

* Fix the incorrect merge

Co-authored-by: Ceyda Cinarel <snu-ceyda@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-11-03 17:21:04 -05:00
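A usage sketch of the options discussed in the PR above (default NER model downloaded on first use; `ignore_subwords` is the option this PR adds):

```python
from transformers import pipeline

# grouped_entities merges B-/I- pieces into whole entities; ignore_subwords
# keeps only the first wordpiece's score, matching how BERT NER models train.
ner = pipeline("ner", grouped_entities=True, ignore_subwords=True)
print(ner("Hugging Face is based in New York City"))
```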
1bb4bba53c [CIs] Better reports everywhere (#8275)
* make it possible to invoke testconf.py in both test suites without crashing on having the same option added

* perl -pi -e 's|--make_reports|--make-reports|' to be consistent with other opts

* add `pytest --make-reports` to all CIs (and artifacts)

* fix
2020-11-03 16:57:12 -05:00
7f556d2e39 Data collator for token classification (#8274)
* Add DataCollatorForTokenClassification and clean tests

* Make quality
2020-11-03 16:33:27 -05:00
6a064447f2 improve documentation of training_args.py (#8270)
* improve documentation of training_args.py

- do_train
- do_eval
- do_predict

* fix line too long

* fix style with black on training_args.py

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* fix line length with utils/style_doc

* black reformatting

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-11-03 15:57:17 -05:00
4c19f3baab Clean Trainer tests and datasets dep (#8268) 2020-11-03 15:50:55 -05:00
068e6b5edd make files independent (#8267) 2020-11-03 21:13:33 +01:00
cd360dcb26 [examples] minimal version requirement run-time check in PL (#8133)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-11-03 13:17:11 -05:00
971c638ee9 forward the worker stderr to the parent process (#8262) 2020-11-03 12:04:53 -05:00
eb6313e823 Fix Tatoeba skip 2020-11-03 10:35:00 -05:00
74f6f91a9d Updated ConversationalPipeline to work with encoder-decoder models (#8207)
* Updated ConversationalPipeline to work with encoder-decoder models (e.g. BlenderBot)

* Addition of integration test for EncoderDecoder conversation model

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-03 10:33:01 -05:00
c66ffa3a17 [FIX] TextGenerationPipeline is currently broken. (#8256)
* [FIX] TextGenerationPipeline is currently broken.

It's most likely due to #8180.
What's missing is a multi vs single string handler at the beginning of
the pipe.
And also there was no testing of this pipeline.

* Fixing Conversational tests too.
2020-11-03 10:10:22 -05:00
a1bbcf3f6c Refactoring the generate() function (#6949)
* first draft

* show design proposition for new generate method

* up

* make better readable

* make first version

* gpt2 tests pass

* make beam search for gpt2 work

* add first encoder-decoder code

* delete typo

* make t5 work

* save intermediate

* make bart work with beam search

* finish beam search bart / t5

* add default kwargs

* make more tests pass

* fix no bad words sampler

* some fixes and tests for all distribution processors

* fix test

* fix rag slow tests

* merge to master

* add nograd to generate

* make all slow tests pass

* speed up generate

* fix edge case bug

* small fix

* correct typo

* add type hints and docstrings

* fix typos in tests

* add beam search tests

* add tests for beam scorer

* fix test rag

* finish beam search tests

* move generation tests into a separate file

* fix generation tests

* more tests

* add aggressive generation tests

* fix tests

* add gpt2 sample test

* add more docstring

* add more docs

* finish doc strings

* apply some more of sylvains and sams comments

* fix some typos

* make fix copies

* apply lysandres and sylvains comments

* final corrections on examples

* small fix for reformer
2020-11-03 16:04:22 +01:00
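From a user's point of view the refactor above is invisible; a single call is routed to greedy search, sampling, or beam search depending on the arguments (model and prompt chosen for illustration):

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
inputs = tokenizer("The weather today", return_tensors="pt")
# num_beams > 1 dispatches to the new beam search path internally.
output = model.generate(**inputs, num_beams=4, max_length=20, early_stopping=True)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```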
b63beb743c Skip tatoeba tests if Tatoeba-Challenge not cloned (#8260) 2020-11-03 09:49:29 -05:00
9f1747f999 [Seq2Seq] Correct import in Seq2Seq Trainer (#8254) 2020-11-03 07:56:41 -05:00
504ff7bb12 2 SinusoidalPositionalEmbedding fixes (#8226) 2020-11-02 18:50:26 -05:00
f744b81572 add new notebooks (#8246) 2020-11-02 20:21:55 +01:00
dc26726df2 fix encoder decoder bug (#8243) 2020-11-02 20:12:34 +01:00
9a23af4aff Add XLMProphetNetTokenizer to tokenization auto (#8245) 2020-11-02 14:10:09 -05:00
5b178f3c87 Create README.md 2020-11-02 20:03:44 +01:00
e1b1b614b1 Add line by line option to mlm/plm scripts (#8240)
* Make line by line optional in run_mlm

* Add option to disable dynamic padding

* Add option to plm too and update README

* Typos

* More typos

* Even more typos

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-11-02 12:27:04 -05:00
ebec410c71 Create README.md 2020-11-02 17:53:22 +01:00
5406f31a1a Fix TensorBoardCallback for older versions of PyTorch (#8239) 2020-11-02 10:43:28 -05:00
d1ad4bff44 Fix bad import with PyTorch <= 1.4.1 (#8237) 2020-11-02 10:26:37 -05:00
3c8d401cf6 Patch reports (#8238) 2020-11-02 10:26:25 -05:00
93354bc779 doc: fix typo (#8235) 2020-11-02 08:53:17 -05:00
0c92e7d9fa Fix ignore list behavior in doctests (#8213) 2020-11-02 08:47:37 -05:00
84caa23301 Fix the behaviour of DefaultArgumentHandler (removing it). (#8180)
* Some work to fix the behaviour of DefaultArgumentHandler by removing it.

* Fixing specific pipelines argument checking.
2020-11-02 12:33:50 +01:00
00cc2d1df2 DynaBERT model cards update (#8192)
* Update README.md

* Update README.md
2020-11-02 13:19:38 +08:00
aa79aa4e7d Added 12 model cards for Indian Language Models (#8198)
* Create README.md

* added model cards
2020-11-02 13:17:43 +08:00
9bd30f7cf4 [Seq2SeqTrainer] Move import to init to make file self-contained (#8194)
* boom boom

* reverse order
2020-11-01 23:31:55 +01:00
1f12934df4 [Bug fix] Fixed value for BlenderBot pad token (#8205) 2020-11-01 10:21:57 -05:00
8f1c960ee7 Fix two bugs with --logging_first_step (#8193)
* make sure that logging_first_step evaluates

* fix bug with incorrect loss on logging_first_step

* fix style

* logging_first_step only logs, not evals
2020-10-30 16:45:38 -04:00
689ff74f99 Minor style improvements for the Flax BERT and RoBERTa examples (#8178)
* Minor style improvements:

1. Use `@nn.compact` rather than `@compact` (so as not to make it seem
   like compact is a standard Python decorator).
2. Move attribute docstrings from two `__call__` methods to comments
   on the attributes themselves. (This was probably a remnant from
   the pre-Linen version where the attributes were arguments to
   `call`.)

* Use black on the Flax modeling code
2020-10-30 16:25:39 -04:00
9eb3a410cd Remove deprecated arguments from new run_clm (#8197) 2020-10-30 15:27:20 -04:00
00112c3539 Replace swish with silu (#8166)
* Replace swish with silu

* revert nn.silu to nn.swish due to older version

* simplify optimized silu conditional and fix format

* Update activations.py

* Update activations_tf.py

* Update modeling_flax_utils.py

* Update modeling_openai.py

* add swish testcase

* add pytorch swish testcase

* Add more robust python version check

* more formatting fixes

Co-authored-by: TFUsers <TFUsers@gmail.com>
2020-10-30 15:09:10 -04:00
cdc48ce92d Finalize lm examples (#8188)
* Finish the cleanup of the language-modeling examples

* Update main README

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Apply suggestions from code review

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Propagate changes

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-10-30 14:20:18 -04:00
089cc1015e Doc fixes and filter warning in wandb (#8189) 2020-10-30 12:37:34 -04:00
566b083eb1 TFMarian, TFMbart, TFPegasus, TFBlenderbot (#7987)
* Start plumbing

* Marian close

* Small stubs for all children

* Fixed bart

* marian working

* pegasus test is good, but failing

* Checkin tests

* More model files

* Subtle marian, pegasus integration test failures

* Works well

* rm print

* boom boom

* Still failing model2doc

* merge master

* Equivalence test failing, all others fixed

* cleanup

* Fix embed_scale

* Cleanup marian pipeline test

* Undo extra changes

* Smaller delta

* Cleanup model testers

* undo delta

* fix tests import structure

* cross test decorator

* Cleaner set_weights

* Respect authorized_unexpected_keys

* No warnings

* No warnings

* style

* Nest tf import

* black

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* functional dropout

* fixup

* Fixup

* style_doc

* embs

* shape list

* delete slow force_token_id_to_be_generated func

* fixup

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-30 11:23:16 -04:00
6279072f5f Fix typo: s/languaged/language/ (#8165) 2020-10-30 11:22:03 -04:00
10f8c63620 Ci test tf super slow (#8007)
* Test TF GPU CI

* Change cache

* Fix missing torch requirement

* Fix some model tests


Style

* LXMERT

* MobileBERT

* Longformer skip test

* XLNet

* The rest of the tests

* RAG goes OOM in multi gpu setup

* YAML test files

* Last fixes

* Skip doctests

* Fill mask tests

* Yaml files

* Last test fix

* Style

* Update cache

* Change ONNX tests to slow + use tiny model
2020-10-30 10:25:48 -04:00
7e36deec7a Fixing some warnings in DeBerta (#8176)
* Fixing some warnings in DeBerta

* Fixing docs with their rewritten version.
2020-10-30 09:15:41 -04:00
0538820737 [CI] Better reports #2 (#8163) 2020-10-29 19:30:05 -04:00
9a21b50614 Fix eval ref miss in Chinese WWM. (#8115)
* ADD: add whole word mask proxy for both eng and chinese

* MOD: adjust format

* MOD: reformat code

* MOD: update import

* MOD: fix bug

* MOD: add import

* MOD: fix bug

* MOD: decouple code and update readme

* MOD: reformat code

* Update examples/language-modeling/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* change wwm to whole_word_mask

* reformat code

* reformat

* format

* Code quality

* ADD: update chinese ref readme

* MOD: small changes

* MOD: small changes2

* update readme

* fix eval ref file miss bug

* format file

* MOD: move ref code to contrib

* MOD: add delimiter check

* reformat code

* reformat code

* Update examples/language-modeling/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-29 17:08:39 -04:00
fdf893c441 Fix typo: indinces -> indices (#8159)
* Fix typo: indinces -> indices

* Fix some more

* Fix some more

* Fix some more

* Fix CI
2020-10-29 17:04:20 -04:00
c83cec44f8 improve error checking (#8157) 2020-10-29 14:05:24 -04:00
691176283d Add a template for examples and apply it for mlm and plm examples (#8153)
* Add a template for example scripts and apply it to mlm

* Formatting

* Fix test

* Add plm script

* Add a template for example scripts and apply it to mlm

* Formatting

* Fix test

* Add plm script

* Add a template for example scripts and apply it to mlm

* Formatting

* Fix test

* Add plm script

* Styling
2020-10-29 13:38:11 -04:00
49e4fece5c [s2s] distillBART docs for paper replication (#8150) 2020-10-29 12:01:15 -04:00
acf56408d8 Smarter prediction loop and no- -> no_ in console args (#8151)
* Smarter prediction loop and no- -> no_ in console args

* Fix test
2020-10-29 10:56:25 -04:00
b0f1c0ee30 Document tokenizer_class in configurations (#8152) 2020-10-29 10:43:45 -04:00
969859d5f6 Fix doc errors and typos across the board (#8139)
* Fix doc errors and typos across the board

* Fix a typo

* Fix the CI

* Fix more typos

* Fix CI

* More fixes

* Fix CI

* More fixes

* More fixes
2020-10-29 10:33:33 -04:00
4731a00c3e Update widget examples. (#8149)
Co-authored-by: yantan <yantan@effyic.com>
2020-10-29 08:49:16 -04:00
238876068c Update README.md (#8090) 2020-10-29 08:31:32 -04:00
e566adc09c Add model_cards (#7969)
* add readme

* add readmes

* Add metadata
2020-10-29 08:29:54 -04:00
cc8941d881 Create README.md (#8089) 2020-10-29 08:23:43 -04:00
234a6dc388 Create README.md (#8088)
* Create README.md

* metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-29 08:23:30 -04:00
5d76859531 Create README.md (#8075)
* Create README.md

* Update model_cards/gurkan08/bert-turkish-text-classification/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-29 08:22:33 -04:00
b215090eed Add two model_cards: ethanyt/guwenbert-base and ethanyt/guwenbert-large (#8041) 2020-10-29 08:21:54 -04:00
ba2ad3a98a Model Card for Gujarati-XLM-R-Base (#8038)
* Add model card for Gujarati-XLM-R-Base

* Update README.md

Add the model card for the Gujarati-XLM-R-Base.

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-29 08:21:11 -04:00
52cea7de75 Create README.md (#8017) 2020-10-29 08:19:47 -04:00
ff82a2aa93 Create README.md (#8015) 2020-10-29 08:19:35 -04:00
0a3b9733cb Add model_cards for DynaBERT (#8012)
* Update README.md

* Add dynabert_overview.png

* Update README.md

* Create README.md

* Add dynabert_overview.png

* Update README.md

* Update README.md

* Delete dynabert_overview.png

* Update README.md

* Delete dynabert_overview.png

* Update README.md
2020-10-29 08:19:17 -04:00
afa21504b1 add tags (#8147) 2020-10-29 12:45:55 +01:00
825925dfaa [s2s test] cleanup (#8131) 2020-10-28 16:50:36 -04:00
e477eb919f Fix typo in AutoModelForMaskedLM docs (#8129) 2020-10-28 15:52:28 -04:00
5e24982e58 Upgrade PyTorch Lightning to 1.0.2 (#7852)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-10-28 14:59:14 -04:00
1b6c8d4811 Update CI cache (#8126) 2020-10-28 13:59:43 -04:00
378142afdf Rename add_start_docstrings_to_callable (#8120) 2020-10-28 13:42:31 -04:00
6241c873cd Document the various LM Auto models (#8118) 2020-10-28 13:41:56 -04:00
5193172f12 [DOC] Improve pipeline() docstrings for config and tokenizer (#8123)
* Improve pipeline() docstrings

* make style

* Update wording for config
2020-10-28 13:26:12 -04:00
b4cacb7a63 fix(trainer_callback]: typo (#8121) 2020-10-28 12:15:30 -04:00
5423f2a9d4 [testing] port test_trainer_distributed to distributed pytest + TestCasePlus enhancements (#8107)
* move the helper code into testing_utils

* port test_trainer_distributed to work with pytest

* improve docs

* simplify notes

* doc

* doc

* style

* doc

* further improvements

* torch might not be available

* real fix

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-28 11:51:32 -04:00
47dfa65b0c New run_clm script (#8105)
* New run_clm script

* Formatting

* More comments

* Remove unused imports

* Apply suggestions from code review

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Address review comments

* Change link to the hub

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-10-28 10:38:58 -04:00
8065fea870 [gh actions] run artifacts job always (#8110) 2020-10-28 01:45:19 -04:00
1e01db3579 Remove header 2020-10-27 17:36:13 -04:00
b715e40ced Fix typo 2020-10-27 17:34:05 -04:00
41cc5f3f59 Move installation instructions to the top (#8106) 2020-10-27 17:32:20 -04:00
556709ad92 rm multiclass option from model card 2020-10-27 17:11:43 -04:00
c5f3149f95 Adjust setup so that all extras run on Windows (#8102) 2020-10-27 14:39:49 -04:00
995006eabb Add AzureML in integrations via dedicated callback (#8062)
* first attempt to add AzureML callbacks

* func arg fix

* var name fix, but still won't fix error...

* fixing as in https://discuss.huggingface.co/t/how-to-integrate-an-azuremlcallback-for-logging-in-azure/1713/2

* Avoid lint check of azureml import

* black compliance

* Make isort happy

* Fix point typo in docs

* Add AzureML to Callbacks docs

* Attempt to make sphinx happy

* Format callback docs

* Make documentation style happy

* Make docs compliant to style

Co-authored-by: Davide Fiocco <davide.fiocco@frontiersin.net>
2020-10-27 14:21:54 -04:00
a0906068cf Fully remove codecov (#8093) 2020-10-27 14:14:13 -04:00
3e58b6b7b8 infer entailment label id on zero shot pipeline (#8059)
* add entailment dim argument

* rename dim -> id

* fix last name change, style

* rm arg, auto-infer only

* typo

* rm superfluous import
2020-10-27 14:09:55 -04:00
9fefdb0751 DEP: pinned sentencepiece to 0.1.91 in setup.py (#8069)
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-27 14:09:31 -04:00
edd3721cd4 update/add setup targets (#8076) 2020-10-27 13:54:57 -04:00
55bc0c599a [model_cards] Switch to a more explicit domain for the media bucket 2020-10-27 18:08:05 +01:00
7bff0af0a4 Fix a bug for CallbackHandler.callback_list (#8052)
* Fix callback_list

* Add test

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>

* Fix test

Signed-off-by: harupy <17039389+harupy@users.noreply.github.com>
2020-10-27 10:37:04 -04:00
8e28c327fc Fix assertion error message for MLflowCallback (#8091) 2020-10-27 10:34:51 -04:00
3220f21f14 Styling fix 2020-10-27 10:09:51 -04:00
286dc19a4f Fix IterableDataset with __len__ in Trainer (#8095) 2020-10-27 09:52:35 -04:00
d93acd6f13 Move style_doc to extra_quality_checks (#8081) 2020-10-27 09:42:07 -04:00
bfd5e370a7 [CI] generate separate report files as artifacts (#7995)
* better reports

* a whole bunch of reports in their own files

* clean up

* improvements

* github artifacts experiment

* style

* complete the report generator with multiple improvements/fixes

* fix

* save all reports under one dir for easy upload

* can remove temp failing tests

* doc fix

* some cleanup
2020-10-27 09:25:07 -04:00
33f6ef733a Fix DeBERTa docs (#8092)
* Fix DeBERTa docs

* Tokenizer and config
2020-10-27 09:07:41 -04:00
c42596bc07 Doc styling fixes (#8074)
* Fix a few docstrings

* More fixes

* Styling
2020-10-27 07:54:50 -04:00
1496931b49 Fix comet_ml import and add ensure availability (#7933)
* Fix comet_ml import and add ensure availability

* Make isort happy

* Make flake8 happy

* Don't show comet_ml warn if COMET_MODE=DISABLED

* Make isort happy
2020-10-27 07:31:07 -04:00
985bba9096 fix doc bug (#8082)
Signed-off-by: mymusise <mymusise1@gmail.com>
2020-10-27 07:29:25 -04:00
08f534d2da Doc styling (#8067)
* Important files

* Styling them all

* Revert "Styling them all"

This reverts commit 7d029395fdae8513b8281cbc2a6c239f8093503e.

* Styling them for realsies

* Fix syntax error

* Fix benchmark_utils

* More fixes

* Fix modeling auto and script

* Remove new line

* Fixes

* More fixes

* Fix more files

* Style

* Add FSMT

* More fixes

* More fixes

* More fixes

* More fixes

* Fixes

* More fixes

* More fixes

* Last fixes

* Make sphinx happy
2020-10-26 18:26:02 -04:00
04a17f8550 Doc fixes in preparation for the docstyle PR (#8061)
* Fixes in preparation for doc styling

* More fixes

* Better syntax

* Fixes

* Style

* More fixes

* More fixes
2020-10-26 15:01:09 -04:00
8bbb74f211 [Model Card] new cross lingual sentence model for German and English (#8026)
* mc for new cross lingual sentence model

* fat text

* url spelling fix

* more url spelling fixes

* slight thanks change

* small improvements in text

* multilingual word xchange

* change colab link

* xval fold number

* add model links

* line break in model names

* Update README.md

* Update README.md

* new examples link

* new examples link

* add evaluation dataset name

* add more about multi lingual

* typo fix

* typo

* typos

* hyperparameter typos

* hyperparameter typo

* add metadata

* add metadata

* Update README.md

* typo fix

* Small improvement
2020-10-26 14:48:26 -04:00
3a10764574 Fix TF training arguments instantiation (#8063) 2020-10-26 14:39:25 -04:00
bc9332b545 [TF] from_pt should respect authorized_unexpected_keys (#8056) 2020-10-26 13:53:27 -04:00
7ff7c4934b fixing crash (#8057) 2020-10-26 13:19:10 -04:00
cbad90d86d Fix + Test (#8049) 2020-10-26 12:32:27 -04:00
664c7ec453 [Seq2Seq Trainer] Make sure padding is implemented for models without pad_token (#8043)
* make sure padding is implemented for models without padding tokens as well

* add better error message

* add better warning

* remove results files

* Update examples/seq2seq/seq2seq_trainer.py

* remove unnecessary copy line

* correct usage of labels

* delete test files
2020-10-26 17:28:16 +01:00
098ddc2244 Update README.md (#8050)
--wwm can't be used as an argument with run_language_modeling.py and should be changed to --whole_word_mask
2020-10-26 12:00:18 -04:00
fbcddb8544 add multiclass field to default zero shot example 2020-10-26 11:07:51 -04:00
a9ac1db276 Minor error fix of 'bart-large-cnn' details in the pretrained_models doc (#8053) 2020-10-26 11:05:16 -04:00
fc2d6eac3c Minor typo fixes to the preprocessing tutorial in the docs (#8046)
* Fix minor typos

Fix minor typos in the docs.

* Update docs/source/preprocessing.rst

Clearer data structure description.

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-26 10:22:29 -04:00
b0a907615a minor model card description updates (#8051) 2020-10-26 10:04:20 -04:00
c48b16b8da Mlflow integration callback (#8016)
* Add MLflow integration class

Add integration code for MLflow in integrations.py along with the code
that checks that MLflow is installed.

* Add MLflowCallback import

Add import of MLflowCallback in trainer.py

* Handle model argument

Allow the callback to handle model argument and store model config items as hyperparameters.

* Log parameters to MLflow in batches

MLflow cannot log more than a hundred parameters at once.
Code added to split the parameters into batches of 100 items and log the batches one by one.

* Fix style

* Add docs on MLflow callback

* Fix issue with unfinished runs

The "fluent" api used in MLflow integration allows only one run to be active at any given moment. If the Trainer is disposed off and a new one is created, but the training is not finished, it will refuse to log the results when the next trainer is created.

2020-10-26 09:41:58 -04:00
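A sketch of the batching described above, assuming an `mlflow` install; the chunk size mirrors the per-call limit mentioned in the commit, and the helper name is mine, not the callback's:

```
import mlflow

MAX_PARAMS_PER_BATCH = 100  # MLflow rejects log_params calls above this size

def log_params_in_batches(params):
    """Split a large parameter dict and log it 100 items at a time."""
    items = list(params.items())
    for i in range(0, len(items), MAX_PARAMS_PER_BATCH):
        mlflow.log_params(dict(items[i : i + MAX_PARAMS_PER_BATCH]))
```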
8be9cb0aef Tiny TF Bart fixes (#8023) 2020-10-26 09:29:56 -04:00
077478637d Fix label name in DataCollatorForNextSentencePrediction test (#8048) 2020-10-26 09:23:12 -04:00
8bbe8247f1 Cleanup pytorch tests (#8033) 2020-10-26 08:59:06 -04:00
20a0894d1a update version for scipy (#7998) 2020-10-26 08:56:56 -04:00
f20aec1de5 fsmt slow test uses lists (#8031) 2020-10-26 08:32:36 -04:00
101186bc1f [docs] [testing] distributed training (#7993)
* distributed training

* fix

* fix formatting

* wording
2020-10-26 08:15:05 -04:00
c153bcc5c8 Add mixed precision evaluation (#8036)
* Add mixed precision evaluation

* use original flag
2020-10-26 08:12:31 -04:00
9aa2826687 Minor typo fixes to the tokenizer summary (#8045)
Minor typo fixes to the tokenizer summary
2020-10-26 08:08:33 -04:00
829b9f8cc3 Remove codecov.yml 2020-10-26 08:05:02 -04:00
79eb391586 [tokenizers] Fixing #8001 - Adding tests on tokenizers serialization (#8006)
* fixing #8001

* make T5 tokenizer serialization more robust - style
2020-10-26 10:27:48 +01:00
7087d9b1c0 [model_cards] bert-base-danish Fixup
#8030
2020-10-26 09:38:21 +01:00
efc4a21ffa Fixup #8025
Close #8030
2020-10-26 09:32:07 +01:00
5148f43309 [Model Card] DJSammy/bert-base-danish-uncased_BotXO,ai (#8025)
* Create README.md

* Update README.md
2020-10-25 15:20:46 +08:00
38f6739cd6 [doc prepare_seq2seq_batch] fix docs (#8013) 2020-10-24 15:33:47 -04:00
00602f7840 Create model card for pre-trained NLI models. (#7864)
* Create README.md

* Update model_cards/ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Add Meta information for dataset identifier.

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-24 03:16:07 -04:00
3c682ea15c [Examples] Allow EncoderDecoderModels to be trained with Seq2Seq (#7809)
* Make Seq2Seq Trainer more similar to Trainer

* fix typo

* fix seq2seq trainer

* remove from tests

* remove lock

* remove train files

* delete test files

* correct typo

* check at init

* make sure trainer is not slowed down on TPU

* correct isort

* remove use cache

* fix use cache

* add last use cache = false
2020-10-23 23:05:51 +02:00
59b5953d89 Create model card for bert-italian-cased-finetuned-pos (#8003)
* Create README.md

* Update model_cards/sachaarbonel/bert-italian-cased-finetuned-pos/README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-23 10:58:05 -04:00
6e07c1f446 Add model cards for DynaBERT (#7999) 2020-10-23 10:53:53 -04:00
43fdafef89 Create README.md (#7997) 2020-10-23 10:53:37 -04:00
627e813734 Added model cards for Tagalog ELECTRA models (#7996)
Co-authored-by: Jan Christian Blaise Cruz <jcblaise@Blaises-MacBook-Pro.local>
2020-10-23 10:52:21 -04:00
9865e1fe52 model card for German Sentence Embeddings V2 (#7952)
* model card German Sentence Embeddings V2

- for German RoBERTa for Sentence Embeddings V2
- marked old as outdated

* small correction

* small improvement in description

* small spelling fix

* spelling fix

* add evaluation results

* spearman explanation

* add number of trials
2020-10-23 10:45:54 -04:00
d39da5a2ab Handling longformer model_type (#7990)
Updating the run_squad training script to handle the "longformer" `model_type`. The longformer is trained in the same way as RoBERTa, so I've added the "longformer" `model_type` (that's the right huggingface name for the Longformer model, right?) everywhere there was a "roberta" `model_type` reference. The longformer (like RoBERTa) doesn't use `token_type_ids` (as I understand from looking at the [longformer notebook](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb)), which is what gets updated after this change.

This fix might be related to [this issue](https://github.com/huggingface/transformers/issues/7249) with SQuAD training when using run_squad.py
2020-10-23 10:34:06 -04:00
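A sketch of the pattern the change above describes — model types whose tokenizers produce no usable `token_type_ids` share one code path, and "longformer" joins "roberta" in it. Names are illustrative; run_squad.py structures this differently:

```
MODEL_TYPES_WITHOUT_TOKEN_TYPE_IDS = {"roberta", "camembert", "longformer"}

def build_inputs(input_ids, attention_mask, token_type_ids, model_type):
    inputs = {"input_ids": input_ids, "attention_mask": attention_mask}
    if model_type not in MODEL_TYPES_WITHOUT_TOKEN_TYPE_IDS:
        inputs["token_type_ids"] = token_type_ids
    return inputs

print(build_inputs([0, 1], [1, 1], [0, 0], "longformer").keys())
# dict_keys(['input_ids', 'attention_mask'])
```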
5e323017a4 Fix BatchEncoding.word_to_tokens for removed tokens (#7939) 2020-10-23 10:29:37 -04:00
4acfd1a8dc [Reformer] remove reformer pad_token_id (#7991)
* remove reformer pad_token_id

* fix pegasus
2020-10-23 10:29:15 -04:00
3a40cdf58d [tests|tokenizers] Refactoring pipelines test backbone - Small tokenizers improvements - General tests speedups (#7970)
* WIP refactoring pipeline tests - switching to fast tokenizers

* fix dialog pipeline and fill-mask

* refactoring pipeline tests backbone

* make large tests slow

* fix tests (tf Bart inactive for now)

* fix doc...

* clean up for merge

* fixing tests - remove bart from summarization until there is TF

* fix quality and RAG

* Add new translation pipeline tests - fix JAX tests

* only slow for dialog

* Fixing the missing TF-BART imports in modeling_tf_auto

* spin out pipeline tests in separate CI job

* adding pipeline test to CI YAML

* add slow pipeline tests

* speed up tf and pt join test to avoid redoing all the standalone pt and tf tests

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/pipelines.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/pipelines.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add require_torch and require_tf in is_pt_tf_cross_test

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-23 15:58:19 +02:00
88b3a91e61 Handle the case when title is None (#7941) 2020-10-23 15:54:45 +02:00
023f0f3708 [s2s trainer] tests to use distributed on multi-gpu machine (#7965) 2020-10-22 17:26:22 -04:00
64b24bb3c2 change zero shot widget default example (#7992) 2020-10-22 15:19:41 -06:00
0397619ac6 Move NoLayerEmbedTokens (#7945)
* Move NoLayerEmbedTokens

* TFWrappedEmbeddings

* Add comment
2020-10-22 16:13:49 -04:00
5ac07513e0 [gh ci] less output ( --durations=50) (#7989) 2020-10-22 16:10:15 -04:00
5ae935d233 Reload checkpoint (#7984)
* Fix checkpoint loading in Trainer

* Fix typo
2020-10-22 15:48:52 -04:00
467573ddde Fix documentation redirect 2020-10-22 15:37:51 -04:00
077c99bb5f add zero shot pipeline tags & examples (#7983)
* add zero shot pipeline tags

* rm default and fix yaml format

* rm DS_Store

* add bart large default

* don't add more typos

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* add multiple multilingual examples

* improve multilingual examples for single-label

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-22 13:01:23 -06:00
06fc3954a1 Only log total_flos at the end of training (#7981)
* Only log total_flos at the end of training

* Fix test
2020-10-22 14:26:55 -04:00
ff65beafa3 FillMaskPipeline: support passing top_k on __call__ (#7971)
* FillMaskPipeline: support passing top_k on __call__

Also move from topk to top_k

* migrate to new param name in tests

* Review from @sgugger
2020-10-22 12:54:25 -04:00
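Usage sketch for the renamed parameter — `top_k` can now be supplied per call rather than only at pipeline construction time:

```
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="distilroberta-base")
# Request the 3 best candidates for this one call only:
print(fill_mask("The capital of France is <mask>.", top_k=3))
```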
2e5052d4f1 New run glue script (#7917)
* Start simplification

* More progress

* Finished script

* Address comments and update tests instructions

* Wrong test

* Accept files as inputs and fix test

* Update src/transformers/trainer_utils.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Fix labels and add combined score

* Add special labels

* Update TPU command

* Revert to old label strategy

* Use model labels

* Fix for STS-B

* Styling

* Apply suggestions from code review

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Code styling

* Fix review comments

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-10-22 11:42:22 -04:00
18ce6b8ff3 Fixing the "translation", "translation_XX_to_YY" pipelines. (#7975)
* Actually make the "translation", "translation_XX_to_YY" task behave correctly.

Background:
- Currently "translation_cn_to_ar" does not work. (only 3 pairs are
supported)
- Some models, contain in their config the correct values for the (src,
tgt) pair they can translate. It's usually just one pair, and we can
infer it automatically from the `model.config.task_specific_params`. If
it's not defined we can still probably load the TranslationPipeline
nevertheless.

Proposed fix:
- A simplified version of what could become a more general `parametrized`
task: "translation" + (src, tgt) is what we need in this instance and in
the general case. The way we go about it for now is simply parsing
"translation_XX_to_YY". If more cases of parametrized tasks arise, we should
preferably move toward something closer to what `datasets` proposes, namely
a secondary argument (`task_options`?) that is close to what the task requires.
- Should be backward compatible in all cases; for instance
`pipeline(task="translation_en_to_de")` should work out of the box.
- Should provide a warning when a specific translation pair has been
selected on behalf of the user using
`model.config.task_specific_params`.

* Update src/transformers/pipelines.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-22 17:16:21 +02:00
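A simplified sketch of the parsing described above — extracting the (src, tgt) pair from a "translation_XX_to_YY" task name, with plain "translation" falling through to `model.config.task_specific_params`:

```
import re

def parse_translation_task(task):
    """Return (src, tgt) for 'translation_XX_to_YY', or None for plain 'translation'."""
    match = re.match(r"^translation_([a-z]{2})_to_([a-z]{2})$", task)
    return (match.group(1), match.group(2)) if match else None

print(parse_translation_task("translation_en_to_de"))  # ('en', 'de')
print(parse_translation_task("translation"))           # None -> use task_specific_params
```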
901e9b8eda Remove the else branch adding 0 to the hidden state if token_type_embeds is None. (#7977)
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-10-22 16:41:41 +02:00
f34372a9ff [PretrainedConfig] Fix save pretrained config for edge case (#7943)
* fix config save

* add test

* add config class variable and another test

* line break

* fix fsmt and typo

* god am I making many errors today :-/

* Update src/transformers/configuration_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-22 15:39:01 +02:00
cc2e312ca3 adding text classification with DistilBERT/tf notebook (#7964)
Looking at the current community notebooks, it seems that few are targeted for absolute beginners and even fewer are written with TensorFlow. This notebook describes absolutely everything a beginner would need to know, including how to save/load their model and use it for new predictions (this is often omitted in tutorials)

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-22 09:30:50 -04:00
a16e568f22 Add whole word mask support for lm fine-tune (#7925)
* ADD: add whole word mask proxy for both eng and chinese

* MOD: adjust format

* MOD: reformat code

* MOD: update import

* MOD: fix bug

* MOD: add import

* MOD: fix bug

* MOD: decouple code and update readme

* MOD: reformat code

* Update examples/language-modeling/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update examples/language-modeling/run_language_modeling.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* change wwm to whole_word_mask

* reformat code

* reformat

* format

* Code quality

* ADD: update chinese ref readme

* MOD: small changes

* MOD: small changes2

* update readme

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2020-10-22 09:19:00 -04:00
64b4d25cf3 [fsmt test] basic config test with online model + super tiny model (#7860)
* basic config test with online model

* typo

* style

* better test
2020-10-22 09:14:54 -04:00
3479787edc Disable inference API for t5-11b (#7978) 2020-10-22 09:08:37 -04:00
a7db81c33f [model_card] t5-11b move disclaimer to top of page
cc @Narsil @patrickvonplaten
2020-10-22 14:35:31 +02:00
f774b2e8c4 support relative path for best_model_checkpoint (#7973) 2020-10-22 07:55:31 -04:00
8348105692 [testing] slow tests should be marked as slow (#7895)
* slow tests should be slow

* exception note

* style

* integrate LysandreJik's notes with some expansions

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* another slow test

* fix link, and prose

* clarify.

* note from Sam

* typo

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-22 06:34:05 -04:00
95792a948e Herbert tokenizer auto load (#7968) 2020-10-22 05:48:29 -04:00
4abb7ffc18 added qg evaluation notebook (#7958)
* added qg evaluation notebook

* Update notebooks/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-10-22 11:02:12 +02:00
8b38173398 [seq2seq testing] multigpu test run via subprocess (#7281)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-10-21 17:20:53 -04:00
f8d3695e8c [model_cards] camembert: dataset = oscar
Hat/tip @pjox
2020-10-21 14:17:56 -04:00
16da877139 fix 'encode_plus' docstring for 'special_tokens_mask' (0s and 1s were reversed) (#7949)
* fix docstring for 'special_tokens_mask'

* revert auto formatter changes

* revert another auto format

* revert another auto format
2020-10-21 13:57:44 -04:00
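A quick check of the corrected convention — in the returned `special_tokens_mask`, 1 marks an added special token ([CLS]/[SEP]) and 0 marks a regular sequence token:

```
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer.encode_plus("hello world", return_special_tokens_mask=True)
print(enc["special_tokens_mask"])  # [1, 0, 0, 1] -> [CLS] hello world [SEP]
```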
52decab371 fix test (#7947) 2020-10-21 19:06:23 +02:00
9b6610f7f6 [ProphetNet] Correct Doc string example (#7944)
* correct xlm prophetnet auto model and examples

* fix line-break docs
2020-10-21 17:27:20 +02:00
e174bfeb34 TensorBoard/Wandb/optuna/raytune integration improvements. (#7935)
Improved TensorBoard and Wandb integration, as well as optuna and ray/tune support, with minor modifications to trainer core code.
2020-10-21 17:18:52 +02:00
bf162ce8ca Add AI-SOCO models (#7867) 2020-10-21 09:24:43 -04:00
58fb25f25b Create README.md (#7857)
* Create README.md

model card for cambridgeltl/BioRedditBERT-uncased.

* Update model_cards/cambridgeltl/BioRedditBERT-uncased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-21 08:41:41 -04:00
2b07ec7823 Model card for German BERT fine-tuned for LER/NER (#7855) 2020-10-21 08:31:41 -04:00
35d2ad5b83 Create README.md (#7819) 2020-10-21 08:30:01 -04:00
bdda4f2249 Create README.md (#7625)
* Create README.md

* Update model_cards/lanwuwei/GigaBERT-v3-Arabic-and-English/README.md

* Update model_cards/lanwuwei/GigaBERT-v3-Arabic-and-English/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-21 08:29:39 -04:00
8e23749649 Add missing comma (#7870) 2020-10-21 08:24:12 -04:00
3eaa007d78 Create README.md (#7899) 2020-10-21 08:23:55 -04:00
758572cad8 [model_cards] move hatmimoha/arabic-ner to correct location
see 16d3cc187d and https://github.com/huggingface/transformers/pull/7836
2020-10-21 14:13:17 +02:00
57516c0cc8 [multiple models] skip saving/loading deterministic state_dict keys (#7878)
* make the save_load special key tests common

* handle mbart

* cleaner solution

* fix

* move test_save_load_missing_keys back into fsmt for now

* restore

* style

* add marian

* add pegasus

* blenderbot

* revert - no static embed
2020-10-21 08:06:07 -04:00
006a16483f update model cards of Illuin models (#7930) 2020-10-21 08:05:53 -04:00
16d3cc187d model card for arabic-ner model (#7836)
* Create README.md

README file for the Arabic NER model

* Update README.md

* Update README.md

* Update hatmimoha/arabic-ner/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-21 08:02:40 -04:00
829842159e Add TFBartForConditionalGeneration (#5411)
* half done

* doc improvement

* Cp test file

* brokedn

* broken test

* undo some mess

* ckpt

* borked

* Halfway

* 6 passing

* boom boom

* Much progress but still 6

* boom boom

* merged master

* 10 passing

* boom boom

* Style

* no t5 changes

* 13 passing

* Integration test failing, but not gibberish

* Frustrated

* Merged master

* 4 fail

* 4 fail

* fix return_dict

* boom boom

* Still only 4

* prepare method

* prepare method

* before delete classif

* Skip tests to avoid adding boilerplate

* boom boom

* fast tests passing

* style

* boom boom

* Switch to supporting many input types

* remove FIXMENORM

* working

* Fixed past_key_values/decoder_cached_states confusion

* new broken test

* Fix attention mask kwarg name

* undo accidental

* Style and reviewers

* style

* Docs and common tests

* Cleaner assert messages

* copy docs

* style issues

* Sphinx fix

* Simplify caching logic

* test does not require torch

* copy _NoLayerEmbedTokens

* Update src/transformers/modeling_tf_bart.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update tests/test_modeling_tf_bart.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_bart.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_bart.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_bart.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Line length and dont document None

* Add pipeline test coverage

* assert msg

* At parity

* Assert messages

* mark slow

* Update compile test

* back in init

* Merge master

* Fix tests

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-21 13:10:16 +02:00
5cd9e2cba1 Update README.md 2020-10-21 12:43:42 +02:00
220b5f97ca Create README.md 2020-10-21 12:34:46 +02:00
8ffd7fb12d Update README.md 2020-10-21 12:27:09 +02:00
613ab364eb Update README.md 2020-10-21 12:23:17 +02:00
f7eb17dc47 Update README.md 2020-10-21 12:19:44 +02:00
29792864cb [ProphetNet] Add Question Generation Model + Test (#7942)
* new prophetnet model

* correct name

* make style
2020-10-21 11:49:58 +02:00
13842e413c PPL guide minor code snippet fix (#7938) 2020-10-20 16:17:39 -06:00
0e24e4c136 [s2s] create doc for pegasus/fsmt replication (#7934) 2020-10-20 15:07:52 -04:00
96f4828ace Respect the 119 line chars (#7928) 2020-10-20 11:02:47 -04:00
ef0ac063c9 Docs for v3.4.0 2020-10-20 16:29:00 +02:00
eb0e0ce2ad Release: v3.4.0 2020-10-20 16:22:26 +02:00
0264048660 Update README.md 2020-10-20 16:13:49 +02:00
ffd675b42c add summary (#7927) 2020-10-20 10:11:02 -04:00
5547b40b13 labels and decoder_input_ids to Glossary (#7906)
* labels and decoder_input_ids to Glossary

* Formatting fixes

* Update docs/source/glossary.rst

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* sam's comments

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-10-20 09:50:47 -04:00
f3312515b7 Add note for WikiSplit 2020-10-20 15:42:29 +02:00
0724c0f3a2 Fix EncoderDecoder WikiSplit Example 2020-10-20 15:13:22 +02:00
ca37db0559 [flax] fix repo_check (#7914)
* [flax] fix repo_check

Unless this is actually a problem, this adds `modeling_flax_utils` to the ignore list; otherwise it currently expects a 'tests/test_modeling_flax_utils.py' to exist for it.
for context please see: https://github.com/huggingface/transformers/pull/3722#issuecomment-712360415

* fix 2 more issues

* merge https://github.com/huggingface/transformers/pull/7919/
2020-10-20 07:55:40 -04:00
048dd6cf10 Fix bug in _sorted_checkpoints (#7880)
I'm using transformers 3.3.1 and ran a training script with `--save_total_limit 3`. I hit the exception below, and after debugging the code I found that it wrongly tries to index into the `best_model_checkpoint`'s *str* rather than the `sorted_checkpoints` array. When running without the fix I got this exception:

```
Traceback (most recent call last):
  File "/<HOME>/.conda/envs/transformers/lib/python3.7/site-packages/transformers/trainer.py", line 921, in _save_training
    self._rotate_checkpoints(use_mtime=True)
  File "/<HOME>/.conda/envs/transformers/lib/python3.7/site-packages/transformers/trainer.py", line 1283, in _rotate_checkpoints
    checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime)
  File "/<HOME>/.conda/envs/transformers/lib/python3.7/site-packages/transformers/trainer.py", line 1274, in _sorted_checkpoints
    checkpoints_sorted[best_model_index],
TypeError: 'str' object does not support item assignment
```
2020-10-20 07:50:47 -04:00
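A distilled sketch of the bug class (not Trainer's exact code): moving the best checkpoint to the end of the rotation list must assign into the *list*, not into the checkpoint path string:

```
checkpoints_sorted = ["ckpt-100", "ckpt-200", "ckpt-300"]
best_model_checkpoint = "ckpt-200"

best_model_index = checkpoints_sorted.index(best_model_checkpoint)
# Buggy shape: best_model_checkpoint[best_model_index] = ...
# -> TypeError: 'str' object does not support item assignment
# Fixed: swap inside the list so the best checkpoint is never rotated away.
checkpoints_sorted[best_model_index], checkpoints_sorted[-1] = (
    checkpoints_sorted[-1],
    checkpoints_sorted[best_model_index],
)
print(checkpoints_sorted)  # ['ckpt-100', 'ckpt-300', 'ckpt-200']
```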
6d4f8bd02a Add Flax dummy objects (#7918) 2020-10-20 07:45:48 -04:00
3e31e7f956 [testing] rename skip targets + docs (#7863)
* rename skip targets + docs

* fix quotes

* style

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* small improvements

* fix

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-20 04:39:13 -04:00
c912ba5f69 [EncoderDecoder] Fix Typo (#7915)
* fix encoder decoder models

* add .gitignore
2020-10-19 22:02:42 +02:00
55bcd0cb59 Raise error when using AMP on non-CUDA device (#7869)
* Raise error when using AMP on non-CUDA device

* make style

* make style
2020-10-19 15:59:30 -04:00
e3d2bee8d0 fix t5 training docstring (#7911) 2020-10-19 21:49:47 +02:00
df1ddcedf2 decoder_config used before intialisation (#7903)
Seeing an error when passing `decoder_config` as a parameter while initializing an encoder-decoder model from pretrained.
Fixed "UnboundLocalError: local variable 'decoder_config' referenced before assignment"
2020-10-19 19:48:49 +02:00
033f29c625 Allow Custom Dataset in RAG Retriever (#7763)
* add CustomHFIndex

* typo in config

* update tests

* add custom dataset example

* clean script

* update test data

* minor in test

* docs

* docs

* style

* fix imports

* allow to pass the indexed dataset directly

* update tests

* use multiset DPR

* address thom and patrick's comments

* style

* update dpr tokenizer

* add output_dir flag in use_own_knowledge_dataset.py

* allow custom datasets in examples/rag/finetune.py

* add test for custom dataset in distributed rag retriever
2020-10-19 19:42:45 +02:00
a09fe140c1 Trainer with Iterable Dataset (#7858)
* fix 5990

* accommodate iterable dataset without predefined length
* set it as 1 use case: provide max_steps, and NO num_epochs
* Is a merge of master and PR 5995

* fix trainer test under TF

* fix only for torch
* TF trainer untouched
* trainer tests are skipped when no torch

* address comments

* fix quality checks

* remove torch.dataset from test_trainer

* unnecessary inheritance
* RegressionDataset implements all needed methods __len__ and __getitem__

* fix quality checks

* restore RegressionDataset

* was wrongly under is_torch_available()
2020-10-19 11:57:39 -04:00
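A minimal sketch of the supported use case — a dataset with no predefined length trained for a fixed number of steps. The toy dataset and argument values are illustrative, not from the PR:

```
import torch
from torch.utils.data import IterableDataset
from transformers import TrainingArguments

class StreamingDataset(IterableDataset):
    """Yields examples indefinitely; __len__ is deliberately undefined."""
    def __iter__(self):
        while True:
            yield {"input_ids": torch.randint(0, 1000, (32,)), "labels": torch.tensor(1)}

# With no length available, epochs are undefined: give the Trainer max_steps instead.
args = TrainingArguments(output_dir="out", max_steps=1000)
```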
2422cda01b ProphetNet (#7157)
* add new model prophetnet

prophetnet modified

modify codes as suggested v1

add prophetnet test files

* still bugs, because of changed output formats of encoder and decoder

* move prophetnet into the latest version

* clean integration tests

* clean tokenizers

* add xlm config to init

* correct typo in init

* further refactoring

* continue refactor

* save parallel

* add decoder_attention_mask

* fix use_cache vs. past_key_values

* fix common tests

* change decoder output logits

* fix xlm tests

* make common tests pass

* change model architecture

* add tokenizer tests

* finalize model structure

* no weight mapping

* correct n-gram stream attention mask as discussed with qweizhen

* remove unused import

* fix index.rst

* fix tests

* delete unnecessary code

* add fast integration test

* rename weights

* final weight remapping

* save intermediate

* Descriptions for Prophetnet Config File

* finish all models

* finish new model outputs

* delete unnecessary files

* refactor encoder layer

* add dummy docs

* code quality

* fix tests

* add model pages to doctree

* further refactor

* more refactor, more tests

* finish code refactor and tests

* remove unnecessary files

* further clean up

* add docstring template

* finish tokenizer doc

* finish prophetnet

* fix copies

* fix typos

* fix tf tests

* fix fp16

* fix tf test 2nd try

* fix code quality

* add test for each model

* merge new tests to branch

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update model_cards/microsoft/prophetnet-large-uncased-cnndm/README.md

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/modeling_prophetnet.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update utils/check_repo.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* apply sams and sylvains comments

* make style

* remove unnecessary code

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update README.md

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/configuration_prophetnet.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* implement lysandres comments

* correct docs

* fix isort

* fix tokenizers

* fix copies

Co-authored-by: weizhen <weizhen@mail.ustc.edu.cn>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-19 17:36:09 +02:00
8f8f8d99fc Integrate Bert-like model on Flax runtime. (#3722)
* WIP flax bert

* Initial commit Bert Jax/Flax implementation.

* Embeddings working and equivalent to PyTorch.

* Move embeddings in its own module BertEmbeddings

* Added jax.jit annotation on forward call

* BertEncoder on par with PyTorch ! :D

* Add BertPooler on par with PyTorch !!

* Working Jax+Flax implementation of BertModel with < 1e-5 differences on the last layer.

* Fix pooled output to take only the first token of the sequence.

* Refactoring to use BertConfig from transformers.

* Renamed FXBertModel to FlaxBertModel

* Model is now initialized in FlaxBertModel constructor and reused.

* WIP JaxPreTrainedModel

* Cleaning up the code of FlaxBertModel

* Added ability to load Flax model saved through save_pretrained()

* Added ability to convert Pytorch Bert model to FlaxBert

* FlaxBert can now load every Pytorch Bert model with on-the-fly conversion

* Fix hardcoded shape values in conversion scripts.

* Improve the way we handle LayerNorm conversion from PyTorch to Flax.

* Added positional embeddings as parameter of BertModel with default to np.arange.

* Let's roll FlaxRoberta !

* Fix missing position_ids parameters on predict for Bert

* Flax backend now supports batched inputs

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Make it possible to load a msgpacked model when converting from pytorch, as a last resort.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Moved save_pretrained to Jax base class along with more constructor parameters.

* Use specialized, model-dependent conversion function.

* Expose `is_flax_available` in file_utils.

* Added unittest for Flax models.

* Added run_tests_flax to the CI.

* Introduce FlaxAutoModel

* Added more unittests

* Flax model reference the _MODEL_ARCHIVE_MAP from PyTorch model.

* Addressing review comments.

* Expose seed in both Bert and Roberta

* Fix typo suggested by @stefan-it

Co-Authored-By: Stefan Schweter <stefan@schweter.it>

* Attempt to make style

* Attempt to make style in tests too

* Added jax & jaxlib to the flax optional dependencies.

* Attempt to fix flake8 warnings ...

* Redo black again and again

* When black and flake8 fight each other for a space ... 💥 💥 💥

* Try removing trailing comma to make both black and flake happy!

* Fix invalid is_<framework>_available call, thanks @LysandreJik 🎉

* Fix another invalid import in flax_roberta test

* Bump and pin flax release to 0.1.0.

* Make flake8 happy, remove unused jax import

* Change the type of the catch for msgpack.

* Remove unused import.

* Put seed as optional constructor parameter.

* trigger ci again

* Fix too much parameters in BertAttention.

* Formatting.

* Simplify Flax unittests to avoid machine crashes.

* Fix invalid number of arguments when raising issue for an unknown model.

* Address @bastings comment in PR, moving jax.jit decorated outside of __call__

* Fix incorrect path to require_flax/require_pytorch functions.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Attempt to make style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correct rebasing of circle-ci dependencies

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix import sorting.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix unused imports.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Again import sorting...

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Installing missing nlp dependency for flax unittests.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix loading of model for Flax implementations.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* jit the inner function call to make JAX-compatible

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Format !

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Flake one more time 🎶

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Rewrites BERT in Flax to the new Linen API (#7211)

* Rewrite Flax HuggingFace PR to Linen

* Some fixes

* Fix tests

* Fix CI with change of name of nlp (#7054)

* nlp -> datasets

* More nlp -> datasets

* Woopsie

* More nlp -> datasets

* One last

* Expose `is_flax_available` in file_utils.

* Added run_tests_flax to the CI.

* Attempt to make style

* trigger ci again

* Fix import sorting.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Revert "Rewrites BERT in Flax to the new Linen API (#7211)"

This reverts commit 23703a5eb3364e26a1cbc3ee34b4710d86a674b0.

* Remove jnp.lax references

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Reintroduce Linen changes ...

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use jax native's gelu function.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Renaming BertModel to BertModule to highlight the fact this is the Flax Module object.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Rewrite FlaxAutoModel test to not rely on pretrained_model_archive_map

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove unused variable in BertModule.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove unused variable in BertModule again

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Attempt to have is_flax_available working again.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Introduce JAX TensorType

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Improve ImportError message when trying to convert to various TensorType format.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Makes Flax model jittable.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Ensure flax models are jittable in unittests.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove unused imports.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Ensure jax imports are guarded behind is_flax_available.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style again

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style again again

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style again again again

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Update src/transformers/file_utils.py

Co-authored-by: Marc van Zee <marcvanzee@gmail.com>

* Bump flax to it's latest version

Co-authored-by: Marc van Zee <marcvanzee@gmail.com>

* Bump jax version to at least 0.2.0

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Update the unittest to use TensorType.JAX

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* isort import in tests.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Match new flax parameters name "params"

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove unused imports.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Add flax models to transformers __init__

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Attempt to address all CI related comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correct circle.yml indent.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correct circle.yml indent (2)

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove coverage from flax tests

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing many naming suggestions from comments

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Simplify for loop logic to iterate over layers in FlaxBertLayerCollection

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* use f-string syntax for formatting logs.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use config property from FlaxPreTrainedModel.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* use "cls_token" instead of "first_token" variable name.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* use "hidden_state" instead of "h" variable name.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correct class reference in docstring to link to Flax related modules.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added HF + Google Flax team copyright.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make Roberta independent from Bert

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Move activation functions to flax_utils.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Move activation functions to flax_utils for bert.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added docstring for BERT

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Update import for Bert and Roberta tokenizers

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix-copies

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Correct FlaxRobertaLayer to match PyTorch.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use the same store_artifact for flax unittest

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Style.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make sure gradient are disabled only locally for flax unittest using torch equivalence.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use relative imports

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

Co-authored-by: Stefan Schweter <stefan@schweter.it>
Co-authored-by: Marc van Zee <marcvanzee@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-19 09:55:41 -04:00
0193c8290d [RAG] Propagating of n_docs as parameter to all RagModel's related functions (#7891)
* Propagating n_docs as a parameter (defaulting to self.config.n_docs) to all RagModel-related functions

* Making n_docs parameter's default value to None in marginalize function

* Fixing code quality issues

* Handle the special case when the generator is an instance of T5PreTrainedModel, which does not have n_docs as a parameter

* T5PreTrainedModel does not have n_docs as a parameter

* Addressing review comment

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Correcting comment by addressing review comment

* Adding assert statement verifying that n_docs is correctly set. n_docs should be the same for both retriever and generator.

* Fixing flake8 reported issue

* Correcting test datasets for rag

* Using doc_scores instead of context_input_ids in the assert, as in RagSequenceForGeneration context_input_ids can be null

* doc_scores' second dimension holds the number of retrieved docs

* Changing assert comment

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-10-19 15:15:52 +02:00
7e6b6fbec9 style: fix typo in the README (#7882) 2020-10-19 08:43:25 -04:00
805a202e1a [CIs] report slow tests add --durations=0 to some pytest jobs (#7884)
* add --durations=50 to some pytest runs

* report all tests
2020-10-19 08:23:14 -04:00
4eb61f8e88 remove USE_CUDA (#7861) 2020-10-19 07:08:34 -04:00
ea1507fb45 Julibert model card (#7868)
* Julibert model card

* Fix text
2020-10-19 06:50:52 -04:00
7c44c864a5 style: fix typo (#7883) 2020-10-19 06:14:53 -04:00
776e82d2be Add support to provide initial tokens to decoder of encoder-decoder type models (#7577)
* Add support to provide initial tokens for decoding

* Add docstring

* improve code quality

* code reformat

* code reformat

* minor change

* remove appending decoder start token

Co-authored-by: Ayush Jain <a.jain@sprinklr.com>
2020-10-19 08:56:08 +02:00
406a49dfe4 Fix small type hinting error (#7820)
* Fix small type hinting error

* Update tokenization_utils_base.py

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-19 08:14:29 +02:00
b86a71ea38 [tests] fix slow bart cnn test, faster marian tests (#7888) 2020-10-18 20:18:08 -04:00
ba8c4d0ac0 [Dependencies|tokenizers] Make both SentencePiece and Tokenizers optional dependencies (#7659)
* splitting fast and slow tokenizers [WIP]

* [WIP] splitting sentencepiece and tokenizers dependencies

* update dummy objects

* add name_or_path to models and tokenizers

* prefix added to file names

* prefix

* styling + quality

* spliting all the tokenizer files - sorting sentencepiece based ones

* update tokenizer version up to 0.9.0

* remove hard dependency on sentencepiece 🎉

* and removed hard dependency on tokenizers 🎉

* update conversion script

* update missing models

* fixing tests

* move test_tokenization_fast to main tokenization tests - fix bugs

* bump up tokenizers

* fix bert_generation

* update and fix several tokenizers

* keep sentencepiece in deps for now

* fix funnel and deberta tests

* fix fsmt

* fix marian tests

* fix layoutlm

* fix squeezebert and gpt2

* fix T5 tokenization

* fix xlnet tests

* style

* fix mbart

* bump up tokenizers to 0.9.2

* fix model tests

* fix tf models

* fix seq2seq examples

* fix tests without sentencepiece

* fix slow => fast  conversion without sentencepiece

* update auto and bert generation tests

* fix mbart tests

* fix auto and common test without tokenizers

* fix tests without tokenizers

* clean up tests lighten up when tokenizers + sentencepiece are both off

* style quality and tests fixing

* add sentencepiece to doc/examples reqs

* leave sentencepiece on for now

* style quality split hebert and fix pegasus

* WIP Herbert fast

* add sample_text_no_unicode and fix hebert tokenization

* skip FSMT example test for now

* fix style

* fix fsmt in example tests

* update following Lysandre and Sylvain's comments

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/testing_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-18 20:51:24 +02:00
c65863ce53 Remove duplicated mish activation function (#7856)
* Remove duplicated mish activation function

* Update activations.py
2020-10-17 17:31:53 -04:00
f5c45a19e6 Fix Rag example docstring (#7872)
* fix rag examples

* fix token generate example
2020-10-17 22:46:47 +02:00
9f7b2b2432 [s2s testing] turn all to unittests, use auto-delete temp dirs (#7859) 2020-10-17 14:33:21 -04:00
dc552b9b70 Fix typo in sequence model card 2020-10-16 16:05:06 +02:00
1652ddad35 [seq2seq testing] improve readability (#7845) 2020-10-16 09:05:29 -04:00
466115b279 Fix missing reference titles in retrieval evaluation of RAG (#7817) 2020-10-16 10:15:49 +02:00
464b53f5e4 [testing] disable FutureWarning in examples tests (#7842)
* [testing] disable FutureWarning in examples tests

Same as tests/conftest.py: we can't resolve those warnings, so turn the noise off.

* fix
2020-10-16 03:35:39 -04:00
eb186bc14e Small fixes to HP search (#7839) 2020-10-16 03:23:44 -04:00
d8ca57d2ce fix/hide warnings (#7837)
2020-10-16 03:19:51 -04:00
c6e865ac2b Remove masked_lm_labels from returned dictionary (#7818) 2020-10-16 03:12:10 -04:00
96e47d9229 [cleanup] assign todos, faster bart-cnn test (#7835)
* 2 beam output

* unassign/remove TODOs

* remove one more
2020-10-16 03:11:18 -04:00
7b13bd01df Herbert polish model (#7798)
* HerBERT transformer model for Polish language understanding.

* HerbertTokenizerFast generated with HerbertConverter

* Herbert base and large model cards

* Herbert model cards with tags

* Herbert tensorflow models

* Herbert model tests based on Bert test suite

* src/transformers/tokenization_herbert.py edited online with Bitbucket

* src/transformers/tokenization_herbert.py edited online with Bitbucket

* docs/source/model_doc/herbert.rst edited online with Bitbucket

* Herbert tokenizer tests and bug fixes

* src/transformers/configuration_herbert.py edited online with Bitbucket

* Copyrights and tests for TFHerbertModel

* model_cards/allegro/herbert-base-cased/README.md edited online with Bitbucket

* model_cards/allegro/herbert-large-cased/README.md edited online with Bitbucket

* Bug fixes after testing

* Reformat modified_only_fixup

* Proper order of configuration

* Herbert proper documentation formatting

* Formatting with make modified_only_fixup

* Dummies fixed

* Adding missing models to documentation

* Removing HerBERT model as it is a simple extension of BERT

* Update model_cards/allegro/herbert-base-cased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Update model_cards/allegro/herbert-large-cased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* HerbertTokenizer deprecated configuration removed

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-16 03:06:51 -04:00
99898dcd27 [Pipelines] Fix links to model lists (#7826) 2020-10-16 02:57:02 -04:00
52c9e84285 Fix DeBERTa integration tests (#7729) 2020-10-16 02:49:13 -04:00
2255c2c7a0 [seq2seq] get_git_info fails gracefully (#7843)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-10-16 00:22:43 -04:00
dfa4c26bc0 Typo and fix the input of labels to cross_entropy (#7841)
The current version caused some errors. The changes fixed it for me. Hope this is helpful!
2020-10-15 19:36:31 -04:00
a5a8eeb772 fix DeprecationWarning (#7834)
in `tests/test_utils_check_copies.py` I was getting intermittently:
```
utils/check_copies.py:52
  /mnt/nvme1/code/transformers-comet/utils/check_copies.py:52: DeprecationWarning: invalid escape sequence \s
    while line_index < len(lines) and re.search(f"^{indent}(class|def)\s+{name}", lines[line_index]) is None:
```
So this should fix it.
2020-10-15 16:21:09 -04:00
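A sketch of the standard fix for this warning — `\s` inside a plain (non-raw) string is an invalid escape sequence, so the pattern becomes a raw f-string:

```
import re

indent, name = "    ", "forward"
# Before: f"^{indent}(class|def)\s+{name}"   <- DeprecationWarning: invalid escape sequence \s
pattern = rf"^{indent}(class|def)\s+{name}"  # raw f-string: no warning
print(bool(re.search(pattern, "    def forward(self):")))  # True
```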
9c71cca316 model card for bert-base-NER (#7799)
* model card for bert-base-NER

* add meta data up top

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-15 21:55:00 +02:00
4dbca50022 fix wandb/comet problems (#7830)
* fix wandb/comet problems

* simplify

* Update src/transformers/integrations.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-15 15:23:24 -04:00
e7aa64838c [model_cards] facebook/bart-large-mnli: register ZSC for the inference API
cc @Narsil @mfuntowicz @joeddav
2020-10-15 19:02:10 +02:00
2ce3ddab2d Small fixes to NotebookProgressCallback (#7813) 2020-10-15 10:30:34 -04:00
6f45dd2fac [model_cards] Fix yaml for Facebook/wmt19-*
see d99ed7ad618037ae878f0758157ed0764bd7f935
2020-10-15 16:14:08 +02:00
d99ed7ad61 [model_cards] Facebook: add thumbnail 2020-10-15 12:53:29 +02:00
2485b8b0ac Set XLA example time to 500s 2020-10-15 12:34:29 +02:00
2dba7d5702 Notebook catch all errors 2020-10-15 12:21:32 +02:00
9ade8e7499 Upgrading TFAutoModelWithLMHead to (#7730)
- TFAutoModelForCausalLM
- TFAutoModelForMaskedLM
- TFAutoModelForSeq2SeqLM

as per the deprecation warning. No tests, as it simply removes current
warnings from tests.
2020-10-15 05:26:08 -04:00
62b5622e6b Add specific notebook ProgressCalback (#7793) 2020-10-15 05:05:08 -04:00
0911b6bd86 Improving Pipelines by defaulting to framework='tf' when pytorch seems unavailable. (#7728)
* Improving Pipelines by defaulting to framework='tf' when pytorch seems unavailable.

* Actually changing the default resolution order to account for model
defaults

Adding new tests for each pipeline to check that pipeline(task) works
without manually specifying the framework.
2020-10-15 09:42:07 +02:00
3a134f7c67 Fix TF savedmodel in Roberta (#7795)
* Remove wrong parameter.

* Same in Longformer
2020-10-14 23:48:50 +02:00
3032de9369 Model Card (#7752)
* Create README.md

* Update model_cards/sentence-transformers/LaBSE/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-14 13:30:58 -04:00
3fdbeba83c [model_cards] sarahlintang/IndoBERT (#7748)
* Create README.md

* Update model_cards/sarahlintang/IndoBERT/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-14 13:10:31 -04:00
ba654270b3 [model_cards] rename to correct model name 2020-10-14 19:02:48 +02:00
08978487e7 Create README.md (#7722) 2020-10-14 12:56:12 -04:00
3557509127 added evaluation results for classification task (#7790) 2020-10-14 12:50:43 -04:00
bb9559a7f9 Don't use store_xxx on optional bools (#7786)
* Don't use `store_xxx` on optional bools

* Refine test

* Refine test
2020-10-14 12:05:02 -04:00
a1d1b332d0 Add predict step accumulation (#7767)
* Add eval_accumulation_step and clean distributed eval

* Add TPU test

* Add TPU stuff

* Fix arg name

* Fix Seq2SeqTrainer

* Fix total_size

* Update src/transformers/trainer_pt_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Doc and add test to TPU

* Add unit test

* Adapt name

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-14 11:41:45 -04:00
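Usage sketch for the new argument — with `eval_accumulation_steps` set, prediction outputs are moved off the device every N steps instead of accumulating there for the whole evaluation:

```
from transformers import TrainingArguments

# Offload predictions to the CPU every 8 eval steps to keep device memory bounded.
args = TrainingArguments(output_dir="out", eval_accumulation_steps=8)
```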
8feb0cc967 fix examples/rag imports, tests (#7712) 2020-10-14 11:35:00 -04:00
890e790e16 [model_cards] TinyBERT (HUAWEI Noah's Ark Lab) (#7775) 2020-10-14 09:31:01 -04:00
121dd4332b Add batch inferencing support for GPT2LMHeadModel (#7552)
* Add support for gpt2 batch inferencing

* add test

* remove typo

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-10-14 13:40:24 +02:00
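A sketch of the batched generation this enables, with the usual setup such a feature requires — left padding and the EOS token doubling as the pad token:

```
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.padding_side = "left"            # causal LMs must be padded on the left
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token of its own

model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)

batch = tokenizer(["Hello, my name is", "The weather today"], return_tensors="pt", padding=True)
out = model.generate(batch["input_ids"], attention_mask=batch["attention_mask"], max_length=20)
print(tokenizer.batch_decode(out, skip_special_tokens=True))
```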
0c64b18840 Fix bert position ids in DPR convert script (#7776)
* fix bert position ids in DPR convert script

* style
2020-10-14 05:30:02 -04:00
7968051aba Fix typo 2020-10-13 17:30:46 -04:00
2977bd528f Faster pegasus tokenization test with reduced data size (#7762) 2020-10-13 16:22:29 -04:00
2d6e2ad4fa Adding optional trial argument to model_init (#7759)
* Adding optional trial argument to model_init

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-13 17:07:02 +02:00
7e73c12805 fixed lots of typos. (#7758) 2020-10-13 10:00:20 -04:00
8cb4ecca25 Avoid unnecessary DDP synchronization when gradient_accumulation_steps > 1 (#7742)
* use DDP no_sync when possible

* fix is_nlp_available addition mistake

* reformat trainer.py

* reformat trainer.py

* drop support for pytorch < 1.2

* return support for pytorch < 1.2
2020-10-13 09:46:44 -04:00
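A minimal sketch of the optimization, assuming a DDP-wrapped model: gradient all-reduce is skipped via `no_sync()` on accumulation steps and happens only on the step that updates the weights:

```
import contextlib

def accumulate_backward(model, loss, step, accumulation_steps):
    """Backward pass that syncs gradients only on the final accumulation step."""
    is_update_step = (step + 1) % accumulation_steps == 0
    skip_sync = hasattr(model, "no_sync") and not is_update_step
    with model.no_sync() if skip_sync else contextlib.nullcontext():
        loss.backward()
```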
52f7d74398 Do not softmax when num_labels==1 (#7726)
* Do not softmax when num_labels==1

* Update src/transformers/pipelines.py

Co-authored-by: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>

Co-authored-by: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
2020-10-13 09:42:27 -04:00
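A sketch of the post-processing change — with a single regression-style output there is nothing to normalize, so the raw score is returned instead of a softmax (the pipeline's actual code differs):

```
import numpy as np

def postprocess(logits, num_labels):
    if num_labels == 1:
        return logits  # regression head: softmax over one value is always 1.0
    exp = np.exp(logits - logits.max(-1, keepdims=True))
    return exp / exp.sum(-1, keepdims=True)

print(postprocess(np.array([2.5]), 1))       # [2.5] -- raw score preserved
print(postprocess(np.array([1.0, 3.0]), 2))  # softmaxed probabilities
```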
82b09a8481 [Rag] Fix loading of pretrained Rag Tokenizer (#7756)
* fix rag

* Update tokenizer save_pretrained

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-10-13 14:34:22 +02:00
2d4e928d97 Update PULL_REQUEST_TEMPLATE.md
Putting my name on a couple more issues to directly redirect them to me
2020-10-13 12:18:31 +02:00
dcba9ee03b Gpt1 for sequence classification (#7683)
* Add Documentation for GPT-1 Classification

* Add GPT-1 with Classification head

* Add tests for GPT-1 Classification

* Add GPT-1 For Classification to auto models

* Remove authorized missing keys, change checkpoint to openai-gpt
2020-10-13 05:06:15 -04:00
f34b4cd1bd ElectraTokenizerFast (#7754) 2020-10-13 04:50:41 -04:00
9c2b2db2cd [marian] Automate Tatoeba-Challenge conversion (#7709) 2020-10-12 12:24:25 -04:00
aacac8f708 Add license info to nlptown/bert-base-multilingual-uncased-sentiment (#7738) 2020-10-12 11:56:10 -04:00
1f1d950b28 Fix #7331 (#7732) 2020-10-12 09:10:52 -04:00
d9ffb87efb Fix tf text class (#7724)
* Fix test

* fix generic text classification

* fix test

* Fix tests
2020-10-12 08:45:15 -04:00
d6175a4268 Fix code quality 2020-10-12 08:22:27 -04:00
1d5ea34f6a Fix trainer callback (#7720)
Fix a bug that happens when subclassing Trainer and
overwriting evaluate() without calling prediction_loop()
2020-10-12 07:45:12 -04:00
f176e70723 The input training data files (multiple files in glob format). (#7717)
Very often, splitting large files into smaller ones can prevent the tokenizer from going out of memory in environments like Colab that do not have swap memory
2020-10-12 07:44:02 -04:00
34fcfb44e3 Update tokenization_utils_base.py (#7696)
Minor spelling corrections in docstrings. "information" is uncountable in English and has no plural.
2020-10-12 06:09:20 -04:00
2f34bcf3e7 check for tpu availability in save_pretrained (#7699)
Added is_torch_tpu_available() to the condition
for saving a model as an XLA model. The "xla_device"
property of the config can also be True on a non-XLA
device, when loading a checkpoint that was trained
on XLA before.

Resolves #7695
2020-10-12 04:10:17 -04:00
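A sketch of the guarded save described above; it assumes `is_torch_tpu_available` lives in `transformers.file_utils` (its home in this era) and that `torch_xla` provides `xm.save`:
```
import torch
from transformers.file_utils import is_torch_tpu_available

def save_state_dict(model, path):
    # config.xla_device can be True merely because the checkpoint was trained
    # on TPU; only route through the XLA saver when a TPU is actually present.
    if getattr(model.config, "xla_device", False) and is_torch_tpu_available():
        import torch_xla.core.xla_model as xm
        xm.save(model.state_dict(), path)
    else:
        torch.save(model.state_dict(), path)
```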
13c1857718 Fix typo in all model docs (#7714) 2020-10-12 04:06:59 -04:00
83086858f8 fixed typo in warning line 207. (#7718)
replace 'men_len' with 'mem_len' to match parameter name
2020-10-12 03:58:58 -04:00
03ec02a667 Corrected typo: maked → masked (#7703) 2020-10-11 16:45:00 -04:00
827c519494 [examples] bump pl=0.9.0 (#7053) 2020-10-11 16:39:38 -04:00
ba4bbd92bc Fix docstring in AutoModel class (#7694) 2020-10-10 21:08:08 -04:00
26d5475d4b Added license information for default and distilbert models (#7688) 2020-10-10 03:55:11 -04:00
c6e18de9f8 Fix flaky test in test_trainer (#7689) 2020-10-09 20:01:15 -04:00
2c9e83f7b8 Fix title level in Blenderbot doc (#7687) 2020-10-09 19:24:10 -04:00
9618cd6964 Import integration libraries first (#7650)
* Import integration libraries first

* isort and black happiness

* flake8 happiness

* Add a test

* Black reformat

* Ignore import order in tests

* A heavy-handed method of disabling comet for tests

* Remove comet_ml tests

* Run black on setup.py
2020-10-09 12:13:22 -04:00
4dcc424de3 Complete release instruction 2020-10-09 12:12:03 -04:00
a3cea6a8cc Better links for models in README and doc index (#7680) 2020-10-09 11:17:16 -04:00
0af53b1ef9 Delete extra test file (#7681) 2020-10-09 11:16:35 -04:00
b0f05e0c4c [pegasus] Faster tokenizer tests (#7672) 2020-10-09 11:10:32 -04:00
bc00b37a0d Revert "Better model links in the README and index"
This reverts commit 76e05518bb11e29c8532d6ac529e72ce9a105495.
2020-10-09 10:56:13 -04:00
76e05518bb Better model links in the README and index 2020-10-09 10:54:40 -04:00
9ad830596d Fix dataset cardinality (#7678)
* Fix test

* Fix cardinality issue

* Fix test
2020-10-09 10:38:25 -04:00
a1ac082879 add license to xlm-roberta-large-xnli card 2020-10-09 09:16:06 -04:00
21ed3a6b99 Reintroduce clean_text on BertTokenizer call which was removed by mistake in #4723 (#5749)
* Reintroduce clean_text call which was removed by mistake in #4723

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added unittest for clean_text parameter on Bert tokenizer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Better unittest name.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Adapt unittest to use untrained tokenizer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Code quality + update test

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-10-09 08:07:28 -04:00
5668fdb09e Update XLM-RoBERTa details (#7669) 2020-10-09 05:16:58 -04:00
0578a91300 fix nn.DataParallel compatibility with PyTorch 1.5 (#7671)
The same type of errors as in https://github.com/huggingface/transformers/pull/4300
2020-10-09 05:15:08 -04:00
297233fa92 [s2s] Switch README urls to cdn (#7670) 2020-10-08 21:22:22 -04:00
a1ecc90d6b [pseudo] Switch URLS to CDN (#7661) 2020-10-08 14:12:39 -04:00
06a973fd2a [s2s] configure lr_scheduler from command line (#7641) 2020-10-08 13:06:35 -04:00
4a00613c24 Fix RobertaForCausalLM docs (#7642)
* Fix RobertaForCausalLM docs

* Apply review suggestion

Co-authored-by: sgugger <sylvain.gugger@gmail.com>

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
2020-10-08 08:36:00 -04:00
55cb2ee62e Green tests: update torch-hub test dependencies (add protobuf and pin tokenizer 0.9.0-RC2) (#7658)
* pin torch-hub test

* add protobuf dep
2020-10-08 13:21:15 +02:00
9aeacb58ba Adding Fast tokenizers for SentencePiece based tokenizers - Breaking: remove Transfo-XL fast tokenizer (#7141)
* [WIP] SP tokenizers

* fixing tests for T5

* WIP tokenizers

* serialization

* update T5

* WIP T5 tokenization

* slow to fast conversion script

* Refactoring to move tokenizer implementations inside transformers

* Adding gpt - refactoring - quality

* WIP adding several tokenizers to the fast world

* WIP Roberta - moving implementations

* update to dev4; switch file loading to in-memory loading

* Updating and fixing

* advancing on the tokenizers - updating do_lower_case

* style and quality

* moving forward with tokenizers conversion and tests

* MBart, T5

* dumping the fast version of transformer XL

* Adding to autotokenizers + style/quality

* update init and space_between_special_tokens

* style and quality

* bump up tokenizers version

* add protobuf

* fix pickle Bert JP with Mecab

* fix newly added tokenizers

* style and quality

* fix bert japanese

* fix funnel

* limit tokenizer warning to one occurrence

* clean up file

* fix new tokenizers

* fast tokenizers deep tests

* WIP adding all the special fast tests on the new fast tokenizers

* quick fix

* adding more fast tokenizers in the fast tests

* all tokenizers in fast version tested

* Adding BertGenerationFast

* bump up setup.py for CI

* remove BertGenerationFast (too early)

* bump up tokenizers version

* Clean old docstrings

* Typo

* Update following Lysandre comments

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2020-10-08 11:32:16 +02:00
4d04120c6d Replaced torch.load with pickle.load for loading the pretrained vocab of the TransformerXL tokenizer (#6935)
* Replaced torch.load with pickle.load for loading the pretrained vocab of TransformerXL

* Replaced torch.save with pickle.dump when saving the vocabulary

* updating transformer-xl

* uploaded on S3 - compatibility

* fix tests

* style

* Address review comments

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-10-08 10:16:10 +02:00
aba4e22944 [pseudolabels] cleanup markdown table (#7653) 2020-10-07 23:04:18 -04:00
e3e6517355 Fix 3 failing slow bart/blender tests (#7652) 2020-10-07 22:05:03 -04:00
960faaaf28 Blenderbot (#7418)
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-07 19:09:23 -04:00
aee7967fc4 Added model cards for Tagalog BERT models (#7603) 2020-10-07 16:49:20 -04:00
b1c06140f4 Create README.md for IsRoBERTa language model (#7640)
* Create README.md

* Update README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-07 16:46:03 -04:00
e10d389561 [Model card] SinhalaBERTo model. (#7558)
* [Model card] SinhalaBERTo model.

This is the model card for keshan/SinhalaBERTo model.

* Update model_cards/keshan/SinhalaBERTo/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-07 16:40:52 -04:00
167bce56f2 [model_card] bert-base-5lang-cased (#7573)
Co-authored-by: Amin <amin.geotrend@gmail.com>
2020-10-07 16:38:14 -04:00
923dd4e5ef Create README.md (#7581) 2020-10-07 16:37:40 -04:00
85ead0fec4 Update README.md (#7590) 2020-10-07 16:37:10 -04:00
c6b9c72eac Update README.md (#7629)
Minor changes: Add arxiv link + Layout improvement + fix typos
2020-10-07 16:36:08 -04:00
048b4bd2c6 Create Model Card For "abhilash1910/french-roberta" Model (#7544) 2020-10-07 16:35:28 -04:00
c2e0d8ac52 [model_card] nikokons/gpt2-greek
by @nikkon3
2020-10-07 16:28:47 -04:00
e2bb9abb6a [s2s] release pseudolabel links and instructions (#7639) 2020-10-07 11:20:44 -04:00
08ba4b4902 Trainer callbacks (#7596)
* Initial callback proposal

* Finish various callbacks

* Post-rebase conflicts

* Fix tests

* Don't use something that's not set

* Documentation

* Remove unwanted print.

* Document all models can work

* Add tests + small fixes

* Update docs/source/internal/trainer_utils.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comments

* Fix TF tests

* Real fix this time

* This one should work

* Fix typo

* Really fix typo

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-07 10:50:21 -04:00
8fa0c956b3 Add GPT2 to sequence classification auto model (#7630) 2020-10-07 05:20:05 -04:00
e084089eb9 Fix tokenizer UnboundLocalError when padding is set to PaddingStrategy.MAX_LENGTH (#7610)
* Fix UnboundLocalError when PaddingStrategy is MAX_LENGTH

* Fix UnboundLocalError for TruncationStrategy
2020-10-06 18:16:00 -04:00
adfe6ace88 Fix wrong reference name/filename in docstring (#7616)
Resolves: #7613
2020-10-06 18:02:29 -04:00
f0d20ad328 Fix-copies 2020-10-06 23:44:03 +02:00
5982431814 Add GPT2ForSequenceClassification based on DialogRPT (#7501)
* Add GPT2ForSequenceClassification based on DialogRPT

* Better documentation

* Code quality
2020-10-06 17:31:21 -04:00
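A usage sketch for the new head; the DialogRPT checkpoint name follows the PR's premise, and the tuple indexing matches the era's default `return_dict=False`:
```
from transformers import GPT2ForSequenceClassification, GPT2Tokenizer

model = GPT2ForSequenceClassification.from_pretrained("microsoft/DialogRPT-updown")
tokenizer = GPT2Tokenizer.from_pretrained("microsoft/DialogRPT-updown")

inputs = tokenizer("I love this!", return_tensors="pt")
outputs = model(**inputs)
logits = outputs[0]  # (batch, num_labels); DialogRPT predicts a single score
```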
500be01c5d [s2s] save first batch to json for debugging purposes (#6810) 2020-10-06 16:11:56 -04:00
2b574e7c60 [bart] fix config.classif_dropout (#7593) 2020-10-06 11:33:51 -04:00
aa6c3c14b4 typo fix (#7611)
It should be T5-3B not T5-3M.
2020-10-06 15:32:52 +02:00
98fb718577 Docker GPU Images: Add NVIDIA/apex to the cuda images with pytorch (#7598)
- Use cuda:10.2 image instead of 10.1 (to address version mismatch
  warning with pytorch)
- Use devel version that is built on the runtime and includes headers
  and development tools (was otherwise failing to build apex)
2020-10-06 15:23:32 +02:00
4d541f516f fix return dicitonary labels from masked_lm_labels to labels (#7595) 2020-10-06 09:12:04 -04:00
8d2c248df7 Update README.md (#7612) 2020-10-06 08:46:55 -04:00
1c80b2c604 Create README.md (LEGAL-BERT Model card) (#7607)
* Create README.md

Model description for all LEGAL-BERT models, published as part of "LEGAL-BERT: The Muppets straight out of Law School" (Chalkidis et al., in Findings of EMNLP 2020)

* Update model_cards/nlpaueb/legal-bert-base-uncased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-06 08:46:17 -04:00
eda27f4494 [TF generation] Fix typo (#7582)
* Fixing top_k and min_length assertions, and a typo fix

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-10-06 12:47:16 +02:00
0257992e4a Fix squeezebert docs (#7587)
* Configuration

* Modeling

* Tokenization

* Obliterate the trailing spaces

* From underlines to long underlines
2020-10-06 06:22:04 -04:00
66c72082d0 Add ProtT5-XL-BFD model card (#7606)
* Add ProtT5-XL-BFD model card

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-10-06 12:19:21 +02:00
b21a30bdd8 [makefile] check only .py files (#7588)
* check only .py files

* better choice of words
2020-10-06 05:25:21 -04:00
d5d2744aa7 Support T5 Distillation w/hidden state supervision (#7599) 2020-10-05 21:31:48 -04:00
818c294fdd The toggle actually sticks (#7586) 2020-10-05 11:23:57 -04:00
03835af700 Documentation fixes (#7585) 2020-10-05 11:01:03 -04:00
9cf7b23b9b Custom TF weights loading (#7422)
* First try

* Fix TF utils

* Handle authorized unexpected keys when loading weights

* Add several more authorized unexpected keys

* Apply style

* Fix test

* Address Patrick's comments.

* Update src/transformers/modeling_tf_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply style

* Make return_dict the default behavior and display a warning message

* Revert

* Replace wrong keyword

* Revert code

* Add forgot key

* Fix bug in loading PT models from a TF one.

* Fix sort

* Add a test for custom load weights in BERT

* Apply style

* Remove unused import

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-05 09:58:45 -04:00
d3adb985d1 Expand test to locate flakiness (#7580) 2020-10-05 09:45:47 -04:00
b2b7fc7814 Check and update model list in index.rst automatically (#7527)
* Check and update model list in index.rst automatically

* Check and update model list in index.rst automatically

* Adapt template
2020-10-05 09:40:45 -04:00
ca05c2a47d Fix post_init of some TrainingArguments (#7525) 2020-10-05 09:19:16 -04:00
3bd3d8b549 Add new dummy PT objects 2020-10-05 09:13:47 -04:00
28d183c90c Allow soft dependencies in the namespace with ImportErrors at use (#7537)
* PoC on RAG

* Format class name/obj name

* Better name in message

* PoC on one TF model

* Add PyTorch and TF dummy objects + script

* Treat scikit-learn

* Bad copy pastes

* Typo
2020-10-05 09:12:04 -04:00
1a00f46c74 Update Code example according to deprecation of AutoModeWithLMHead (#7555)
'The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and `AutoModelForSeq2SeqLM` for encoder-decoder models.'
I don't know how to change the 'How to use this model directly from the 🤗/transformers library:' part, since it is not part of the model paper
2020-10-05 08:21:21 -04:00
0d79de7322 docs(pretrained_models): fix num parameters (#7575)
* docs(pretrained_models): fix num parameters

* fix(pretrained_models): correct typo

Co-authored-by: Amin <amin.geotrend@gmail.com>
2020-10-05 07:50:56 -04:00
ba5ea66e30 Fix tokenization in SQuAD for RoBERTa, Longformer, BART (#7387)
* fix squad tokenization for roberta & co

* change to pure type based check

* sort imports
2020-10-05 06:34:13 -04:00
0270256b27 Allow nested tensors in predicted logits (#7542) 2020-10-05 06:33:15 -04:00
60de910e60 Add power argument for TF PolynomialDecay (#5732)
* 🚩 Add `power` argument for TF PolynomialDecay

* 🚩 Create default optimizer with power

* 🚩 Add argument to training args

* 🚨 Clean code format

* 🚨 Fix black warning

* 🚨 Fix code format
2020-10-05 05:16:29 -04:00
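The commit threads a `power` argument through the TF optimizer setup; the underlying Keras schedule it configures looks like this (a quadratic decay instead of the linear `power=1.0` default):
```
import tensorflow as tf

schedule = tf.keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=10_000,
    end_learning_rate=0.0,
    power=2.0,  # previously the helpers were locked to 1.0 (linear decay)
)
optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)
```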
41c3a3b98e Add Electra unexpected keys (#7569) 2020-10-05 04:49:39 -04:00
071970feb8 [Model card] Java Code Summarizer model (#7568)
* Create README.md

* Update model_cards/ncoop57/bart-base-code-summarizer-java-v0/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-05 04:49:17 -04:00
02ef825be2 SqueezeBERT architecture (#7083)
* configuration_squeezebert.py

thin wrapper around bert tokenizer

fix typos

wip sb model code

wip modeling_squeezebert.py. Next step is to get the multi-layer-output interface working

set up squeezebert to use BertModelOutput when returning results.

squeezebert documentation

formatting

allow head mask that is an array of [None, ..., None]

docs

docs cont'd

path to vocab

docs and pointers to cloud files (WIP)

line length and indentation

squeezebert model cards

formatting of model cards

untrack modeling_squeezebert_scratchpad.py

update aws paths to vocab and config files

get rid of stub of NSP code, and advise users to pretrain with mlm only

fix rebase issues

redo rebase of modeling_auto.py

fix issues with code formatting

more code format auto-fixes

move squeezebert before bert in tokenization_auto.py and modeling_auto.py because squeezebert inherits from bert

tests for squeezebert modeling and tokenization

fix typo

move squeezebert before bert in modeling_auto.py to fix inheritance problem

disable test_head_masking, since squeezebert doesn't yet implement head masking

fix issues exposed by the test_modeling_squeezebert.py

fix an issue exposed by test_tokenization_squeezebert.py

fix issue exposed by test_modeling_squeezebert.py

auto generated code style improvement

issue that we inherited from modeling_xxx.py: SqueezeBertForMaskedLM.forward() calls self.cls(), but there is no self.cls, and I think the goal was actually to call self.lm_head()

update copyright

resolve failing 'test_hidden_states_output' and remove unused encoder_hidden_states and encoder_attention_mask

docs

add integration test. rename squeezebert-mnli --> squeezebert/squeezebert-mnli

autogenerated formatting tweaks

integrate feedback from patrickvonplaten and sgugger to programming style and documentation strings

* tiny change to order of imports
2020-10-05 04:25:43 -04:00
e2c935f561 Cleanup documentation for BART, Marian, MBART and Pegasus (#7523)
* Cleanup documentation for BART, Marian, MBART and Pegasus

* Cleanup documentation for BART, Marian, MBART and Pegasus
2020-10-05 04:22:12 -04:00
5e941bece2 LayoutLM: add exception handling for bbox values (#7452)
* LayoutLM: add exception handling for bbox values

To replicate the unhandled error:

- In `test_modeling_layoutlm.py` set `range_bbox=1025`, i.e. greater than 1024
- Run `pytest tests/test_modeling_layoutlm.py`

The requirement for bbox values to be within the range 0-1000 is documented,
but if it is violated then it is not clear from the error message what the
issue is.

* Update src/transformers/modeling_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-10-05 04:17:14 -04:00
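A sketch of the kind of guard this adds (exact wording and placement differ): fail with a readable message instead of an opaque embedding-lookup error when a coordinate leaves the documented 0-1000 range.
```
def check_bbox_range(bbox):
    # LayoutLM's 2D position embeddings are sized for coordinates in [0, 1000].
    for coord in bbox:
        if not 0 <= coord <= 1000:
            raise ValueError(
                f"Bbox coordinate {coord} is outside the 0-1000 range; "
                "normalize boxes to a 0-1000 scale before calling LayoutLM."
            )
```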
2ca0fae9a6 added script for fine-tuning roberta for sentiment analysis task (#7505) 2020-10-05 03:57:15 -04:00
95f792afb0 Remove labels from the RagModel example (#7560) 2020-10-04 17:39:23 -04:00
99cb924bfb [s2s] add config params like Dropout in Seq2SeqTrainingArguments (#7532) 2020-10-04 12:42:30 -04:00
9bdce3a4f9 [s2s] fix lockfile and peg distillation constants (#7545) 2020-10-02 15:58:14 -04:00
de4d7b004a [s2s] Adafactor support for builtin trainer (#7522) 2020-10-01 17:27:45 -04:00
d3a9601a11 [s2s] trainer scripts: Remove --run_name, thanks sylvain! (#7521) 2020-10-01 17:18:47 -04:00
bdcc4b78a2 Fix seq2seq example test (#7518)
* Fix seq2seq example test

* Fix bad copy-paste

* Also save the state
2020-10-01 14:13:29 -04:00
29baa8fabe Clean the Trainer state (#7490)
* Trainer should not modify its TrainingArguments

* Trainer should not modify its TrainingArguments

* Trainer should not modify its TrainingArguments

* Add test of resumed training

* Fixes

* Non multiGPU test

* Clean Trainer state

* Add more to the state

* Documentation

* One last test

* Make resume training test more complete

* Unwanted changes
2020-10-01 13:07:04 -04:00
2a358f45ef [s2s] fix nltk pytest race condition with FileLock (#7515) 2020-10-01 12:51:09 -04:00
72d363d979 [examples/s2s] clean up finetune_trainer (#7509) 2020-10-01 12:19:29 -04:00
bd2621583b fix data type (#7513) 2020-10-01 18:15:41 +02:00
62f5ae68ec [Seq2Seq] Fix a couple of bugs and clean examples (#7474)
* clean T5

* fix t5 tests

* fix index typo

* fix tf common test

* fix examples

* change positional ordering for Bart and FSTM

* add signature test

* clean docs and add tests

* add docs to encoder decoder

* clean docs

* correct two doc strings

* remove sig test for TF Elektra & Funnel

* fix tf t5 slow tests

* fix input_ids to inputs in tf

* Update src/transformers/modeling_bart.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_bart.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* implement lysandre results

* make style

* fix encoder decoder typo

* fix tf slow tests

* fix slow tests

* renaming

* remove unused input

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-10-01 17:38:50 +02:00
a42f62d34f Train T5 in Tensorflow 2 Community Notebook (#7428)
* t5 t5 community notebook added

* author link updated

* t5 t5 community notebook added

* author link updated

* new colab link updated

Co-authored-by: harris <muhammad.harris@visionx.io>
2020-10-01 16:54:29 +02:00
5fc3b5cba4 Fix Tune progress_reporter kwarg (#7508) 2020-10-01 10:34:31 -04:00
dabc85d1ba Report Tune metrics in final evaluation (#7507) 2020-10-01 09:52:36 -04:00
9a92afb6d0 Update LayoutLM doc (#7388)
Co-authored-by: Alexandr Maslov <avmaslov3@gmail.com>
2020-10-01 09:11:42 -04:00
e32390931d [model_card] distilbert-base-german-cased 2020-10-01 09:08:49 -04:00
9a4e163b58 [model_card] Fix metadata, adalbertojunior/PTT5-SMALL-SUM 2020-10-01 08:54:06 -04:00
8435e10e24 Create README.md (#7299)
* Create README.md

* language metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-01 08:52:28 -04:00
d727432072 Update README.md (#7459) 2020-10-01 08:51:26 -04:00
664da5b077 Create README.md (#7468) 2020-10-01 08:50:26 -04:00
f745f61c99 Update README.md (#7491)
Model now fine-tuned on Transformers 3.1.0, previous out-of-date model was fine-tuned on Transformers 2.3.0.
2020-10-01 08:50:07 -04:00
6ef7658c0a Create README.md (#7349)
Model card for akhooli/personachat-arabic
2020-10-01 08:48:51 -04:00
15ab3f049b Creating readme for bert-base-mongolian-cased (#7439)
* Creating readme for bert-base-mongolian-cased

* Update model_cards/bayartsogt/bert-base-mongolian-cased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-10-01 08:46:27 -04:00
0c2b9fa831 creating readme for bert-base-mongolian-uncased (#7440) 2020-10-01 08:45:22 -04:00
381443c096 Update README.md (#7498)
Making transformers readme more robust.
2020-10-01 07:42:07 -04:00
85d2d8c920 Fix local_files_only for TF (#6091) 2020-10-01 05:06:02 -04:00
9e80f972fb Enable pegasus fp16 by clamping large activations (#7243)
* Clean clamp

* boom boom

* Take some other changes

* boom boom

* boom boom

* boom boom

* one chg

* fix test

* Use finfo

* style
2020-10-01 04:48:37 -04:00
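A sketch of the clamping trick (mirroring the shape of the fix, not its exact code): squash infinities back inside fp16's representable range so later layers keep seeing finite activations.
```
import torch

def clamp_if_overflow(hidden_states: torch.Tensor) -> torch.Tensor:
    # In fp16, large activations overflow to inf; clamp to just inside the
    # dtype's maximum so downstream layers remain numerically stable.
    if hidden_states.dtype == torch.float16 and torch.isinf(hidden_states).any():
        clamp_value = torch.finfo(hidden_states.dtype).max - 1000
        hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
    return hidden_states
```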
be51c1039d Add forgotten return_dict argument in the docs (#7483) 2020-10-01 04:41:29 -04:00
48f23f92a8 [s2sTrainer] test + code cleanup (#7467) 2020-10-01 00:33:01 -04:00
097049b81b Distributed Trainer: 2 little fixes (#7461)
* reset model.config

* Update src/transformers/trainer.py

* use lower case tensor

* Just tensor change
2020-09-30 22:14:14 -04:00
0acd1ffa09 [doc] rm Azure buttons as not implemented yet 2020-09-30 17:31:08 -04:00
03e46c1de3 [s2s] fix kwargs style (#7488) 2020-09-30 17:00:06 -04:00
6fe8a693eb [s2s] Fix t5 warning for distributed eval (#7487) 2020-09-30 16:58:03 -04:00
4c6728460a Bump isort version. (#7484) 2020-09-30 13:44:58 -04:00
c031d01023 Seq2SeqDataset: avoid passing src_lang everywhere (#7470)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-30 13:27:48 -04:00
08939cfdf7 [s2strainer] fix eval dataset loading (#7477) 2020-09-30 12:39:13 -04:00
a97a73e0ee Small QOL improvements to TrainingArguments (#7475)
* Small QOL improvements to TrainingArguments

* With the self.
2020-09-30 12:12:03 -04:00
dc7d2daa4c Alphabetize model lists (#7478) 2020-09-30 10:43:58 -04:00
fdccf82e28 Remove config assumption in Trainer (#7464)
* Remove config assumption in Trainer

* Initialize for eval
2020-09-30 09:03:25 -04:00
cc4eff8087 Make transformers install check positive (#7473)
When transformers is correctly installed, you should get a positive message ^_^
2020-09-30 07:44:40 -04:00
7a0cf0ec93 Add DeBERTa model (#5929)
* Add DeBERTa model

* Remove dependency of deberta

* Address comments

* Patch DeBERTa
Documentation
Style

* Add final tests

* Style

* Enable tests + nitpicks

* position IDs

* BERT -> DeBERTa

* Quality

* Style

* Tokenization

* Last updates.

* @patrickvonplaten's comments

* Not everything can be a copy

* Apply most of @sgugger's review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Last reviews

* DeBERTa -> Deberta

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-30 07:07:30 -04:00
44a93c981f Number of GPUs for multi-gpu (#7472) 2020-09-30 06:53:20 -04:00
886ef35ce6 Fix LXMERT with DataParallel (#7471) 2020-09-30 06:41:24 -04:00
35e94c68df Number of GPUs 2020-09-30 12:29:26 +02:00
056723ad1d Multi-GPU setup (#7453) 2020-09-30 05:53:34 -04:00
4ba248748f Get a better error when check_copies fails (#7457)
* Get a better error when check_copies fails

* Fix tests
2020-09-30 10:05:14 +02:00
bef0175168 remove codecov PR comments (#7400) 2020-09-29 15:16:43 -04:00
a1c2ef7bd0 Add documentation for v3.3.1 2020-09-29 14:31:43 -04:00
1ba08dc221 Release: v3.3.1 2020-09-29 14:17:34 -04:00
8546dc55c2 Fix Trainer tests in a multiGPU env (#7458) 2020-09-29 14:06:41 -04:00
d0fd7154c5 Catch import datasets common errors (#7456) 2020-09-29 13:42:09 -04:00
f1220c5fe2 Add a code of conduct (#7433) 2020-09-29 13:38:47 -04:00
9e9a1fb8c7 Adding gradient checkpointing to GPT2 (#7446)
* GPT2 gradient checkpointing

* find_unused_parameters removed if checkpointing

* find_unused_parameters removed if checkpointing

* Update src/transformers/configuration_gpt2.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Added a test for generation with checkpointing

* Update src/transformers/configuration_gpt2.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-29 12:26:26 -04:00
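A usage sketch, assuming the feature landed as a `gradient_checkpointing` flag on `GPT2Config`; the cache is disabled because cached key/values defeat activation recomputation:
```
from transformers import GPT2Config, GPT2LMHeadModel

config = GPT2Config.from_pretrained("gpt2", gradient_checkpointing=True, use_cache=False)
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
# Each block's forward activations are recomputed during backward,
# trading extra compute for a much smaller memory footprint.
```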
52e8392b7e Add automatic best model loading to Trainer (#7431)
* Add automatic best model loading to Trainer

* Some small fixes

* Formatting
2020-09-29 10:41:18 -04:00
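A sketch of the new arguments in use (names as introduced around this PR; treat the exact spellings as assumptions):
```
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,        # reload the best checkpoint after training
    metric_for_best_model="eval_loss",  # which logged metric defines "best"
    greater_is_better=False,            # lower loss is better
)
```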
1fc4de69ed Document new features of make fixup (#7434) 2020-09-29 03:56:57 -04:00
205bf0b7ea Update README.md (#7444)
Hi, just corrected the example code, added 2 links and fixed some typos
2020-09-29 03:18:01 -04:00
74d8d69bd4 [s2s] consistent output format across eval scripts (#7435) 2020-09-28 23:20:03 -04:00
671b278e25 Create README.md (#7436)
* Create README.md

MagBERT-NER : Added widget (Text)

* Rename model_cards/README.md to model_cards/TypicaAI/magbert-ner/README.md
2020-09-28 18:25:25 -04:00
a1a8ffa512 Update README.md (#7429)
Add links to models fine-tuned on a downstream task
2020-09-28 13:40:09 -04:00
f62f2ffdcc [makefile] 10x speed up checking/fixing (#7403)
* [makefile] check/fix only modified since branching files

* fix phonies

* parametrize dirs

* have only one source for dirs to check

* look ma, no autoformatters here
2020-09-28 10:45:42 -04:00
16c213820e Update docs to version v3.3.0 2020-09-28 16:32:00 +02:00
0613f05226 Release: v3.3.0 2020-09-28 16:24:43 +02:00
ca3fc36de3 Reorganize documentation navbar (#7423)
* Reorganize documentation navbar

* Update css to have clear sections
2020-09-28 16:22:58 +02:00
7f4115c099 Pull request template (#7392)
co-authored-by: sgugger <sylvain.gugger@gmail.com>

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
2020-09-28 09:51:49 -04:00
0611eab5e3 Document RAG again (#7377)
Do not merge before Monday
2020-09-28 08:31:46 -04:00
7563d5a3cf Catch PyTorch warning when saving/loading scheduler (#7401) 2020-09-28 08:20:10 -04:00
1749ca317e docs: fix model sharing file names (#5855)
* docs: fix model sharing file names

* Update docs/source/model_sharing.rst

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* docs(model_sharing.rst): fix new line

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-28 08:17:30 -04:00
8279471506 correct RAG model cards (#7420) 2020-09-28 11:08:39 +02:00
4083a55ab0 Flos fix (#7384) 2020-09-28 04:09:26 -04:00
ae3e84f3ba [RAG] Clean Rag readme in examples (#7413)
* Improve README + consolidation script

* Reformat README

* Reformat README

Co-authored-by: Your Name <you@example.com>
2020-09-28 10:06:39 +02:00
748425d47d [T5] allow config.decoder_layers to control decoder size (#7409)
* Working asymmetrical T5

* rename decoder_layers -> num_decoder_layers

* Fix docstring

* Allow creation of asymmetric t5 students
2020-09-28 03:08:04 -04:00
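A sketch of building an asymmetric student with the renamed knob, `num_decoder_layers` (which defaults to `num_layers` when unset):
```
from transformers import T5Config, T5ForConditionalGeneration

# Full-depth encoder, shallow 3-layer decoder.
config = T5Config.from_pretrained("t5-base", num_decoder_layers=3)
student = T5ForConditionalGeneration(config)  # fresh weights; distill from a teacher
```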
7296fea1d6 [s2s] rougeLSum expects \n between sentences (#7410)
Co-authored-by: Swetha Mandava <smandava@nvidia.com>
2020-09-27 16:27:19 -04:00
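A sketch of the pre-processing this implies: rougeLsum scores at the sentence level and treats "\n" as the sentence separator, so predictions and references need one sentence per line (assumes NLTK with the `punkt` data installed):
```
from nltk import sent_tokenize  # requires: nltk.download("punkt")

def prepare_for_rouge_lsum(text: str) -> str:
    # One sentence per line, as rougeLsum expects.
    return "\n".join(sent_tokenize(text))
```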
eab5f59682 [s2s] add create student script (#7290)
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-27 15:10:46 -04:00
e50a931c11 [Longformer, Bert, Roberta, ...] Fix multi gpu training (#7272)
* fix multi-gpu

* fix longformer

* force to delete unnecessary layers

* fix notifications

* fix warning

* fix roberta

* fix tests

* remove hasattr

* fix tests

* fix roberta

* merge and clean authorized keys
2020-09-25 20:33:21 +02:00
2c8ecdf8a8 fix rag retriever save pretrained (#7399) 2020-09-25 19:47:12 +02:00
1a14687e6f Update README.md 2020-09-25 19:43:48 +02:00
3327c2b0f6 Update README.md 2020-09-25 19:43:36 +02:00
fe326bd5cf Remove dependency on examples/seq2seq from rag (#7395)
Co-authored-by: Your Name <you@example.com>
2020-09-25 18:20:49 +02:00
ad39271ae8 Fix FP16 and attention masks in FunnelTransformer (#7374)
* Fix #7371

* Fix training

* Fix test values

* Apply the fix to TF as well
2020-09-25 12:20:39 -04:00
4e5b036bdd Update README.md 2020-09-25 18:16:46 +02:00
55eccfbb49 Update README.md 2020-09-25 18:16:44 +02:00
e2e77f02c2 Fix BartModel output documentation (#7390) 2020-09-25 11:48:13 -04:00
bbb07830ff Speedup check_copies script (#7394) 2020-09-25 11:47:22 -04:00
8859c4f841 [code quality] new make target that combines style and quality targets (#7310)
* [code quality] merge style and quality targets

Any reason why we don't run `flake8` in `make style`? I find myself needing to run `make style` and `make quality` all the time, but I need the latter just for the last 2 checks. Since we have no control over the source code, why bother with separating checking and fixing - let's just have one target that fixes and then performs the remaining checks, as we know the first two have been done already.

This PR suggests to merge the 2 targets into one efficient target.

I will edit the docs if this change resonates with the team.

* move checks into style, re-use target

* better name

* add fixup target

* document new target
2020-09-25 11:37:40 -04:00
38a1b03f4d Remove unhelpful bart warning (#7391) 2020-09-25 11:01:07 -04:00
5ff0d6d7d0 Update README.md 2020-09-25 16:58:29 +02:00
cf1c88e092 [RAG] Fix retrieval offset in RAG's HfIndex and better integration tests (#7372)
* Fix retrieval offset in RAG's HfIndex

* update slow tests

* style

* fix new test

* style

* add better tests

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-09-25 16:12:46 +02:00
571c7a11c1 [Rag] Fix wrong usage of num_beams and bos_token_id in Rag Sequence generation (#7386)
* fix_rag_sequence

* add second bug fix
2020-09-25 14:35:49 +02:00
415071b4c2 doc changes (#7385) 2020-09-25 08:00:36 -04:00
2dd652d757 [RAG] Add missing doc and attention_mask to rag (#7382)
* add docs

* add missing docs and attention_mask in fine-tune
2020-09-25 11:23:55 +02:00
7cdd9da5bf Check config type using type instead of isinstance (#7363)
* Check config type instead of instance


Bad merge

* Remove for loops

* Style
2020-09-25 05:09:09 -04:00
3c6bf8998f modeling_bart: 3 small cleanups that dont change outputs (#7381)
* Mbart passing

* boom boom

* cleaner assert

* add assert

* Fix tests
2020-09-25 04:24:14 -04:00
9e68d075a4 Seq2SeqTrainer (#6769)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-24 18:46:58 -04:00
d9d0f1140b [s2s] distributed eval allows num_return_sequences > 1 (#7254) 2020-09-24 17:30:09 -04:00
0804d077c6 correct attention mask (#7373) 2020-09-24 23:22:04 +02:00
a8cbc4269c [fsmt] build/test scripts (#7257)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-24 17:10:26 -04:00
a8e7982f84 Remove mentions of RAG from the docs (#7376)
* Remove mentions of  RAG from the docs

* Deactivate check
2020-09-24 17:07:14 -04:00
eadd870b2f [seq2seq] make it easier to run the scripts (#7274) 2020-09-24 15:23:48 -04:00
8d3bb781ee Formatter (#7368)
* Formatter

* Docs
2020-09-24 10:59:21 -04:00
7dfdf793bb Fixing case in which Trainer hung while saving model in distributed training (#7365)
* remote debugging

* remote debugging

* moved _store_flos call

* moved _store_flos call

* moved _store_flos call

* removed debugging artefacts
2020-09-24 09:56:40 -04:00
0ccb6f5c6d Clean RAG docs and template docs (#7348)
* Clean RAG docs and template docs

* Fix typo

* Better doc
2020-09-24 09:24:41 -04:00
27174bd4fe Make PyTorch model files independent from each other (#7352) 2020-09-24 08:53:54 -04:00
d161ed1682 Update the TF models to remove their interdependencies (#7238)
* Refacto the models to remove their interdependencies

* Fix Flaubert model

* Fix Flaubert

* Fix XLM

* Fix Albert

* Fix Roberta

* Fix Albert

* Fix Flaubert

* Apply style + remove unused imports

* Fix Distilbert

* remove unused import

* fix Distilbert

* Fix Flaubert

* Apply style

* Fix Flaubert

* Add the copy comments for the check_copies script

* Fix MobileBert model name

* Address Morgan's comments

* Fix typo

* Oops typo
2020-09-24 08:30:59 -04:00
0cffa424f8 Update tokenization_auto.py (#6870)
Update tokenization_auto.py to handle inherited tokenizers
2020-09-24 06:52:10 -04:00
03fb8e79c6 Update modeling_tf_longformer.py (#7359)
correct a very small mistake
2020-09-24 11:37:29 +02:00
1ff5bd38a3 Check decorator order (#7326)
* Check decorator order

* Adapt for parametrized decorators

* Fix typos
2020-09-24 04:54:37 -04:00
0be5f4a00c Expand a bit the documentation doc (#7350) 2020-09-24 04:34:18 -04:00
38f1703795 wip: Code to add lang tags to marian model cards (#6586) 2020-09-23 18:11:06 -04:00
129fdae040 Remove reference to args in XLA check (#7344)
Previously, the TFTrainingArguments object did a check to see if XLA was enabled, but did this by referencing `self.args.xla`, when it should be `self.xla`, because it is the args object. This can be verified a few lines above, where the XLA field is set.
2020-09-23 13:56:21 -04:00
d266613635 [Benchmarks] Change all args to from no_... to their positive form (#7075)
* Changed name to all no_... arguments and all references to them, inverting the boolean condition

* Change benchmark tests to use new Benchmark Args

* Update src/transformers/benchmark/benchmark_args_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/benchmark/benchmark.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Fix Style. Add --no options in help

* fix some part of tests

* Update src/transformers/benchmark/benchmark_args_utils.py

* Update src/transformers/benchmark/benchmark_args_utils.py

* Update src/transformers/benchmark/benchmark_args_utils.py

* fix all tests

* make style

* add backwards compability

* make backwards compatible

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: fmcurti <fcurti@DESKTOP-RRQURBM.localdomain>
2020-09-23 13:25:24 -04:00
8c697d58ef Ensure that integrations are imported before transformers or ml libs (#7330)
* Ensure that integrations are imported before transformers or ml libs

* Black reformatter wanted a newline

* isort requests

* black requests

* flake8 requests
2020-09-23 13:23:45 -04:00
3323146e90 Models doc (#7345)
* Clean up model documentation

* Formatting

* Preparation work

* Long lines

* Main work on rst files

* Cleanup all config files

* Syntax fix

* Clean all tokenizers

* Work on first models

* Models beginning

* FlauBERT

* All PyTorch models

* All models

* Long lines again

* Fixes

* More fixes

* Update docs/source/model_doc/bert.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update docs/source/model_doc/electra.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Last fixes

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-23 13:20:45 -04:00
58405a527b Fixed evaluation_strategy on epoch end bug (#7340)
* Fixed evaluation_strategy on epoch end bug

move the evaluation script outside the iteration loop

* black formatting
2020-09-23 13:17:00 -04:00
28cf873036 [testing] skip decorators: docs, tests, bugs (#7334)
* skip decorators: docs, tests, bugs

* another important note

* style

* bloody style

* add @pytest.mark.parametrize

* add note

* no idea what it wants :(
2020-09-23 05:16:19 -04:00
df53643807 [code quality] fix confused flake8 (#7309)
* fix confused flake

We run `black  --target-version py35 ...` but flake8 doesn't know that, so currently with py38 flake8 fails suggesting that black should have reformatted 63 files. Indeed if I run:

```
black --line-length 119 --target-version py38 examples templates tests src utils
```
it indeed reformats 63 files.

The only solution I found is to create a black config file as explained at https://github.com/psf/black#configuration-format, which is what this PR adds.

Now flake8 knows that py35 is the standard and no longer gets confused regardless of the user's python version.

* adjust the other files that will now rely on black's config file
2020-09-22 22:12:36 -04:00
78387cc63e [s2s] only save metrics.json from rank zero (#7331) 2020-09-22 18:27:28 -04:00
e53138a1b9 [s2s] add src_lang kwarg for distributed eval (#7300) 2020-09-22 18:26:37 -04:00
a9c7849cfa [model_cards] blinoff/roberta-base-russian-v0 (#7317) 2020-09-22 18:26:13 -04:00
f5518e5631 Formatting 2020-09-22 14:55:12 -04:00
17099ebd58 Add num workers cli arg (#7322)
* Add dataloader_num_workers to TrainingArguments

This argument is meant to be used to set the
number of workers for the PyTorch DataLoader.

* Pass num_workers argument on DataLoader init
2020-09-22 14:44:42 -04:00
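A sketch of the new argument, which is forwarded to `torch.utils.data.DataLoader(num_workers=...)`:
```
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    dataloader_num_workers=4,  # 0 (the default) keeps data loading in the main process
)
```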
25b0463d0b [s2s] add supported architecures to MD (#7252) 2020-09-22 13:09:35 -04:00
d6bc72c469 Fixed results of SQuAD-FR evaluation (#7313)
The score for the F1 metric was reported as the Exact Match and vice-versa.
2020-09-22 12:39:07 -04:00
6303b5a718 [Bug Fix] The actual batch_size is inconsistent with the settings. (#7235)
* [bug fix] fixed the bug where the actual batch_size was inconsistent with the parameter settings

* reformat

* reformat

* reformat

* add support for dict and BatchEncoding

* add support for dict and BatchEncoding

* add documentation for DataCollatorForNextSentencePrediction

* Some more nits for the docstring

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Some more nits for the docstring

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Some more nits for the docstring

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Some more nits for the docstring

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Some more nits for the docstring

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* rename variables

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-22 12:31:21 -04:00
c754c41c61 RAG (#6813)
* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* path fix

* Formatting / renaming prior to actual work

* added rag WIP

* Formatting / renaming prior to actual work

* First commit

* improve comments

* Retrieval evaluation scripts

* refactor to include modeling outputs + MPI retriever

* Fix rag-token model + refactor

* Various fixes + finetuning logic

* use_bos fix

* Retrieval refactor

* Finetuning refactoring and cleanup

* Add documentation and cleanup

* Remove set_up_rag_env.sh file

* Fix retrieval with HF index

* Fix import errors

* Fix quality errors

* Refactor as per suggestions in https://github.com/huggingface/transformers/pull/6813#issuecomment-687208867

* fix quality

* Fix RAG Sequence generation

* minor cleanup plus initial tests

* fix test

* fix tests 2

* Comments fix

* post-merge fixes

* Improve readme + post-rebase refactor

* Extra dependencies for tests

* Fix tests

* Fix tests 2

* Refactor test requirements

* Fix tests 3

* Post-rebase refactor

* rename nlp->datasets

* RAG integration tests

* add tokenizer to slow integration test and allow retriever to run on cpu

* add tests; fix position ids warning

* change structure

* change structure

* add from encoder generator

* save working solution

* make all integration tests pass

* add RagTokenizer.save/from_pretrained and RagRetriever.save/from_pretrained

* don't save paths

* delete unnecessary imports

* pass config to AutoTokenizer.from_pretrained for Rag tokenizers

* init wiki_dpr only once

* hardcode legacy index and passages paths (todo: add the right urls)

* finalize config

* finalize retriver api and config api

* LegacyIndex index download refactor

* add dpr to autotokenizer

* make from pretrained more flexible

* fix ragfortokengeneration

* small name changes in tokenizer

* add labels to models

* change default index name

* add retrieval tests

* finish token generate

* align test with previous version and make all tests pass

* add tests

* finalize tests

* implement thoms suggestions

* add first version of test

* make first tests work

* make retriever platform agnostic

* naming

* style

* add legacy index URL

* docstrings + simple retrieval test for distributed

* clean model api

* add doc_ids to retriever's outputs

* fix retrieval tests

* finish model outputs

* finalize model api

* fix generate problem for rag

* fix generate for other models

* fix some tests

* save intermediate

* set generate to default

* big refactor generate

* delete rag_api

* correct pip faiss install

* fix auto tokenization test

* fix faiss install

* fix test

* move the distributed logic to examples

* model page

* docs

* finish tests

* fix dependencies

* fix import in __init__

* Refactor eval_rag and finetune scripts

* start docstring

* add psutil to test

* fix tf test

* move require torch to top

* fix retrieval test

* align naming

* finish automodel

* fix repo consistency

* test ragtokenizer save/load

* add rag model output docs

* fix ragtokenizer save/load from pretrained

* fix tokenizer dir

* remove torch in retrieval

* fix docs

* fixe finetune scripts

* finish model docs

* finish docs

* remove auto model for now

* add require torch

* remove solved todos

* integrate sylvains suggestions

* sams comments

* correct mistake on purpose

* improve README

* Add generation test cases

* fix rag token

* clean token generate

* fix test

* add note to test

* fix attention mask

* add t5 test for rag

* Fix handling prefix in finetune.py

* don't overwrite index_name

Co-authored-by: Patrick Lewis <plewis@fb.com>
Co-authored-by: Aleksandra Piktus <piktus@devfair0141.h2.fair>
Co-authored-by: Aleksandra Piktus <piktus@learnfair5102.h2.fair>
Co-authored-by: Aleksandra Piktus <piktus@learnfair5067.h2.fair>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Quentin Lhoest <lhoest.q@gmail.com>
2020-09-22 18:29:58 +02:00
1ee2194fb6 Mark big downloads slow (#7325)
* Make big downloads as slow

* Add import

* Right order for slow decorator

* More slow tests
2020-09-22 12:21:52 -04:00
585217c87f Add generic text classification example in TF (#5716)
* Add new example with nlp

* Update README

* replace nlp by datasets

* Update examples/text-classification/README.md

Add Lysandre's suggestion.

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-22 12:05:05 -04:00
6e21f24220 Documentation version 2020-09-22 18:04:39 +02:00
3ebb1b3a2b Release: v3.2.0 2020-09-22 17:36:51 +02:00
01f0fd0bab Fixes for LayoutLM (#7318) 2020-09-22 10:37:11 -04:00
702a76ff92 Create an XLA parameter and fix the mixed precision (#7311)
* Create an XLA parameter and fix mixed precision creation

* Fix issue brought by intellisense

* Complete docstring
2020-09-22 10:19:34 -04:00
596342c2b9 Support for Windows in check_copies (#7316) 2020-09-22 10:17:48 -04:00
89edf504bf Add possibility to evaluate every epoch (#7302)
* Add possibility to evaluate every epoch

* Remove multitype arg

* Remove needless import

* Use a proper enum

* Apply suggestions from @LysandreJik

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* One else and formatting

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-22 09:52:29 -04:00
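A sketch of the resulting enum-backed option (values per this PR: "no", "steps", or "epoch"):
```
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    evaluation_strategy="epoch",  # also accepts "no" (default) or "steps"
)
```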
21ca148090 is_pretokenized -> is_split_into_words (#7236)
* is_pretokenized -> is_split_into_words

* Fix tests
2020-09-22 09:34:35 -04:00
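A usage sketch of the renamed argument: the input is already split into words (as in token classification), and the tokenizer still applies subword tokenization on top of each word:
```
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
encoding = tokenizer(["Hello", "world", "!"], is_split_into_words=True)
print(encoding.tokens())  # wordpieces aligned to the original words
```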
324f361e91 Fix saving TF custom models (#7291)
* Fix #7277

* Apply style

* Add a full training pipeline test

* Apply style
2020-09-22 09:31:13 -04:00
cd9a0585ea Add LayoutLM Model (#7064)
* first version

* finish test docs readme model/config/tokenization class

* apply make style and make quality

* fix layoutlm GitHub link

* fix conflict in index.rst and add layoutlm to pretrained_models.rst

* fix bug in test_parents_and_children_in_mappings

* reformat modeling_auto.py and tokenization_auto.py

* fix bug in test_modeling_layoutlm.py

* Update docs/source/model_doc/layoutlm.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/model_doc/layoutlm.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* remove inh, add tokenizer fast, and update some doc

* copy and rename necessary class from modeling_bert to modeling_layoutlm

* Update src/transformers/configuration_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/configuration_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/configuration_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/configuration_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_layoutlm.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* add mish to activations.py, import ACT2FN and import logging from utils

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-22 09:28:02 -04:00
244e1b5ba3 Fix #7304 (#7305) 2020-09-22 09:20:03 -04:00
e46108817e Adds FSMT to LM head AutoModel (#7312) 2020-09-22 06:35:51 -04:00
e2964b8a19 [fsmt] no need to pass device (#7292) 2020-09-22 05:39:06 -04:00
e4b94d8e58 Copy code from Bert to Roberta and add safeguard script (#7219)
* Copy code from Bert to Roberta and add safeguard script

* Fix docstring

* Comment code

* Formatting

* Update src/transformers/modeling_roberta.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Add test and fix bugs

* Fix style and make new comand

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-22 05:02:27 -04:00
656c27c3a3 [s2s] save hostname with repo info (#7301)
* save hostname
2020-09-21 17:26:24 -04:00
34a1b75f01 Added RobBERT-v2 model card (#7286)
* Added RobBERT-v2 model card

* minor Tweaks

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-21 16:17:28 -04:00
6513d16a48 IXAmBERT model card (#7283)
This PR includes the model card for the IXAmBERT model, which was recently uploaded to the Hugging Face repository.
2020-09-21 16:15:31 -04:00
af4b98ed97 [s2s] adjust finetune + test to work with fsmt (#7263) 2020-09-21 15:13:19 -04:00
8d562a2d1a [s2s] s/alpha_loss_encoder/alpha_encoder_loss/ (#7298)
fix to match `distillation.py:        self.alpha_encoder_loss`
2020-09-21 14:14:26 -04:00
cbb2f75a16 [s2s tests] fix test_run_eval_search (#7297) 2020-09-21 14:00:40 -04:00
7a88ed6c2a [model card] distlbart-mnli model cards (#7278) 2020-09-21 12:26:18 -04:00
63276b76d4 Fix #7284 (#7289) 2020-09-21 10:31:26 -04:00
8d464374ba Disable missing weight warning (#7282) 2020-09-21 09:14:48 -04:00
8ff88d25e9 [fsmt] rewrite SinusoidalPositionalEmbedding + USE_CUDA test fixes + new TranslationPipeline test (#7224)
* fix USE_CUDA, add pipeline

* USE_CUDA fix

* recode SinusoidalPositionalEmbedding into nn.Embedding subclass

was needed for torchscript to work - this is now part of the state_dict, so will have to remove these keys during save_pretrained

* back out (ci debug)

* restore

* slow last?

* facilitate not saving certain keys and test

* remove no longer used keys

* style

* fix logging import

* cleanup

* Update src/transformers/modeling_utils.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* fix bug in max_positional_embeddings

* rename keys to keys_to_never_save per suggestion, improve the setup

* Update src/transformers/modeling_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-21 09:13:35 -04:00
67c4b0c517 Add model cards for new pre-trained BERTweet-COVID19 models (#7269)
Two new pre-trained models, "vinai/bertweet-covid19-base-cased" and "vinai/bertweet-covid19-base-uncased", result from further pre-training the pre-trained model "vinai/bertweet-base" on a corpus of 23M COVID-19 English Tweets for 40 epochs.
2020-09-21 06:12:51 -04:00
0cbe1139b1 Update README.md 2020-09-21 11:53:08 +02:00
aae4edb5f0 Addressing review comment 2020-09-21 11:37:00 +02:00
43b9d93875 [example/glue] fix compute_metrics_fn for bart like models (#7248)
* fix compute_metrics_fn

* p.predictions -> preds

* apply suggestions
2020-09-21 05:34:20 -04:00
39062d05f0 Fixed target_mapping preparation for XLNet when batch size > 1 (incl. beam search) (#7267) 2020-09-21 04:53:52 -04:00
4b3e55bdcc Add "Fine-tune ALBERT for sentence-pair classification" notebook to the community notebooks (#7255) 2020-09-21 04:25:22 -04:00
7cbf0f722d examples/seq2seq/__init__.py mutates sys.path (#7194) 2020-09-20 16:54:42 -04:00
a4faeceaed Fix typo in model name (#7268) 2020-09-20 19:12:30 +02:00
47ab3e8262 @slow has to be last (#7251)
Found an issue where `@slow` isn't the last decorator (it gets ignored!), so documenting its significance.
2020-09-20 09:17:29 -04:00
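A sketch of the rule being documented: skip decorators like `@slow` must sit closest to the test function, i.e. be listed last, or they can be silently ignored (decorator names are from `transformers.testing_utils` of this era):
```
from transformers.testing_utils import require_torch, slow

@require_torch
@slow  # correct: innermost / listed last
def test_whole_pipeline():
    ...
```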
4f6e525742 model card improvements (#7221) 2020-09-19 17:02:05 -04:00
eb074af75e fsmt tiny model card + script (#7244) 2020-09-19 14:37:12 -04:00
1d90d0f386 Add title to model card (#7240) 2020-09-19 02:10:45 -04:00
c9b7ef042f Create README.md (#7239) 2020-09-19 02:09:29 -04:00
83dba10b8f [s2s] distributed_eval.py saves better speed info (#7242) 2020-09-18 15:46:01 -04:00
af2322c7a0 Add new pre-trained models BERTweet and PhoBERT (#6129)
* Add BERTweet and PhoBERT models

* Update modeling_auto.py

Re-add `bart` to LM_MAPPING

* Update tokenization_auto.py

Re-add `from .configuration_mobilebert import MobileBertConfig`
not sure why it's replaced by `from transformers.configuration_mobilebert import MobileBertConfig`

* Add BERTweet and PhoBERT to pretrained_models.rst

* Update tokenization_auto.py

Remove BertweetTokenizer and PhobertTokenizer out of tokenization_auto.py (they are currently not supported by AutoTokenizer.

* Update BertweetTokenizer - without nltk

* Update model card for BERTweet

* PhoBERT - with Auto mode - without import fastBPE

* PhoBERT - with Auto mode - without import fastBPE

* BERTweet - with Auto mode - without import fastBPE

* Add PhoBERT and BERTweet to TF modeling auto

* Improve Docstrings for PhobertTokenizer and BertweetTokenizer

* Update PhoBERT and BERTweet model cards

* Fixed a merge conflict in tokenization_auto

* Used black to reformat BERTweet- and PhoBERT-related files

* Used isort to reformat BERTweet- and PhoBERT-related files

* Reformatted BERTweet- and PhoBERT-related files based on flake8

* Updated test files

* Updated test files

* Updated tf test files

* Updated tf test files

* Updated tf test files

* Updated tf test files

* Update commits from huggingface

* Delete unnecessary files

* Add tokenizers to auto and init files

* Add test files for tokenizers

* Revised model cards

* Update save_vocabulary function in BertweetTokenizer and PhobertTokenizer and test files

* Revised test files

* Update orders of Phobert and Bertweet tokenizers in auto tokenization file
2020-09-18 13:16:43 -04:00
9397436ea5 Create README.md 2020-09-18 16:52:00 +02:00
7eeca4d399 Create README.md 2020-09-18 16:44:02 +02:00
31516c776a Update README.md 2020-09-18 16:37:14 +02:00
4c14669a78 Update README.md 2020-09-18 16:35:11 +02:00
3a03bab9db Fix a few countings (steps / epochs) in trainer_tf.py (#7175) 2020-09-18 09:28:56 -04:00
ee9eae4e06 token-classification: update url of GermEval 2014 dataset (#6571) 2020-09-18 06:18:06 -04:00
eef8d94d19 [model_cards]
We use ISO 639-1 cc @gentaiscool
2020-09-18 12:09:24 +02:00
afd6a9f827 Create README.md 2020-09-18 11:41:12 +02:00
9f1544b9e0 Create README.md 2020-09-18 11:37:20 +02:00
5c1d5ea667 Fixed typo in README (#7233) 2020-09-18 04:52:43 -04:00
7719ecd19f Fix a typo (#7225) 2020-09-18 04:23:33 -04:00
4a26e8ac5f Create README.md (#7205) 2020-09-18 03:24:30 -04:00
94320c5b81 Add customized text to widget (#7204) 2020-09-18 03:24:23 -04:00
3aefb24b20 Create README.md (#7209) 2020-09-18 03:24:10 -04:00
a22e7a8dd4 Create README.md (#7210) 2020-09-18 03:23:58 -04:00
c028b26481 Create README.md (#7212) 2020-09-18 03:23:49 -04:00
c7cdd7b4fd Create README.md for indobert-lite-base-p1 (#7182) 2020-09-18 03:22:32 -04:00
bfb9150b8f Create README.md for indobert-lite-large-p1 (#7184)
* Create README.md

* Update README.md
2020-09-18 03:22:11 -04:00
d193593403 Create README.md (#7183) 2020-09-18 03:21:54 -04:00
e65d846674 Create README.md (#7185) 2020-09-18 03:21:39 -04:00
e27d86d48d Create README.md for indobert-large-p2 model card (#7181) 2020-09-18 03:21:28 -04:00
881c0783e9 Create README.md for indobert-large-p1 model card (#7180) 2020-09-18 03:21:16 -04:00
e0d58a5c87 Create README.md (#7179) 2020-09-18 03:20:59 -04:00
1313a1d2a8 Create README.md for indobert-base-p2 (#7178) 2020-09-18 03:20:29 -04:00
cf24f43e76 Create README.md (#7095)
Create model card for Pegasus QA
2020-09-18 03:19:45 -04:00
67d9fc50d9 [s2s] remove double assert (#7223) 2020-09-17 18:32:31 -04:00
edbaad2c5c [model cards] fix metadata - 3rd attempt (#7218) 2020-09-17 16:57:06 -04:00
999a1c957a skip failing FSMT CUDA tests until investigated (#7220) 2020-09-17 16:53:14 -04:00
51c4adf54c [model cards] fix dataset yaml (#7216) 2020-09-17 15:29:39 -04:00
a5638b2b3a [s2s] dynamic batch size with --max_tokens_per_batch (#7030) 2020-09-17 15:19:34 -04:00
efeab6a3f1 [s2s] run_eval/run_eval_search tweaks (#7192)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-17 14:26:38 -04:00
9c5bcab5b0 [model cards] fix yaml in cards (#7207) 2020-09-17 14:11:17 -04:00
e643a29722 Change to use relative imports in some files & Add python prompt symbols to example codes (#7202)
* Move 'from transformers' statements to relative imports in some files

* Add python prompt symbols in front of the example codes

* Reformat the code

* Add one missing space

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-17 12:30:45 -04:00
0fe6e435b6 [model cards] ported allenai Deep Encoder, Shallow Decoder models (#7153)
* [model cards] ported allenai Deep Encoder, Shallow Decoder models

* typo

* fix references

* add allenai/wmt19-de-en-6-6 model cards

* fill-in the missing info for the build script as provided by the searcher.
2020-09-17 17:58:49 +02:00
1eeb206bef [ported model] FSMT (FairSeq MachineTranslation) (#6940)
* ready for PR

* cleanup

* correct FSMT_PRETRAINED_MODEL_ARCHIVE_LIST

* fix

* perfectionism

* revert change from another PR

* odd, already committed this one

* non-interactive upload workaround

* backup the failed experiment

* store langs in config

* workaround for localizing model path

* doc clean up as in https://github.com/huggingface/transformers/pull/6956

* style

* back out debug mode

* document: run_eval.py --num_beams 10

* remove unneeded constant

* typo

* re-use bart's Attention

* re-use EncoderLayer, DecoderLayer from bart

* refactor

* send to cuda and fp16

* cleanup

* revert (moved to another PR)

* better error message

* document run_eval --num_beams

* solve the problem of tokenizer finding the right files when model is local

* polish, remove hardcoded config

* add a note that the file is autogenerated to avoid losing changes

* prep for org change, remove unneeded code

* switch to model4.pt, update scores

* s/python/bash/

* missing init (but doesn't impact the finetuned model)

* cleanup

* major refactor (reuse-bart)

* new model, new expected weights

* cleanup

* cleanup

* full link

* fix model type

* merge porting notes

* style

* cleanup

* have to create a DecoderConfig object to handle vocab_size properly

* doc fix

* add note (not a public class)

* parametrize

* - add bleu scores integration tests

* skip test if sacrebleu is not installed

* cache heavy models/tokenizers

* some tweaks

* remove tokens that aren't used

* more purging

* simplify code

* switch to using decoder_start_token_id

* add doc

* Revert "major refactor (reuse-bart)"

This reverts commit 226dad15ca6a9ef4e26178526e878e8fc5c85874.

* decouple from bart

* remove unused code #1

* remove unused code #2

* remove unused code #3

* update instructions

* clean up

* move bleu eval to examples

* check import only once

* move data+gen script into files

* reuse via import

* take less space

* add prepare_seq2seq_batch (auto-tested)

* cleanup

* recode test to use json instead of yaml

* ignore keys not needed

* use the new -y in transformers-cli upload -y

* [xlm tok] config dict: fix str into int to match definition (#7034)

* [s2s] --eval_max_generate_length (#7018)

* Fix CI with change of name of nlp (#7054)

* nlp -> datasets

* More nlp -> datasets

* Woopsie

* More nlp -> datasets

* One last

* extending to support allen_nlp wmt models

- allow a specific checkpoint file to be passed
- more arg settings
- scripts for allen_nlp models

* sync with changes

* s/fsmt-wmt/wmt/ in model names

* s/fsmt-wmt/wmt/ in model names (p2)

* s/fsmt-wmt/wmt/ in model names (p3)

* switch to a better checkpoint

* typo

* make non-optional args truly non-optional - adjust tests where possible or skip when there is no other choice

* consistency

* style

* adjust header

* cards moved (model rename)

* use best custom hparams

* update info

* remove old cards

* cleanup

* s/stas/facebook/

* update scores

* s/allen_nlp/allenai/

* url maps aren't needed

* typo

* move all the doc / build /eval generators to their own scripts

* cleanup

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* fix indent

* duplicated line

* style

* use the correct add_start_docstrings

* oops

* resizing can't be done with the core approach, due to 2 dicts

* check that the arg is a list

* style

* style

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-17 11:31:29 -04:00
492bb6aa48 Trainer multi label (#7191)
* Trainer accepts multiple labels

* Missing import

* Fix docstrings
2020-09-17 08:15:37 -04:00
709745927b Transformer-XL: Remove unused parameters (#7087)
* Removed 'tgt_len' and 'ext_len' from Transformer-XL

 * Some changes are still to be done

* Removed 'tgt_len' and 'ext_len' from Transformer-XL (2)

 * Removed comments
 * Fixed quality

* Changed warning to info
2020-09-17 06:10:34 -04:00
c183d81e27 added multilabel text classification notebook using distilbert to community notebooks (#7201)
* added multilabel classification using distilbert notebook to community notebooks

* added multilabel classification using distilbert notebook to community notebooks
2020-09-17 05:58:57 -04:00
79111b77d2 remove deprecated flag (#7171)
```
/home/circleci/.local/lib/python3.6/site-packages/isort/main.py:915: UserWarning: W0501: The following deprecated CLI flags were used and ignored: --recursive!
  "W0501: The following deprecated CLI flags were used and ignored: "
```
2020-09-17 05:52:12 -04:00
0cdafbf7ec remove duplicated code (#7173) 2020-09-17 05:51:40 -04:00
45b0b1ff2f [s2s] fix kwarg typo (#7196) 2020-09-16 21:58:57 -04:00
0203ad43bc [s2s] distributed eval cleanup (#7186) 2020-09-16 15:38:37 -04:00
3babef815c Formatting 2020-09-16 14:57:09 -04:00
42049b8e12 use the correct add_start_docstrings (#7174) 2020-09-16 14:40:35 -04:00
fdaf8ab349 [s2s run_eval] new features (#7109)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-16 13:59:57 -04:00
df165065c3 [model_cards] antoiloui/belgpt2 🇧🇪 (#7166)
* Create README.md

* Update README.md
2020-09-16 12:16:01 -04:00
108c9aefcc Update README (#7133)
* Rewrite and update README

* Typo and migration guide

* Apply suggestions from code review

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Address Clem's comments

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-09-16 12:12:12 -04:00
9e376e156a Add condition (#7161) 2020-09-16 09:15:10 -04:00
f8590c56e6 [doc] improve/expand the Parametrization section (#7156) 2020-09-16 08:45:50 -04:00
d3391c87fe build/eval/gen-card scripts for fsmt (#7155)
* build/eval/gen-card scripts for fsmt

* adjust for model renames
2020-09-16 08:41:26 -04:00
08bfc1718a fix the warning message of overflowed sequence (#7151) 2020-09-16 07:40:57 -04:00
af8425b749 Refactoring the TF activations functions (#7150)
* Refactoring the activations functions into a common file

* Apply style

* remove unused import

* fix tests

* Fix tests.
2020-09-16 07:03:47 -04:00
b00cafbde5 [docs] add testing documentation (#7101)
* [docs] add testing documentation

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* tweaks as suggested

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* tweaks

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/testing.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* more tweaks

* suggestions from @LysandreJik

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-15 19:25:25 -04:00
85ffda96fc fix encoder decoder kwargs (#7131) 2020-09-15 21:10:07 +02:00
4c62c6021a fix ZeroDivisionError and epoch counting (#7125)
* fix ZeroDivisionError and epoch counting

* Add test for num_train_epochs calculation in trainer.py

* Remove @require_non_multigpu for test_num_train_epochs_in_training
2020-09-15 11:51:50 -04:00
7af2791d77 Create README.md 2020-09-15 16:47:36 +02:00
153ec2f154 Funnel model cards (#7147) 2020-09-15 10:40:57 -04:00
7186ca6240 Multi predictions trainer (#7126)
* Allow multiple outputs

* Formatting

* Move the unwrapping before metrics

* Fix typo

* Add test for non-supported config options
2020-09-15 10:27:24 -04:00
52d250f6aa [model_cards] pvl/labse_bert model card
From **Language-Agnostic BERT Sentence Embedding**

https://ai.googleblog.com/2020/08/language-agnostic-bert-sentence.html
2020-09-15 08:54:12 -04:00
84d64805b0 Create README.md (#7097)
Model card for PEGASUS finetuned for paraphrasing task
2020-09-15 08:48:25 -04:00
52bb7ccce5 German electra model card v3 update (#7089)
* changed eval table model order

* Update install

* update mc
2020-09-15 08:48:13 -04:00
1a85299a5e Tiny typo fix (#7143) 2020-09-15 08:18:42 -04:00
e29c3f1b11 Add quotes to paths in MeCab arguments (#7142)
Without quotes directories with spaces in them will fail to be processed
correctly.
2020-09-15 19:04:50 +08:00
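For illustration, a minimal sketch of the quoting fix, assuming the `mecab-python3` option-string interface (the paths are hypothetical):
```
import MeCab  # mecab-python3

dic_dir = "/Users/Jane Doe/mecab/dic/ipadic"   # hypothetical path containing a space
mecabrc = "/Users/Jane Doe/mecab/etc/mecabrc"  # hypothetical path containing a space

# Broken: MeCab splits its option string on whitespace, so the space in the
# path turns one argument into two.
# tagger = MeCab.Tagger(f"-d {dic_dir} -r {mecabrc}")

# Fixed: wrap each path in double quotes, as this patch does.
tagger = MeCab.Tagger(f'-d "{dic_dir}" -r "{mecabrc}"')
print(tagger.parse("すもももももももものうち"))
```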
cb061e78e1 Fix TF Trainer loss calculation (#6998)
* create branch for issue #6968

* First attempt to fix incorrect tf trainer loss calculation

* Fix training loss in metric

* fix tf trainer evaluation loss

* apply count_instances_in_batch() for eval and test datasets

* prototype of using a new argument in trainer_tf.py to fix loss issue

* some renaming and fix, in particular for evaluation methods

* fix bugs to have a running version

* change to @staticmethod

* apply style
2020-09-15 05:41:00 -04:00
b0cbcdb05b [logging] remove no longer needed verbosity override (#7100) 2020-09-15 04:01:14 -04:00
2bf70e2150 Fix reproducible tests in Trainer (#7119)
* Fix reproducible tests in Trainer

* Deal with multiple GPUs
2020-09-15 03:32:44 -04:00
9e89390ce1 [QOL] add signature for prepare_seq2seq_batch (#7108) 2020-09-14 20:33:08 -04:00
33d479d2b2 [s2s] distributed eval in one command (#7124) 2020-09-14 15:57:56 -04:00
206b78d485 Pin version of TF and torch 2020-09-14 14:08:51 -04:00
90cde2e938 Add Mirror Option for Downloads (#6679)
* Add Tuna Mirror for Downloads from China

* format fix

* Use preset instead of hardcoding URL

* Fix

* make style

* update the mirror option doc

* update the mirror
2020-09-14 23:50:22 +08:00
e0e0675ac7 Demoing LXMERT with raw images by incorporating the FRCNN model for RoI-pooled extraction and bounding-box prediction on the GQA answer set. (#6986)
* adding demo

* Update examples/lxmert/requirements.txt

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update examples/lxmert/checkpoint.sh

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* added user input for .py demo

* updated model loading, data extraction, checkpoints, and lots of other automation

* adding normalizing for bounding boxes

* Update requirements.txt

* some optimizations for extracting data

* added data extracting file

* added data extraction file

* minor fixes to reqs and readme

* Style

* remove options

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-09-14 10:07:04 -04:00
5636cbb25d Extra ) 2020-09-14 09:37:55 -04:00
ccc8e30c8a Clean up autoclass doc (#7081) 2020-09-14 09:26:41 -04:00
3ca1874ca4 [examples testing] restore code (#7099)
For some reason https://github.com/huggingface/transformers/pull/5512 re-added temp dir creation code that was removed by
https://github.com/huggingface/transformers/pull/6494, defeating the purpose of that PR for those tests.
2020-09-14 08:54:23 -04:00
4d39148419 fix deprecation warnings (#7033)
* fix deprecation warnings

* remove tests/test_tokenization_common.py's test_padding_to_max_length

* revert test_padding_to_max_length
2020-09-14 07:51:19 -04:00
576eec98e0 ignore FutureWarning in tests (#7079) 2020-09-14 07:50:51 -04:00
15d18e0307 fix link to paper (#7116) 2020-09-14 07:43:40 -04:00
bb3106f741 Temporarily skip failing tests due to dependency change (#7118)
* Temporarily skip failing tests due to dependency change

* Remove trace
2020-09-14 07:42:13 -04:00
0fab39695a [s2s distill] allow pegasus-12-12 (#7104) 2020-09-14 00:03:59 -04:00
de9e297964 [s2s] distributed eval cleanup (#7110) 2020-09-13 23:40:38 -04:00
54395d87a6 Update xsum length penalty to better values (#7107) 2020-09-13 20:48:47 -04:00
e7f8d2ab64 [s2s] two stage run_distributed_eval.py (#7105) 2020-09-13 17:28:18 -04:00
0ec63afec2 fix bug in pegasus converter (#7094) 2020-09-13 15:11:47 -04:00
b76cb1c3df [s2s] run_eval supports --prefix clarg. (#6953) 2020-09-12 01:08:21 -04:00
563ffb3dc3 Create README.md (#7066) 2020-09-11 15:21:05 -04:00
1ad49cde3a Create README.md (#7067) 2020-09-11 15:20:54 -04:00
4753816e39 added bangla-bert-base model card and also modified other model cards (#7071)
* added bangla-bert-base

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-11 15:17:25 -04:00
0a8c17d53c [T5Tokenizer] remove prefix_tokens (#7078) 2020-09-11 14:18:45 -04:00
4cbd50e611 Compute loss method (#7074) 2020-09-11 12:06:31 -04:00
ae736163d0 Add tests and fix various bugs in ModelOutput (#7073)
* Add tests and fix various bugs in ModelOutput

* Update tests/test_model_output.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-09-11 12:01:33 -04:00
e841b75dec Automate the lists in auto-xxx docs (#7061)
* More readable dict

* More nlp -> datasets

* Revert "More nlp -> datasets"

This reverts commit 3cd1883d226c63c4a686fc1fed35f2cd586ebe45.

* Automate the lists in auto-xxx docs

* More readable dict

* Revert "More nlp -> datasets"

This reverts commit 3cd1883d226c63c4a686fc1fed35f2cd586ebe45.

* Automate the lists in auto-xxx docs

* nlp -> datasets

* Fix new key
2020-09-11 10:42:09 -04:00
0054a48cdd Add dep on datasets (#7058) 2020-09-11 04:43:19 -04:00
221d4c63a3 clean naming (#7068) 2020-09-11 09:57:53 +02:00
8fcbe486e1 these tests require non-multigpu env (#7059)
* these tests require non-multigpu env

* cleanup

* clarify
2020-09-10 18:52:55 -04:00
77950c485a [wip/s2s] DistributedSortishSampler (#7056) 2020-09-10 15:23:44 -04:00
514486739c Fix CI with change of name of nlp (#7054)
* nlp -> datasets

* More nlp -> datasets

* Woopsie

* More nlp -> datasets

* One last
2020-09-10 14:51:08 -04:00
e9a2f772bc [s2s] --eval_max_generate_length (#7018) 2020-09-10 14:11:34 -04:00
df4594a9da [xlm tok] config dict: fix str into int to match definition (#7034) 2020-09-10 19:31:01 +02:00
d6c08b07a0 [AutoTokenizer] Correct error message 2020-09-10 17:19:01 +02:00
db38f7ce29 [BertGeneration, Docs] Fix another old name in docs (#7050)
* correct docs for bert generation

* upload
2020-09-10 17:12:33 +02:00
3bd95b0faf correct docs for bert generation (#7048) 2020-09-10 17:08:40 +02:00
eb2feb5d90 Create README.md 2020-09-10 17:05:50 +02:00
66a5a6fda8 fix to ensure that returned tensors after the tokenization are Long (#7039)
* fix to ensure that returned tensors after the tokenization are Long

* fix to ensure that returned tensors after the tokenization are Long

Co-authored-by: Ashwin Geet Dsa <adsa@grvingt-6.nancy.grid5000.fr>
2020-09-10 11:04:03 -04:00
9ccdb1d517 Update README.md 2020-09-10 17:01:19 +02:00
60698936fc Create README.md 2020-09-10 17:00:10 +02:00
e0c3bc8ee0 Create README.md 2020-09-10 16:51:15 +02:00
c356b9878d Create README.md 2020-09-10 16:45:44 +02:00
5afd3f6196 Create README.md 2020-09-10 16:44:47 +02:00
15a189049e Add TF Funnel Transformer (#7029)
* Add TF Funnel Transformer

* Proper dummy input

* Formatting

* Update src/transformers/modeling_tf_funnel.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comments

* One review comment forgotten

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-10 10:41:56 -04:00
7fd1febf38 Add "Leveraging Pretrained Checkpoints for Generation" Seq2Seq models. (#6594)
* add conversion script

* improve conversion script

* make style

* add tryout files

* fix

* update

* add causal bert

* better names

* add tokenizer file as well

* finish causal_bert

* fix small bugs

* improve generate

* change naming

* renaming

* renaming

* renaming

* remove leftover files

* clean files

* add fix tokenizer

* finalize

* correct slow test

* update docs

* small fixes

* fix link

* adapt check repo

* apply sams and sylvains recommendations

* fix import

* implement Lysandres recommendations

* fix logger warn
2020-09-10 16:40:51 +02:00
d1691d90e5 Small fixes in TF template (#7044) 2020-09-10 10:36:02 -04:00
63e539459d Update README.md 2020-09-10 16:34:28 +02:00
054db06b1b Create README.md 2020-09-10 16:30:46 +02:00
b482ad474a Fix template (#7040) 2020-09-10 08:45:52 -04:00
762cba3bda Albert pretrain datasets/ datacollator (#6168)
* add dataset for albert pretrain

* datacollator for albert pretrain

* naming, comprehension, file reading change

* data cleaning is not needed after this modification

* delete prints

* fix a bug

* file structure change

* add tests for albert datacollator

* remove random seed

* add back len and get item function

* sample file for testing and test code added

* format change for black

* more format change

* Style

* var assignment issue resolve

* add back wrongly deleted DataCollatorWithPadding in init file

* Style

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-09-10 07:56:29 -04:00
49e9be0639 Fix confusing warnings during TF2 import from PyTorch (#6623)
1. Swapped missing_keys and unexpected_keys.

2. Copy&paste error caused these warnings to say "from TF 2.0" when it's actually "from PyTorch".
2020-09-10 05:31:59 -04:00
4ee1053dcf add -y to bypass prompt for transformers-cli upload (#7035) 2020-09-10 04:58:29 -04:00
76818cc4c6 Create README.md 2020-09-09 16:26:35 +02:00
15478c1287 Batch encode plus and overflowing tokens fail when there are no overflowing tokens for a sequence (#6677)
* Patch and test

* Fix tests
2020-09-09 06:55:17 -04:00
9fd11bf1a8 replace torch.triu with onnx compatible code (#6929) 2020-09-09 04:56:40 -04:00
ed71c21d6a [from_pretrained] Allow tokenizer_type ≠ model_type (#6995) 2020-09-09 04:22:59 -04:00
03e363f9ae [generation] consistently add eos tokens (#6982)
Currently beam search returns inconsistent outputs - if hypotheses have different lengths we get eos; if they are the same, we don't.

This PR makes the output consistent.

Also, why not replace:

```
            if sent_lengths[i] < max_length:
                decoded[i, sent_lengths[i]] = eos_token_id
```
with:
```
            decoded[i, sent_lengths[i]] = eos_token_id
```
Shouldn't eos always be there? If the data gets truncated, the caller needs to use a larger `max_length`.

Please correct me if my logic is flawed.
2020-09-09 04:08:36 -04:00
d0963486c1 adding TRANSFORMERS_VERBOSITY env var (#6961)
* introduce TRANSFORMERS_VERBOSITY env var + test + test helpers

* cleanup

* remove helper function
2020-09-09 04:08:01 -04:00
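A short usage sketch of the new variable; it accepts the usual level names (debug, info, warning, error, critical) and must be set before `transformers` is imported:
```
import os

# Equivalent to running: TRANSFORMERS_VERBOSITY=error python my_script.py
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

import transformers  # the level is picked up when the library's logger initializes
```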
f0fc0aea6b pegasus.rst: fix expected output (#7017) 2020-09-08 13:29:16 -04:00
120176ea29 [Longformer] Fix longformer documentation (#7016)
* fix longformer

* allow position ids to not be initialized
2020-09-08 18:51:28 +02:00
5c4eb4b1ac Fixing FLOPS merge by checking if torch is available (#7013)
* Should check if `torch` is available

* fixed samples_count error, distributed_concat arguments

* style

* Import torch at beginning of file

Co-authored-by: TevenLeScao <teven.lescao@gmail.com>
2020-09-08 10:51:58 -04:00
01d340adfa Floating-point operations logging in trainer (#6768)
* neFLOs calculation, logging, and reloading (#1)

* testing distributed consecutive batches

* fixed AttributeError from DataParallel

* removed verbosity

* rotate with use_mtime=True

* removed print

* fixed interaction with gradient accumulation

* indent formatting

* distributed neflo counting

* fixed typo

* fixed typo

* mean distributed losses

* exporting log history

* moved a few functions

* floating_point_ops clarification for transformers with parameter-reuse

* code quality

* double import

* made flo estimation more task-agnostic

* only logging flos if computed

* code quality

* unused import

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Sylvain review

* Update src/transformers/modeling_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* black

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-08 10:00:56 -04:00
d155b38d6e Funnel transformer (#6908)
* Initial model

* Fix upsampling

* Add special cls token id and test

* Formatting

* Test and fist FunnelTokenizerFast

* Common tests

* Fix the check_repo script and document Funnel

* Doc fixes

* Add all models

* Write doc

* Fix test

* Initial model

* Fix upsampling

* Add special cls token id and test

* Formatting

* Test and fist FunnelTokenizerFast

* Common tests

* Fix the check_repo script and document Funnel

* Doc fixes

* Add all models

* Write doc

* Fix test

* Fix copyright

* Forgot some layers can be repeated

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/modeling_funnel.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comments

* Update src/transformers/modeling_funnel.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Address review comments

* Update src/transformers/modeling_funnel.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Slow integration test

* Make small integration test

* Formatting

* Add checkpoint and separate classification head

* Formatting

* Expand list, fix link and add in pretrained models

* Styling

* Add the model in all summaries

* Typo fixes

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-09-08 08:08:08 -04:00
25afb4ea50 fixed trainer tr_loss memory leak (#6999)
* fixed trainer tr_loss memory leak

* detached returned training loss from computation graph in the Trainer class' training_step() method

* Revert "fixed trainer tr_loss memory leak"

This reverts commit 47226e4e
2020-09-08 08:07:33 -04:00
1b76936d1a Fix typo (#6994) 2020-09-08 04:22:57 -04:00
8235426ee8 New Community NB "Fine tune GPT-2 with Trainer class" (#7005) 2020-09-08 03:42:20 -04:00
c18f5916a0 typo (#7001)
apologies for the tiny PRs, just sending those as I find them.
2020-09-08 01:22:20 -04:00
60fc03290b README for HooshvareLab/bert-fa-base-uncased (#6990)
ParsBERT v2.0 is a fine-tuned and vocab-reconstructed version of ParsBERT, and it can be used in other domains!

It includes these features:
- We added some unused vocab entries for use in summarization and other domains.
- We fine-tuned the model on a wide range of Persian writing styles.
2020-09-07 16:43:50 -04:00
90ec78b514 Add missing arguments for BertWordPieceTokenizer (#5810) 2020-09-07 08:35:41 -04:00
77cd0e13d2 Conversion scripts shouldn't have relative imports (#6991) 2020-09-07 08:31:06 -04:00
1650130b0f Remove misleading docstring 2020-09-07 14:16:59 +02:00
159ef07e4c match CI's version of flake8 (#6941)
my flake8 wasn't up-to-date enough, so `make quality` wasn't reporting the same things CI did - this PR adds the actual required version.

Thinking more about some of these minimal versions - CI will always install afresh and thus will always run the latest version. Is there a way to tell pip to always install the latest versions of certain dependencies on `pip install -e ".[dev]"`, rather than hardcoding the minimals, which quickly become outdated?
2020-09-07 08:12:25 -04:00
e9d0d4c75c Create README.md (#6974) 2020-09-07 07:31:22 -04:00
848fbe1e35 [gen utils] missing else case (#6980)
* [gen utils] missing else case

1. `else` is missing - I hit that case while porting a model. Probably needs an assert there?
2. Also, the comment on top seems to be outdated (only vocab_size is being set there)

* typo
2020-09-07 07:28:06 -04:00
f7e80721eb Fixed the default number of attention heads in Reformer Configuration (#6973) 2020-09-07 12:12:22 +02:00
e20d8895bd Create README.md model card (#6964)
* Create README.md

* Add some custom prompts

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-07 06:01:40 -04:00
b4a9c95f1b [testing] add dependency: parametrize (#6958)
unittest doesn't support pytest's super-handy `@pytest.mark.parametrize`. I researched, and there are many proposed workarounds, most tedious at best. If we include https://pypi.org/project/parameterized/ in the dev dependencies, it will provide a very easy way to write parameterized tests - same as pytest's fixture, plus quite a few other styles.

Example:
```
import math

from parameterized import parameterized

@parameterized([
    (2, 2, 4),
    (2, 3, 8),
    (1, 9, 1),
    (0, 9, 0),
])
def test_pow(base, exponent, expected):
    assert math.pow(base, exponent) == expected
```
(extra `self` var if inside a test class)

As a reminder, the pytest style is slightly different:
```
import pytest

@pytest.mark.parametrize("test_input,expected", [("3+5", 8), ("2+4", 6), ("6*9", 42)])
def test_eval(test_input, expected):
    assert eval(test_input) == expected
```
More examples here: https://pypi.org/project/parameterized

May I suggest that it will make it much easier to write some types of tests?
2020-09-07 05:50:18 -04:00
acfaad74ab [docstring] missing arg (#6933)
* [docstring] missing arg

add the missing `tie_word_embeddings` entry

* cleanup

* Update src/transformers/configuration_reformer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-07 05:36:16 -04:00
c3317e1f80 typo (#6959)
there is no var `decoder_input_ids`, but there is `input_ids` for decoder :)
2020-09-07 05:16:24 -04:00
10c6f94adc [model_card] register jplu/tf-xlm-r-ner-40-lang as multilingual 2020-09-07 05:03:40 -04:00
9ef9c39728 Cannot index None (#6984) 2020-09-07 04:56:08 -04:00
08de989a0a Trainer with grad accum (#6930)
* Add warning for gradient accumulation

* Formatting
2020-09-07 04:54:00 -04:00
d4aa7284c8 [model_card] jplu/tf-xlm-r-ner-40-lang: Fix link
cc @jplu
2020-09-07 04:33:15 -04:00
995a958dd1 feat: allow prefix for any generative model (#5885)
* feat: allow padding_text for any generative model

* docs(pipelines.py): correct typo

* Update src/transformers/pipelines.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* feat: rename padding_text to prefix

* fix: cannot tokenize empty text

* fix: pass prefix arg to pipeline

* test: add prefix to text-generetation pipeline

* style: fix style

* style: clean code and variable name more explicit

* set arg docstring to optional

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-07 03:03:45 -04:00
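A hedged usage sketch of the renamed argument, assuming `prefix` is accepted at call time as the PR's tests suggest:
```
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
# `prefix` (formerly the XLNet-only padding_text) is prepended before generation.
print(generator("Hello,", prefix="The following is a short story. ", max_length=30))
```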
ce37be9d94 [s2s] warn if --fp16 for torch 1.6 (#6977) 2020-09-06 20:41:29 -04:00
f72fe1f31a Correct wrong spacing in README 2020-09-06 13:26:56 +02:00
d31031f603 create model card for astroGPT (#6960)
* create model card for astroGPT

* Hotlink to actual image file

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-05 12:50:19 -04:00
56742e9f61 Create Readme.MD for KanBERTo (#6942)
* Create Readme.MD for KanBERTo

KanBERTo language model readme for Kannada language.

* Update model_cards/Naveen-k/KanBERTo/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-04 18:24:32 -04:00
48ff6d5109 [doc] remove the implied defaults to :obj:`None`, s/True/:obj:`True`/, etc. (#6956)
* remove the implied defaults to :obj:`None`

* fix bug in the original

* replace to :obj:`True`, :obj:`False`
2020-09-04 18:22:25 -04:00
eff274d629 typo (#6952) 2020-09-04 16:14:37 -04:00
a4fc0c80b1 [s2s] run_eval.py parses generate_kwargs (#6948) 2020-09-04 14:19:31 -04:00
6078b12098 [s2s] distill: --normalize_hidden --supervise_forward (#6834) 2020-09-04 14:05:56 -04:00
c5d43a872f [docstring] misc arg doc corrections (#6932)
* correct bool types

fix docstring s/int/bool/

* fix description

* fix num_labels to match reality
2020-09-04 10:09:42 -04:00
e3990d137a fix (#6946) 2020-09-04 16:08:54 +02:00
a75e319819 Fix mixed precision issue in TF DistilBert (#6915)
* Remove hard-coded uses of float32 to fix mixed precision use in TF Distilbert

* fix style

* fix gelu dtype issue in TF Distilbert

* fix numeric overflow while using half precision
2020-09-04 14:29:57 +02:00
e95d262f25 [s2s] support early stopping based on loss, rather than rouge (#6927) 2020-09-03 17:31:35 -04:00
207ed8cb78 [s2s] use --eval_beams command line arg (#6926) 2020-09-03 12:42:09 -04:00
0f360d3d1c move wandb/comet logger init to train() to allow parallel logging (#6850)
* move wandb/comet logger init to train() to allow parallel logging

* Setup wandb/comet loggers on first call to log()
2020-09-03 11:49:14 -04:00
39ed68d597 [s2s] allow task_specific_params=summarization_xsum (#6923) 2020-09-03 11:11:40 -04:00
5a318f075a [s2s]: script to convert pl checkpoints to hf checkpoints (#6911)
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-03 09:47:00 -04:00
b8e4906c97 tweak tar command in readme (#6919) 2020-09-03 09:29:01 -04:00
a66db7d828 Corrected link to paper (#6905) 2020-09-03 09:23:42 -04:00
55d61ce8d6 Added a link to the thesis. (#6906) 2020-09-03 09:20:03 -04:00
653a79ccad Loodos model cards had errors in the "Usage" section; fixed. Also, the "electra-base-turkish-uncased" model was removed from S3 and re-uploaded as "electra-base-turkish-uncased-discriminator", and its README was added. (#6921)
Co-authored-by: Abdullah Oluk <abdullaholuk123@gmail.com>
2020-09-03 09:13:43 -04:00
5a3aec90a9 [model_card] link to correctly cased piaf dataset
cc @psorianom @rachelker
2020-09-03 08:57:32 -04:00
722b5807d8 Template updates (#6914) 2020-09-03 04:14:58 -04:00
ea2c6f1afc Adding the LXMERT pretraining model (MultiModal languageXvision) to HuggingFace's suite of models (#5793)
* added template files for LXMERT and competed the configuration_lxmert.py

* added modeling, tokization, testing, and finishing touched for lxmert [yet to be tested]

* added model card for lxmert

* cleaning up lxmert code

* Update src/transformers/modeling_lxmert.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_lxmert.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_lxmert.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_lxmert.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* tested torch lxmert, changed documentation, updated outputs, and other small fixes

* Update src/transformers/convert_pytorch_checkpoint_to_tf2.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/convert_pytorch_checkpoint_to_tf2.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/convert_pytorch_checkpoint_to_tf2.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* renaming, other small issues, did not change TF code in this commit

* added lxmert question answering model in pytorch

* added capability to edit number of qa labels for lxmert

* made answer optional for lxmert question answering

* add option to return hidden_states for lxmert

* changed default qa labels for lxmert

* changed config archive path

* squashing 3 commits: merged UI + testing improvements + more UI and testing

* changed some variable names for lxmert

* TF LXMERT

* Various fixes to LXMERT

* Final touches to LXMERT

* AutoTokenizer order

* Add LXMERT to index.rst and README.md

* Merge commit test fixes + Style update

* TensorFlow 2.3.0 sequential model changes variable names

Remove inherited test

* Update src/transformers/modeling_tf_pytorch_utils.py

* Update docs/source/model_doc/lxmert.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/model_doc/lxmert.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_lxmert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* added suggestions

* Fixes

* Final fixes for TF model

* Fix docs

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-09-03 04:02:25 -04:00
4ebb52afdb test_tf_common: remove unused mixin class parameters (#6866) 2020-09-02 10:54:40 -04:00
e71f32c0ef [testing] fix ambiguous test (#6898)
Since `generate()` does:
```
        num_beams = num_beams if num_beams is not None else self.config.num_beams
```
This test fails if `model.config.num_beams > 1` (which is the case in the model I'm porting).

This fix makes the test setup unambiguous by passing an explicit `num_beams=1` to `generate()`.

Thanks.
2020-09-02 16:18:17 +02:00
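A minimal sketch of the disambiguation; the tiny test checkpoint used here is an assumption for illustration:
```
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
model = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")
model.config.num_beams = 4  # a config like this silently turned a sampling test into beam search

inputs = tok("Hello", return_tensors="pt")
# An explicit num_beams=1 makes the test setup unambiguous, as in the fix above.
out = model.generate(**inputs, num_beams=1, do_sample=True, max_length=12)
print(tok.decode(out[0]))
```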
8f2723caf0 Output attention takes an s (#6903)
* Fix output_attention -> output_attentions

* Formatting

* One unsaved file
2020-09-02 08:11:45 -04:00
485da7222f fix error class instantiation (#6634) 2020-09-02 07:36:32 -04:00
4230d30f77 [pipelines] Text2TextGenerationPipeline (#6744)
* add Text2TextGenerationPipeline

* remove max length warning

* remove comments

* remove input_length

* fix typo

* add tests

* use TFAutoModelForSeq2SeqLM

* doc

* typo

* add the doc below TextGenerationPipeline

* doc nit

* style

* delete comment
2020-09-02 07:34:35 -04:00
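A usage sketch for the new pipeline (the sample output is illustrative):
```
from transformers import pipeline

t2t = pipeline("text2text-generation", model="t5-small")
print(t2t("translate English to German: How old are you?"))
# e.g. [{'generated_text': 'Wie alt sind Sie?'}]
```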
6b24281229 fix typo in comments (#6838) 2020-09-02 06:55:37 -04:00
7351ef83c1 [doc] typos (#6867)
* [doc] typos

fixed typos

* Update README.md
2020-09-02 06:51:51 -04:00
ee1bff06f8 minor docs grammar fixes (#6889) 2020-09-02 06:45:19 -04:00
8abd7f69fc fix warning for position ids (#6884) 2020-09-02 06:44:51 -04:00
7cb0572c64 Update modeling_bert.py (#6897)
outptus -> outputs in example of BertForPreTraining
2020-09-02 06:39:01 -04:00
e3c55ceb8d Model card for huBERT (#6893)
* Create README.md

Model card for huBERT.

* Update README.md

lowercase h

* Update model_cards/SZTAKI-HLT/hubert-base-cc/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-02 04:50:10 -04:00
1889e96c8c fix QA example for PT (#6890) 2020-09-02 09:53:09 +02:00
d822ab636b [model_cards] Fix file path for flexudy/t5-base-multi-sentence-doctor 2020-09-02 00:02:40 +02:00
ad5fb33c9a Create README.md (#6598) 2020-09-01 17:59:15 -04:00
f9dadcd85b Create README.md (#6602) 2020-09-01 17:58:43 -04:00
f5d69c75f7 Update multilingual passage rereanking model card (#6788)
Fix range of possible scores, add inference.
2020-09-01 17:56:19 -04:00
5d820f3ca6 Model card for primer/BART-Squad2 (#6801) 2020-09-01 17:52:32 -04:00
8b884dadc6 added model card for flexudys t5 model (#6759)
Co-authored-by: zolekode <pascal.zoleko@fau.de>
2020-09-01 17:38:55 -04:00
bff6d517cd loodos turkish model cards added (#6840) 2020-09-01 17:35:24 -04:00
502d194b95 Create README.md (#6887)
Add language meta attribute
2020-09-01 17:09:10 -04:00
d082edf216 Create README.md (#6888)
Add language meta attribute
2020-09-01 17:09:02 -04:00
dacbee9a50 Create README.md (#6886)
* Create README.md

model card for akhooli/xlm-r-large-arabic-sent

* Update model_cards/akhooli/xlm-r-large-arabic-sent/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-09-01 17:06:15 -04:00
e2971e61bd Create README.md (#6885) 2020-09-01 16:57:48 -04:00
4d1a3ffde8 [EncoderDecoder] Add xlm-roberta to encoder decoder (#6878)
* finish xlm-roberta

* finish docs

* expose XLMRobertaForCausalLM
2020-09-01 21:56:39 +02:00
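A minimal sketch of what this enables: warm-starting an encoder-decoder from two XLM-R checkpoints (fine-tuning is still required):
```
from transformers import EncoderDecoderModel

# The decoder half is loaded as the newly exposed XLMRobertaForCausalLM.
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "xlm-roberta-base", "xlm-roberta-base"
)
```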
311992630c Create README.md (#6883)
* Create README.md

* Update README.md
2020-09-01 19:24:45 +02:00
21d719238c Add cache_dir to save features TextDataset (#6879)
* Add cache_dir to save features TextDataset

This is for the case where the dataset is on a read-only (RO) filesystem, which is the case
in tests (GKE TPU tests).

* style
2020-09-01 11:42:17 -04:00
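A sketch of the new argument; `train.txt` and the cache path are placeholders:
```
from transformers import AutoTokenizer, TextDataset

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
# Features are cached under cache_dir instead of next to file_path, which is
# what makes read-only dataset filesystems workable.
dataset = TextDataset(tokenizer=tok, file_path="train.txt", block_size=128, cache_dir="/tmp/hf_cache")
```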
1461aac8d7 Update docs stable version 2020-09-01 11:02:24 -04:00
3726754a6c v3.1.0 documentation 2020-09-01 14:39:07 +02:00
4b3ee9cbc5 Release: v3.1.0 2020-09-01 14:27:52 +02:00
afc4ece462 [Generate] Facilitate PyTorch generate using ModelOutputs (#6735)
* fix generate for GPT2 Double Head

* fix gpt2 double head model

* fix  bart / t5

* also add for no beam search

* fix no beam search

* fix encoder decoder

* simplify t5

* simplify t5

* fix t5 tests

* fix BART

* fix transfo-xl

* fix conflict

* integrating sylvains and sams comments

* fix tf past_decoder_key_values

* fix enc dec test
2020-09-01 12:38:25 +02:00
397f819615 Restore PaddingStrategy.MAX_LENGTH on QAPipeline while no v2. (#6875)
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-09-01 05:35:35 -04:00
a32d85f0d4 delete reinit (#6862) 2020-09-01 03:43:27 -04:00
d5f1ffa0d8 Logging doc (#6852)
* Add logging doc

* Formatting

* Update docs/source/main_classes/logging.rst

* Update src/transformers/utils/logging.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-09-01 03:16:34 -04:00
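A minimal sketch of the logging API the new doc covers:
```
from transformers.utils import logging

logging.set_verbosity_info()  # or set_verbosity_warning / set_verbosity_error
logger = logging.get_logger(__name__)
logger.info("visible at INFO verbosity")
```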
59a6a32a61 add a final report to all pytest jobs (#6861)
we had it added for one job; please add it to all pytest jobs - we need the output of which tests were run to debug the codecov issue. thank you!
2020-08-31 22:47:23 -04:00
431ab19d7a [fix] typo in available in helper function (#6859) 2020-08-31 17:59:34 -04:00
367235ee52 Bart can make decoder_input_ids from labels (#6758) 2020-08-31 16:16:47 -04:00
b9772897ec [s2s] command line args for faster val steps (#6833) 2020-08-31 16:16:10 -04:00
8af1970e45 Fix marian slow test (#6854) 2020-08-31 16:10:43 -04:00
bbdba0a76d Update ONNX notebook to include section on quantization. (#6831)
* Update ONNX notebook to include section on quantization.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing ONNX team comments
2020-08-31 21:28:00 +02:00
a59bcefbb1 Split hp search methods (#6857)
* Split the run_hp_search by backend

* Unused import
2020-08-31 15:16:39 -04:00
23f9611c16 Add checkpointing to Ray Tune HPO (#6747)
* Introduce HPO checkpointing for PBT

* Moved checkpoint saving

* Fixed checkpoint subdir pass

* Fixed style

* Enable/disable checkpointing, check conditions for various tune schedulers incl. PBT

* Adjust number of GPUs to number of jobs

* Avoid mode pickling in ray

* Move hp search to integrations
2020-08-31 14:38:46 -04:00
61b7ba93f5 Marian distill scripts + integration test (#6799) 2020-08-31 13:48:26 -04:00
02d09c8fcc Only access loss tensor every logging_steps (#6802)
* Only access loss tensor every logging_steps

* tensor.item() was being called every step. This must not be done
for XLA:TPU tensors, as it's terrible for performance, causing TPU<>CPU
communication at each step. On RoBERTa MLM, for example, it reduces step
time by 30%; the saving should be larger for models/tasks with smaller step times.
* Train batch size was not correct in case a user used the
`per_gpu_train_batch_size` flag
* Avg-reduce loss across eval shards

* Fix style (#6803)

* t5 model should make decoder_attention_mask (#6800)

* [s2s] Test hub configs in self-scheduled CI (#6809)

* [s2s] round runtime in run_eval (#6798)

* Pegasus finetune script: add --adafactor (#6811)

* [bart] rename self-attention -> attention (#6708)

* [tests] fix typos in inputs (#6818)

* Fixed open in colab link (#6825)

* Add model card for singbert lite. Update widget for singbert and singbert-large. (#6827)

* BR_BERTo model card (#6793)

* clearly indicate shuffle=False (#6312)

* Clarify shuffle

* clarify shuffle

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>

* [s2s README] Add more dataset download instructions (#6737)

* Style

* Patch logging issue

* Set default logging level to `WARNING` instead of `INFO`

* TF Flaubert w/ pre-norm (#6841)

* Dataset and DataCollator for BERT Next Sentence Prediction (NSP) task (#6644)

* add datacollator and dataset for next sentence prediction task

* bug fix (numbers of special tokens & truncate sequences)

* bug fix (+ dict inputs support for data collator)

* add padding for nsp data collator; renamed cached files to avoid conflict.

* add test for nsp data collator

* Style

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>

* Fix in Adafactor docstrings (#6845)

* Fix resuming training for Windows (#6847)

* Only access loss tensor every logging_steps

* tensor.item() was being called every step. This must not be done
for XLA:TPU tensors, as it's terrible for performance, causing TPU<>CPU
communication at each step. On RoBERTa MLM, for example, it reduces step
time by 30%; the saving should be larger for models/tasks with smaller step times.
* Train batch size was not correct in case a user used the
`per_gpu_train_batch_size` flag
* Avg-reduce loss across eval shards

* comments

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Thomas Ashish Cherian <6967017+PandaWhoCodes@users.noreply.github.com>
Co-authored-by: Zane Lim <zyuanlim@gmail.com>
Co-authored-by: Rodolfo De Nadai <rdenadai@gmail.com>
Co-authored-by: xujiaze13 <37360975+xujiaze13@users.noreply.github.com>
Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Huang Lianzhe <hlz@pku.edu.cn>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-31 11:35:51 -04:00
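A minimal sketch of the pattern described above, not the Trainer's exact code (`compute_loss` is a hypothetical stand-in): keep the running loss as a tensor and call `.item()` only every `logging_steps`, since `.item()` forces a device-to-host transfer on XLA/TPU:
```
import torch

def compute_loss(step):  # hypothetical stand-in for one training step's loss
    return torch.rand(())

logging_steps = 100
tr_loss = torch.zeros(())
for step in range(1, 1001):
    tr_loss = tr_loss + compute_loss(step).detach()  # stays on device; no host sync
    if step % logging_steps == 0:
        print(f"step {step}: avg loss {tr_loss.item() / logging_steps:.4f}")  # one sync here
        tr_loss = torch.zeros(())
```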
c48546c7f7 Fix resuming training for Windows (#6847) 2020-08-31 11:02:30 -04:00
d2f9cb838e Fix in Adafactor docstrings (#6845) 2020-08-31 10:52:47 -04:00
2de7ee0385 Dataset and DataCollator for BERT Next Sentence Prediction (NSP) task (#6644)
* add datacollator and dataset for next sentence prediction task

* bug fix (numbers of special tokens & truncate sequences)

* bug fix (+ dict inputs support for data collator)

* add padding for nsp data collator; renamed cached files to avoid conflict.

* add test for nsp data collator

* Style

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-31 08:25:00 -04:00
895d394669 TF Flaubert w/ pre-norm (#6841) 2020-08-31 04:53:20 -04:00
4561f05c5f Set default logging level to WARNING instead of INFO 2020-08-31 09:56:25 +02:00
05c3214153 Patch logging issue 2020-08-31 09:37:08 +02:00
dfa10a41ba [s2s README] Add more dataset download instructions (#6737) 2020-08-30 16:29:24 -04:00
32fe44086c clearly indicate shuffle=False (#6312)
* Clarify shuffle

* clarify shuffle

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-30 19:26:10 +08:00
0eecaceac7 BR_BERTo model card (#6793) 2020-08-30 19:02:46 +08:00
d176aaad7f Add model card for singbert lite. Update widget for singbert and singbert-large. (#6827) 2020-08-30 18:21:49 +08:00
a5847619e3 Fixed open in colab link (#6825) 2020-08-30 18:21:00 +08:00
563485bf95 [tests] fix typos in inputs (#6818) 2020-08-30 18:19:57 +08:00
22933e661f [bart] rename self-attention -> attention (#6708) 2020-08-29 18:03:08 -04:00
0f58903bb6 Pegasus finetune script: add --adafactor (#6811) 2020-08-29 17:43:32 -04:00
ac47458a02 [s2s] round runtime in run_eval (#6798) 2020-08-29 17:36:31 -04:00
5ab21b072f [s2s] Test hub configs in self-scheduled CI (#6809) 2020-08-28 17:05:52 -04:00
3cac867fac t5 model should make decoder_attention_mask (#6800) 2020-08-28 15:22:33 -04:00
20f7786453 Fix style (#6803) 2020-08-28 15:02:25 -04:00
9336086ab5 prepare_seq2seq_batch makes labels/ decoder_input_ids made later. (#6654)
* broken test

* batch parity

* tests pass

* boom boom

* boom boom

* split out bart tokenizer tests

* fix tests

* boom boom

* Fixed dataset bug

* Fix marian

* Undo extra

* Get marian working

* Fix t5 tok tests

* Test passing

* Cleanup

* better assert msg

* require torch

* Fix mbart tests

* undo extra decoder_attn_mask change

* Fix import

* pegasus tokenizer can ignore src_lang kwargs

* unused kwarg test cov

* boom boom

* add todo for pegasus issue

* cover one word translation edge case

* Cleanup

* doc
2020-08-28 11:15:17 -04:00
cb276b41de Transformer-XL: Improved tokenization with sacremoses (#6322)
* Improved tokenization with sacremoses

 * The TransfoXLTokenizer is now using sacremoses for tokenization
 * Added tokenization of comma-separated and floating point numbers.
 * Removed prepare_for_tokenization() from tokenization_transfo_xl.py because punctuation is handled by sacremoses
 * Added corresponding tests
 * Removed test comparing TransfoXLTokenizer and TransfoXLTokenizerFast
 * Added deprecation warning to TransfoXLTokenizerFast

* isort change

Co-authored-by: Teven <teven.lescao@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-08-28 09:56:17 -04:00
930153e7d2 Add ProtBert model card (#6764) 2020-08-28 12:12:28 +08:00
743d131d76 [style] set the minimal required version for black (#6784)
`make style` with `black` < 20.8b1 is a no-go (in case some other package forced a lower version) - so make it explicit to avoid confusion
2020-08-28 11:38:09 +08:00
fb78a90d6a PL: --adafactor option (#6776) 2020-08-27 22:19:46 -04:00
92ac2fa7d1 [transformers-cli] fix logger getter (#6777) 2020-08-27 20:01:17 -04:00
42fddacd1c Format 2020-08-27 18:31:51 +02:00
70fccc5cf3 new Makefile target: docs (#6510)
* [doc] multiple corrections to "Summary of the tasks"

* add a new "docs" target to validate docs and document it

* fix mixup
2020-08-27 12:25:16 -04:00
dbfe34f2f5 [test schedulers] adjust to test the first step's reading (#6429)
* [test schedulers] small improvement

* cleanup
2020-08-27 12:23:28 -04:00
e6b811f0a7 [testing] replace hardcoded paths to allow running tests from anywhere (#6523)
* [testing] replace hardcoded paths to allow running tests from anywhere

* fix the merge conflict
2020-08-27 12:22:18 -04:00
9d1b4db2aa add nlp install (#6767) 2020-08-27 11:08:14 -04:00
c225e872ed Fix it to work with BART (#6756) 2020-08-27 09:04:50 -04:00
0d2c111a0c Format 2020-08-27 14:56:47 +02:00
6f289dc97a Fix the TF Trainer gradient accumulation and the TF NER example (#6713)
* Align TF NER example over the PT one

* Fix Dataset call

* Fix gradient accumulation training

* Apply style

* Address Sylvain's comments

* Address Sylvain's comments

* Apply style
2020-08-27 08:45:34 -04:00
41aa2b4ef1 Adafactor docs (#6765) 2020-08-27 05:16:50 -04:00
971d1802d0 Add AdaFactor optimizer from fairseq (#6722)
* AdaFactor optimizer ported from fairseq. Tested for T5 finetuning and MLM -- reduced memory consumption compared to ADAM.

* update PR fixes, add basic test

* bug -- incorrect params in test

* bugfix -- import Adafactor into test

* bugfix -- removed accidental T5 include

* resetting T5 to master

* bugfix -- include Adafactor in __init__

* longer loop for adafactor test

* remove double error class declare

* lint

* black

* isort

* Update src/transformers/optimization.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* single docstring

* Cleanup docstring

Co-authored-by: Nikolai Y <nikolai.yakovenko@point72.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-27 04:58:13 -04:00
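A hedged usage sketch with a placeholder model; the relative-step settings follow the option names of the ported optimizer:
```
import torch
from transformers.optimization import Adafactor

model = torch.nn.Linear(10, 2)  # placeholder model
# With relative_step=True, Adafactor computes its own learning rate, hence lr=None.
optimizer = Adafactor(
    model.parameters(), lr=None, scale_parameter=True, relative_step=True, warmup_init=True
)
loss = model(torch.randn(4, 10)).sum()
loss.backward()
optimizer.step()
```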
4bd7be9a42 s2s distillation uses AutoModelForSeq2SeqLM (#6761) 2020-08-26 23:25:11 -04:00
05e7150a53 create ProtBert-BFD model card. (#6724) 2020-08-27 02:19:19 +02:00
61518e2df3 [s2s] run_eval.py QOL improvements and cleanup (#6746) 2020-08-26 18:59:20 -04:00
434936f34a Model Card for Multilingual Passage Reranking BERT (#6755) 2020-08-26 18:00:27 -04:00
10a34501f1 add __init__.py to utils (#6754) 2020-08-26 23:51:10 +02:00
61b9ed8074 Model card for kuisailab/albert-large-arabic (#6730)
* Create README.md

* Update README.md
2020-08-26 17:27:56 -04:00
8e0d51e4f2 Model card for kuisailab/albert-xlarge-arabic (#6731)
* Create README.md

* Update README.md
2020-08-26 17:27:42 -04:00
70c96a10e9 Model card for kuisailab/albert-base-arabic (#6729)
* Create README.md

* Update README.md
2020-08-26 17:27:34 -04:00
cc4ba79f68 added model card for codeswitch-spaeng-sentiment-analysis-lince (#6727)
* added model card for the codeswitch-spaeng-sentiment-analysis-lince model; also updated other model cards

* fixed typo

* fixed typo

* fixed typo

* fixed typo

* fixed typo

* fixed typo

* fixed typo

* Update README.md
2020-08-26 17:26:32 -04:00
e10fb9cbe6 Create model card for lordtt13/COVID-SciBERT (#6718) 2020-08-26 17:22:25 -04:00
baeba53e88 Adding model cards for 5 models (#6703)
* Added model cards for 4 models

Added model cards for:
- roberta-base-bulgarian
- roberta-base-bulgarian-pos
- roberta-small-bulgarian
- roberta-small-bulgarian-pos

* fixed link text

* Update README.md

* Create README.md

* removed trailing bracket

* Add language metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-26 17:20:55 -04:00
3242e4d942 [model_cards] Fix tiny typos 2020-08-26 23:16:06 +02:00
99407f9d1e add xlm-roberta-large-xnli model card (#6723)
* add xlm-roberta-large-xnli model card

* update pt example

* typo
2020-08-26 16:05:59 -04:00
858b7d5873 [TF Longformer] Improve Speed for TF Longformer (#6447)
* add tf graph compile tests

* fix conflict

* remove more tf transpose statements

* fix conflicts

* fix comment typos

* move function to class function

* fix black

* fix black

* make style
2020-08-26 14:55:41 -04:00
a75c64d80c Black 20 release 2020-08-26 17:20:22 +02:00
e78c110338 isort 5 2020-08-26 17:13:49 +02:00
02e8cd5584 Fix optimizer (#6717) 2020-08-26 11:12:44 -04:00
77abd1e79f Centralize logging (#6434)
* Logging

* Style

* hf_logging > utils.logging

* Address @thomwolf's comments

* Update test

* Update src/transformers/benchmark/benchmark_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Revert bad change

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-26 11:10:36 -04:00
461ae86812 Fix tf boolean mask in graph mode (#6741) 2020-08-26 05:15:35 -04:00
925f34bbbd Add "tie_word_embeddings" config param (#6692)
* add tie_word_embeddings

* correct word embeddings in modeling utils

* make style

* make config param only relevant for torch

* make style

* correct typo

* delete deprecated arg in transo-xl
2020-08-26 04:58:21 -04:00
fa8ee8e855 fix torchscript docs (#6740) 2020-08-26 04:51:56 -04:00
64c7c2bc15 Install nlp for github actions test (#6728) 2020-08-25 14:58:38 -04:00
624495706c T5Tokenizer adds EOS token if not already added (#5866)
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-25 14:56:08 -04:00
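A sketch of the new behavior:
```
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
ids = tok("translate English to German: Hello")["input_ids"]
assert ids[-1] == tok.eos_token_id  # </s> is now appended unless already present
```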
e11d923bfc Fix pegasus-xsum integration test (#6726) 2020-08-25 14:06:28 -04:00
7e6397a7d8 [squad] make examples and dataset accessible from SquadDataset object (#6710)
* [squad] make examples and dataset accessible from SquadDataset object

* [squad] add support for legacy cache files
2020-08-25 13:32:56 -04:00
ac9702c284 Fix ONNX test_quantize unittest (#6716) 2020-08-25 13:24:40 -04:00
074340339a Create README.md (#6721)
add model card for singbert large
2020-08-26 00:11:24 +08:00
d17cce2270 add missing keys (#6719) 2020-08-25 11:38:51 -04:00
a25c9fc8e1 Selected typo fix (#6687) 2020-08-25 15:39:02 +02:00
625318f525 tensor.nonzero() is deprecated in PyTorch 1.6 (#6715)
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-08-25 08:12:54 -04:00
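For context, the deprecation this works around: PyTorch 1.6 warns when `nonzero()` is called without `as_tuple`, so call sites move to the explicit form:
```
import torch

mask = torch.tensor([0, 1, 0, 1])
idx = torch.nonzero(mask, as_tuple=False)  # warning-free replacement for mask.nonzero()
assert torch.equal(idx, torch.tensor([[1], [3]]))
```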
124c3d6adc Add tokenizer to Trainer (#6689) 2020-08-25 07:47:09 -04:00
abc0202194 More tests to Trainer (#6699)
* More tests to Trainer

* Add warning in the doc
2020-08-25 07:07:36 -04:00
f5bad031bc Use generators tqdm progressbars (#6696) 2020-08-25 07:06:58 -04:00
a99d09c6f9 add new line to make examples run (#6706) 2020-08-25 06:26:29 -04:00
4db2fa77d7 Allow tests in examples to use cuda or fp16,if they are available (#5512)
* Allow tests in examples to use cuda or fp16,if they are available

The tests in examples didn't use cuda or fp16 even when they were available.
- The text classification example (`run_glue.py`) didn't use fp16 even if it was available;
  the device was chosen based on availability (cuda/cpu).
- The language-modeling example (`run_language_modeling.py`) had a `--no_cuda` argument
  which made the test run without cuda. fp16 is not enabled for this example because it
  currently fails with it (an assertion error on perplexity due to its higher value).
- cuda and fp16 are not enabled for the question-answering example (`run_squad.py`) as they cause a
  difference in the f1 score.
- The text-generation example (`run_generation.py`) will use cuda or fp16 whenever available.

Resolves some of: #5057

* Unwanted import of is_apex_available was removed

* Made changes to the test examples file to pass --fp16 only if cuda and apex are available
- run_glue.py: Removed the check for cuda and fp16.
- run_generation.py: Removed the check for cuda and fp16 also removed unwanted flag creation.

* Incorrectly sorted imports fixed

* The model needs to be converted to half precision

* Formatted single line if condition statement to multiline

* The torch_device also needed to be checked before running the tests on examples
- The tests in examples which use cuda should also depend on the USE_CUDA flag,
  similar to the rest of the test suite. Even if we decide to set USE_CUDA to
  True by default, setting USE_CUDA to False should result in the examples not using CUDA

* Format some of the code in test_examples file

* The improper import of is_apex_available was sorted

* Formatted the code to keep the style standards

* The comma at the end of list giving a flake8 issue was fixed

* Import sort was fixed

* Removed the clean_test_dir function as it's not used right now
2020-08-25 06:02:07 -04:00
841f071569 Add typing.overload for convert_ids_tokens (#6637)
* add overload for type checker

* black
2020-08-25 04:57:08 -04:00
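A simplified sketch of the `typing.overload` pattern this applies; the real method also takes `skip_special_tokens`, and the toy vocab is purely illustrative:
```
from typing import List, Union, overload

@overload
def convert_ids_to_tokens(ids: int) -> str: ...
@overload
def convert_ids_to_tokens(ids: List[int]) -> List[str]: ...
def convert_ids_to_tokens(ids: Union[int, List[int]]) -> Union[str, List[str]]:
    vocab = {0: "[PAD]", 1: "hello"}  # toy vocab
    if isinstance(ids, int):
        return vocab[ids]
    return [vocab[i] for i in ids]

print(convert_ids_to_tokens(1), convert_ids_to_tokens([0, 1]))
```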
0f16dd0ac2 Add DPR to models summary (#6690)
* add dpr to models summary

* minor

* minor

* Update docs/source/model_summary.rst

qa -> question answering

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/model_summary.rst

qa -> question ansering (cont'd)

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-25 09:57:28 +02:00
4fca874ea9 Remove hard-coded uses of float32 to fix mixed precision use (#6648) 2020-08-25 15:42:32 +08:00
0344428f79 [s2s] round bleu, rouge to 4 digits (#6704) 2020-08-25 00:33:11 -04:00
b6512d2357 Add model card for singbert. (#6674)
* Add model card for singbert.

Adding a model card for singbert- bert for singlish and manglish.

* Update README.md

Add additional tags and model name.

* Update README.md

Fix tag for malay.

* Update model_cards/zanelim/singbert/README.md

Fix language

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>

* Add examples and custom widget input.

Add examples and custom widget input.

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-25 10:09:13 +08:00
d20cbb886b Fix hyperparameter_search doc (#6695) 2020-08-24 21:04:08 -04:00
0ebc9699fa [fixdoc] Add import to pegasus usage doc (#6698) 2020-08-24 15:54:57 -04:00
6b4c617666 Move unused args to kwargs (#6694) 2020-08-24 13:20:03 -04:00
912a21ec78 remove BartForConditionalGeneration.generate (#6659)
As suggested here: https://github.com/huggingface/transformers/issues/6651#issuecomment-678594233
this removes the generic `generate` doc with examples not relevant to BART.
2020-08-25 00:42:34 +08:00
a8d6716ecb Create PULL_REQUEST_TEMPLATE.md (#6660)
* Create PULL_REQUEST_TEMPLATE.md

Proposing to copy this neat feature from pytorch. This is a small template that lets a PR submitter say which issue the PR closes.

* Update .github/PULL_REQUEST_TEMPLATE.md

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-25 00:30:38 +08:00
8f98faf934 Lat fix for Ray HP search (#6691) 2020-08-24 12:15:00 -04:00
3a7fdd3f52 Add hyperparameter search to Trainer (#6576)
* Add optuna hyperparameter search to Trainer

* @julien-c suggestions

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Make compute_objective an arg function

* Formatting

* Rework to make it easier to add ray

* Formatting

* Initial support for Ray

* Formatting

* Polish and finalize

* Add trial id to checkpoint with Ray

* Smaller default

* Use GPU in ray if available

* Formatting

* Fix test

* Update install instruction

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>

* Address review comments

* Formatting post-merge

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
2020-08-24 11:48:45 -04:00
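
A rough usage sketch of the resulting API, assuming the optuna backend is installed; the model name, datasets, and trial count are placeholders, not values from the PR:

    from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

    def model_init():
        # A fresh model must be created for every trial.
        return AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

    trainer = Trainer(
        model_init=model_init,
        args=TrainingArguments(output_dir="hp_search"),
        train_dataset=train_dataset,  # assumed to be defined elsewhere
        eval_dataset=eval_dataset,    # assumed to be defined elsewhere
    )
    best_run = trainer.hyperparameter_search(backend="optuna", n_trials=10, direction="minimize")
    print(best_run.hyperparameters)
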
dd522da004 Fix PL token classification examples (#6682) 2020-08-24 11:30:06 -04:00
a573777901 Update repo to isort v5 (#6686)
* Run new isort

* More changes

* Update CI, CONTRIBUTING and benchmarks
2020-08-24 11:03:01 -04:00
d329c9b05d Fixed DataCollatorForLanguageModeling not accepting lists of lists (#6685)
* Fixed DataCollatorForLanguageModeling + PermutationLanguageModeling not accepting lists of lists

* Update data_collator.py

* black was grumpy
2020-08-24 15:31:44 +02:00
0a850d210e Missing commit 2020-08-24 09:23:06 -04:00
b30879fe0c Don't reset the dataset type + plug for rm unused columns (#6683)
* Don't reset the type of the dataset

* Formatting

* Update trainer.py

Co-authored-by: Teven <teven.lescao@gmail.com>
2020-08-24 09:22:03 -04:00
1a779ad7ec Specify config filename (#6626) 2020-08-24 07:27:58 -04:00
a622705ef3 added multiple model_cards for below models (#6666)
* Create README.md

* Update README.md

* Create README.md

* Update README.md

* added multiple codeswitch models
2020-08-24 05:08:32 -04:00
16e38940bd Add Roberta2Roberta shared 2020-08-23 17:02:22 +02:00
f230a64094 new paper bibtex (#6656) 2020-08-23 10:03:41 -04:00
f235ee2164 Add Roberta2Roberta model card 2020-08-23 10:01:58 +02:00
068df740bd added model_card for model codeswitch-hineng-lid-lince and codeswitch-spaeng-lid-lince (#6663)
* Create README.md

* Update README.md

* Create README.md

* Update README.md
2020-08-22 12:13:21 -04:00
97bb2497ab Correct bug in bert2bert-cnn_dailymail
Model was trained with the wrong tokenizer. Retrained with correct tokenizer - thanks for spotting @lhoestq !
2020-08-22 13:44:20 +02:00
0f94151dc7 Add model card for electricidad-base-generator (#6650)
It works like a charm!
Look at the output of the example code!
2020-08-21 14:18:15 -04:00
cbda72932c [Doc model summary] add MBart model summary (#6649) 2020-08-21 13:42:59 -04:00
9e8c494da7 Add T5-11B disclaimer
@julien-c
2020-08-21 18:11:18 +02:00
a4db4e3032 [Docs model summaries] Add pegasus to docs (#6640)
* add pegasus to docs

* Update docs/source/model_summary.rst
2020-08-21 16:22:10 +02:00
d0e42a7bed CamembertForCausalLM (#6577)
* added CamembertForCausalLM

* add in __init__ and auto model

* style

* doc
2020-08-21 13:52:54 +02:00
bdf7e5de92 Remove accidental comment (#6629) 2020-08-21 05:07:32 -04:00
efc7460553 model card for Spanish electra base (#6633) 2020-08-21 05:04:29 -04:00
b105f2c6b3 Update ONNX doc to match the removal of --optimize argument.
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-08-21 10:37:09 +02:00
e5f452275b Trainer automatically drops unused columns in nlp datasets (#6449)
* Add a classmethod to easily build a Trainer from nlp dataset and metric

* Fix docstrings

* Split train/eval

* Formatting

* Log dropped columns + docs

* Authorize callable activations

* Poc for auto activation

* Be framework-agnostic

* Formatting

* Remove class method

* Remove unnecessary code
2020-08-20 16:29:14 -04:00
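
If memory serves, the behavior is exposed through a TrainingArguments flag; a minimal sketch (the flag name remove_unused_columns is the one I recall, and the default of True is an assumption):

    from transformers import TrainingArguments

    # Dataset columns the model's forward() does not accept (e.g. a raw "text"
    # column) are dropped and logged; set the flag to False to keep them all.
    args = TrainingArguments(output_dir="out", remove_unused_columns=True)
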
5bf4465e6c Regression test for pegasus bugfix (#6606) 2020-08-20 15:34:43 -04:00
86c07e634f One last threshold to raise 2020-08-20 14:23:09 -04:00
e8af90c052 Move threshold up for flaky test with Electra (#6622)
* Move threshold up for flaky test with Electra

* Update above as well
2020-08-20 13:59:40 -04:00
953958372a XLNet Bug when training with apex 16-bit precision (#6567)
* xlnet fp16 bug fix

* comment cast added

* Update modeling_xlnet.py

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-21 01:34:23 +08:00
505f2d749e [Tests] fix attention masks in Tests (#6621)
* fix distilbert

* fix typo
2020-08-20 13:23:47 -04:00
c9454507cf Add tests for Reformer tokenizer (#6485) 2020-08-20 18:58:44 +02:00
f9d280a959 TFTrainer dataset doc & fix evaluation bug (#6618)
* TFTrainer dataset doc & fix evaluation bug

discussed in #6551

* add docstring to test/eval datasets
2020-08-20 12:11:36 -04:00
573bdb0a5d Add tests to Trainer (#6605)
* Add tests to Trainer

* Test if removing long breaks everything

* Remove ugly hack

* Fix distributed test

* Use float for number of epochs
2020-08-20 11:13:50 -04:00
039d8d65fc add intro to nlp lib & dataset links to custom datasets tutorial (#6583)
* add intro to nlp lib + links

* unique links...
2020-08-20 10:32:51 -04:00
b3e54698dd Fix CI 2020-08-20 08:34:02 -04:00
33bf426498 removed redundant arg in prepare_inputs (#6614)
* removed redundant arg in prepare_inputs

* made same change in prediction_loop
2020-08-20 08:23:35 -04:00
cabfdfafc0 Docs copy button misses ... prefixed code (#6518)
Tested in a local build of the docs.

e.g. Just above https://huggingface.co/transformers/task_summary.html#causal-language-modeling

Copy will copy the full code, e.g.

for token in top_5_tokens:
     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))

Instead of currently only:

for token in top_5_tokens:

The docs source in question, with its output:

>>> for token in top_5_tokens:
...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.

Docs for the option fix:
https://sphinx-copybutton.readthedocs.io/en/latest/
2020-08-20 17:35:06 +08:00
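
For reference, a sketch of the conf.py settings sphinx-copybutton documents for this, so both the ">>> " and "... " prompts are stripped and continuation lines get copied (exact values assumed, not taken from the repo):

    # docs/source/conf.py (sketch)
    extensions = ["sphinx_copybutton"]

    # Strip ">>> " and "... " prompts so the copy button grabs whole snippets.
    copybutton_prompt_text = r">>> |\.\.\. "
    copybutton_prompt_is_regexp = True
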
61b5ee11e3 lighter 'make test' (#6512) 2020-08-20 17:24:25 +08:00
3c3c46f563 Typo fix in 04-onnx-export (#6595) 2020-08-20 16:17:16 +08:00
93c5c9a528 [cleanup] remove confusing newline (#6603) 2020-08-20 00:33:36 -04:00
18ca0e9140 Fix #6575 (#6596) 2020-08-19 13:04:33 -04:00
7581884dee [BartTokenizerFast] add prepare_seq2seq_batch (#6543) 2020-08-19 10:37:48 -04:00
8bcceaceff fix model outputs test (#6593) 2020-08-19 16:18:51 +02:00
9a86321b11 tf generation utils: remove unused kwargs (#6591) 2020-08-19 09:37:45 -04:00
2a7402cbd3 Feed forward chunking others (#6365)
* Feed forward chunking for Distilbert & Albert

* Added ff chunking for many other models

* Change model signature

* Added chunking for XLM

* Cleaned up by removing some variables.

* remove test_chunking flag

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-08-19 14:31:10 +02:00
fe0b85e77a [EncoderDecoder] Add functionality to tie encoder decoder weights (#6538)
* start adding tie encoder to decoder functionality

* finish model tying

* make style

* Apply suggestions from code review

* fix t5 list including cross attention

* apply sams suggestions

* Update src/transformers/modeling_encoder_decoder.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add max depth break point

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-19 14:23:45 +02:00
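
A sketch of how the tying is triggered, as I understand the PR; the checkpoint names are placeholders and the kwarg name is an assumption:

    from transformers import EncoderDecoderModel

    # Where encoder and decoder architectures line up, share their weights,
    # roughly halving the parameter count of the resulting seq2seq model.
    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        "bert-base-uncased", "bert-base-uncased", tie_encoder_decoder=True
    )
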
ab42d74850 Fix bart base test (#6587) 2020-08-18 21:28:10 -04:00
1529bf9680 add BartConfig.force_bos_token_to_be_generated (#6526)
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-18 19:15:50 -04:00
974bb4af26 [Model card] Bert2GPT2 EncoderDecoder model (#6569)
* Bert2GPT2 EncoderDecoder model

* Update README.md
2020-08-18 19:28:17 +02:00
6f972e1423 update xnli-mt url (#6580) 2020-08-18 13:10:47 -04:00
fb6844aff5 [Pegasus Doc] minor typo (#6579)
Minor typo correction
@sshleifer
2020-08-18 12:47:47 -04:00
aaab9ab187 Create README.md (#6556) 2020-08-18 12:43:20 -04:00
1dfce0f08a Create README.md (#6557) 2020-08-18 12:42:14 -04:00
7516bcf273 [docs] Fix number of 'ug' occurrences in tokenizer_summary (#6574) 2020-08-18 10:23:25 -04:00
5a5af22ed5 [docs] Fix wrong newline in the middle of a paragraph (#6573) 2020-08-18 10:22:43 -04:00
7659a8eb37 fix incorrect codecov reports (#6553)
As discussed at https://github.com/huggingface/transformers/issues/6317 codecov currently sends an invalid report when it fails to find a code coverage report for the base it checks against, so this gets fixed by:

-  require_base: yes        # don't report if there is no base coverage report

Let's add this for clarity; this is supposedly already the default.

-  require_head: yes        # don't report if there is no head coverage report 

And there is perhaps no point reporting on doc changes, as they don't make any difference and only generate noise:

-  require_changes: true    # only comment if there was change in coverage
2020-08-18 10:21:13 -04:00
cfa26d2b41 github: add @stefan-it to bug-report template for all token-classification related bugs (#6489) 2020-08-18 08:38:54 -04:00
1fdf372f8c Small typo fixes for model card: electra-base-german-uncased (#6555)
* Update README.md

* Update model_cards/german-nlp-group/electra-base-german-uncased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-18 08:21:52 -04:00
5a81195ea9 Fixed label datatype for STS-B (#6492)
* fixed label datatype for sts-b

* naming update

* make style

* make style
2020-08-18 08:09:39 -04:00
12d7624199 [marian] converter supports models from new Tatoeba project (#6342) 2020-08-17 23:55:42 -04:00
fb7330b30e update with #s of sentences/tokens (#6546) 2020-08-17 16:48:05 -04:00
63144701ed Added first model card (#6530)
* Added first model card

* Add metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-17 16:24:10 -04:00
98ee802023 [model_cards] Add model cards for Urduhack model (roberta-urdu-small) (#6536)
* [model_cards] roberta-urdu-small added.

* [model_cards] typo fixed.

* Tweak license format (yaml expects a simple string)

Co-authored-by: Ikram Ali <mrikram1989>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-17 16:04:29 -04:00
3a302904cb [model_cards] Add a new model for Irish (#6544) 2020-08-17 15:56:56 -04:00
07971d8b18 [model_cards] Fix yaml for cedpsam/chatbot_fr 2020-08-17 21:33:32 +02:00
407da12ef1 [T5Tokenizer] add prepare_seq2seq_batch method (#6122)
* tests
2020-08-17 13:57:19 -04:00
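
A usage sketch under the assumption that the method mirrors the other tokenizers' prepare_seq2seq_batch; the returned key names are an assumption (early per-tokenizer versions differed), and the method was later deprecated in favor of calling the tokenizer directly:

    from transformers import T5Tokenizer

    tokenizer = T5Tokenizer.from_pretrained("t5-small")
    batch = tokenizer.prepare_seq2seq_batch(
        src_texts=["translate English to German: How are you?"],
        tgt_texts=["Wie geht es dir?"],
        return_tensors="pt",
    )
    # Assumed keys: input_ids and attention_mask for the source, plus a target field.
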
c9564f5343 [Doc] add more MBart and other doc (#6490)
* add mbart example

* add Pegasus and MBart in readme

* typo

* add MBart in Pretrained models

* add pre-proc doc

* add DPR in readme

* fix indent

* doc fix
2020-08-17 12:30:26 -04:00
f68c873100 replace _ with __ rst links (#6541) 2020-08-17 12:27:02 -04:00
7ca6ab67fc Fix CI 2020-08-17 12:20:40 -04:00
b732e7e111 [doc] multiple corrections to "Summary of the tasks" (#6509)
* [doc] multiple corrections to "Summary of the tasks"

* fix indentation

* correction

* fix links, add links to examples/seq2seq/README.md instead of non-existing script
2020-08-17 11:49:16 -04:00
2a77813d53 [BartTokenizer] add prepare s2s batch (#6212)
Co-authored-by: sgugger <sylvain.gugger@gmail.com>
2020-08-17 11:44:46 -04:00
84d33317ae [doc] make the text more readable, fix some typos, add some disambiguation (#6508)
* [doc] make the text more readable, fix some typos, add some disambiguation

* Update docs/source/glossary.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-17 11:07:58 -04:00
d0c2389f48 add custom datasets tutorial (#6466)
* add custom datasets tutorial

* python -> bash code blocks

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* minor review feedback changes

* add working native QA snippet

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-17 09:15:34 -04:00
d2da2cb232 allow spaces in bash args with "$@" (#6521) 2020-08-17 09:06:35 -04:00
b41cc0b86a Fix flaky ONNX tests (#6531) 2020-08-17 09:04:35 -04:00
39c3b1d9de [sched] polynomial_decay_schedule use default power=1.0 (#6473) 2020-08-17 08:33:12 -04:00
9dbe4094f2 [testing] a new TestCasePlus subclass + get_auto_remove_tmp_dir() (#6494)
* [testing] switch to a new TestCasePlus + get_auto_remove_tmp_dir() for auto-removal of tmp dirs

* respect after=True for tempfile, simplify code

* comments

* comment fix

* put `before` last in args, so can make debug even faster
2020-08-17 08:12:19 -04:00
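
A sketch of the intended test style; the before/after semantics are paraphrased from the PR description above:

    from transformers.testing_utils import TestCasePlus

    class ExampleTest(TestCasePlus):
        def test_writes_output(self):
            # Created fresh for this test and deleted automatically afterwards;
            # per the PR, `after=False` would keep it around for debugging.
            tmp_dir = self.get_auto_remove_tmp_dir()
            with open(f"{tmp_dir}/out.txt", "w") as f:
                f.write("hello")
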
36010cb1e2 fix pegasus doc (#6533) 2020-08-17 12:24:43 +02:00
37709b5909 Remove deprecated assertEquals (#6532)
`assertEquals` is deprecated: https://stackoverflow.com/questions/930995/assertequals-vs-assertequal-in-python/931011
This PR replaces these deprecated methods.
2020-08-17 17:13:58 +08:00
49d8076fa2 [doc] Summary of the models fixes (#6511)
* [doc] Summary of the models fixes

* correction
2020-08-17 16:04:53 +08:00
72911c893a Create model cards for indonesian models (#6522)
* added model cards for indonesian gpt2-small, bert-base and roberta-base models

* removed bibtex entries
2020-08-17 15:42:25 +08:00
48c6c6139f Support additional dictionaries for BERT Japanese tokenizers (#6515)
* Update BERT Japanese tokenizers

* Update CircleCI config to download unidic

* Specify to use the latest dictionary packages
2020-08-17 12:00:23 +08:00
423eb5b1d7 [doc] fix invalid env vars (#6504)
- remove invalid `ENV_` prefix.
- add a few ':' while at it
2020-08-17 11:11:40 +08:00
3c72f5584b Add Model Card for electra-base-german-uncased (#6496)
* Add Model Card for electra-base-german-uncased

* Update README.md

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-17 11:02:32 +08:00
df15c7c226 typos (#6505) 2020-08-17 10:57:36 +08:00
6d38ab1cc3 Update bert-base-portuguese-cased and bert-large-portuguese-cased model cards (#6527)
Co-authored-by: Fabio Souza <fabiosouza@neuralmind.ai>
2020-08-17 10:49:49 +08:00
84c265ffcc [lightning_base] fix s2s logging, only make train_loader once (#6404) 2020-08-16 22:49:41 -04:00
72add6c98f [s2s] docs, document desired filenames nicely (#6525) 2020-08-16 20:31:22 -04:00
2060181126 Fixes paths with spaces in seq2seq example (#6493) 2020-08-16 13:36:38 -04:00
fe61c05b85 Add examples/bert-loses-patience who can help (#6499) 2020-08-16 16:30:16 +08:00
24107c2c83 Fix TPU Convergence bug introduced by PR#6151 (#6488)
With the bug introduced, we're currently taking two optimizer steps per
batch: one global step, where `xm.optimizer_step` injects a cross-replica sum (CRS)
between all cores in training, and one local step without it. This has been affecting
training accuracy (for example, XLNet GLUE on MNLI is not converging, etc.).
2020-08-14 12:47:37 -04:00
895ed8f451 Generation doc (#6470)
* Generation doc

* MBartForConditionalGeneration (#6441)

* add MBartForConditionalGeneration

* style

* rebase and fixes

* add mbart test in TEST_FILES_WITH_NO_COMMON_TESTS

* fix docs

* don't ignore mbart

* doc

* fix mbart fairseq link

* put mbart before bart

* apply doc suggestions

* Use hash to clean the test dirs (#6475)

* Use hash to clean the test dirs

* Use hash to clean the test dirs

* Use hash to clean the test dirs

* fix

* [EncoderDecoder] Add Cross Attention for GPT2 (#6415)

* add cross attention layers for gpt2

* make gpt2 cross attention work

* finish bert2gpt2

* add explicit comments

* remove attention mask since not yet supported

* revert attn mask in pipeline

* Update src/transformers/modeling_gpt2.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_encoder_decoder.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Sort unique_no_split_tokens to make it deterministic (#6461)

* change unique_no_split_tokens's type to set

* use sorted list instead of set

* style

* Import accuracy_score (#6480)

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address comments

* Styling

* Generation doc

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address comments

* Styling

Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
Co-authored-by: gijswijnholds <gijswijnholds@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-08-14 09:46:39 -04:00
b5ba758ba9 Import accuracy_score (#6480) 2020-08-14 08:16:16 -04:00
9a8c168f56 Sort unique_no_split_tokens to make it deterministic (#6461)
* change unique_no_split_tokens's type to set

* use sorted list instead of set

* style
2020-08-14 10:36:58 +02:00
1d6e71e116 [EncoderDecoder] Add Cross Attention for GPT2 (#6415)
* add cross attention layers for gpt2

* make gpt2 cross attention work

* finish bert2gpt2

* add explicit comments

* remove attention mask since not yet supported

* revert attn mask in pipeline

* Update src/transformers/modeling_gpt2.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_encoder_decoder.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-14 09:43:29 +02:00
eb613b566a Use hash to clean the test dirs (#6475)
* Use hash to clean the test dirs

* Use hash to clean the test dirs

* Use hash to clean the test dirs

* fix
2020-08-14 15:34:39 +08:00
680f1337c3 MBartForConditionalGeneration (#6441)
* add MBartForConditionalGeneration

* style

* rebase and fixes

* add mbart test in TEST_FILES_WITH_NO_COMMON_TESTS

* fix docs

* don't ignore mbart

* doc

* fix mbart fairseq link

* put mbart before bart

* apply doc suggestions
2020-08-14 03:21:16 -04:00
05810cd80a Fix typo (#6469) 2020-08-13 15:01:08 -04:00
7bc00569df Clean directory after script testing (#6453)
* Clean Dir after testing

* remove pabee ignore
2020-08-14 00:34:03 +08:00
e92efcf728 Mult rouge by 100: standard units (#6359) 2020-08-13 12:15:54 -04:00
eda07efaa5 Add POS tagging and Phrase chunking token classification examples (#6457)
* Add more token classification examples

* POS tagging example

* Phrase chunking example

* PR review fixes

* Add conllu to third party list (used in token classification examples)
2020-08-13 12:09:51 -04:00
f51161e230 add BartTokenizerFast in AutoTokenizer (#6464)
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-08-13 12:08:11 -04:00
a442f87adc add LongformerTokenizerFast in AutoTokenizer (#6463) 2020-08-13 12:06:43 -04:00
f7cbc13db7 Test model outputs equivalence (#6445)
* Test model outputs equivalence

* Fix failing tests

* From dict to kwargs

* DistilBERT

* Addressing @sgugger and @patrickvonplaten's comments
2020-08-13 11:59:35 -04:00
54c687e97c typo fix (#6462) 2020-08-13 09:36:48 -04:00
9d94aecd51 Fix docs and bad word tokens generation_utils.py (#6387)
* fix

* fix2

* fix3
2020-08-13 13:12:16 +02:00
0ed7c00ba6 Update README.md (#6435)
* Update README.md

* Update README.md

* Update README.md
2020-08-13 11:01:17 +02:00
e983da0e7d cleanup tf unittests: part 2 (#6260)
* cleanup torch unittests: part 2

* remove trailing comma added by isort, and which breaks flake

* one more comma

* revert odd balls

* part 3: odd cases

* more ["key"] -> .key refactoring

* .numpy() is not needed

* more unncessary .numpy() removed

* more simplification
2020-08-13 04:29:06 -04:00
bc820476a5 add targets arg to fill-mask pipeline (#6239)
* add targets arg to fill-mask pipeline

* add tests and more error handling

* quality

* update docstring
2020-08-12 12:48:29 -04:00
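
A usage sketch of the new argument; the model and the candidate targets are placeholders:

    from transformers import pipeline

    fill_mask = pipeline("fill-mask", model="bert-base-uncased")
    # Scores are computed only over the supplied candidates instead of the full vocabulary.
    print(fill_mask("Paris is the [MASK] of France.", targets=["capital", "center"]))
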
0735def8e1 [EncoderDecoder] Add encoder-decoder for roberta/ vanilla longformer (#6411)
* add encoder-decoder for roberta

* fix headmask

* apply Sylvains suggestions

* fix typo

* Apply suggestions from code review
2020-08-12 18:23:30 +02:00
fd3de2000f Get GKE logs via kubectl logs instead of gcloud logging read. (#6446) 2020-08-12 11:46:24 -04:00
f94a52cd79 [s2s] add BartTranslationDistiller for distilling mBART (#6363) 2020-08-12 11:41:04 -04:00
d2370e1bd8 Adding PaddingDataCollator (#6442)
* Data collator with padding

* Add type annotation

* Support tensors as well

* Add comment

* Fix for labels wrong shape

* Data collator with padding

* Add type annotation

* Support tensors as well

* Add comment

* Fix for labels wrong shape

* Remove changes rendered unnecessary
2020-08-12 11:32:27 -04:00
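
The class that landed is, to my understanding, DataCollatorWithPadding; a self-contained sketch of plugging it into a DataLoader:

    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer, DataCollatorWithPadding

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    collator = DataCollatorWithPadding(tokenizer)  # pads each batch to its longest member

    texts = ["short", "a somewhat longer example sentence"]
    features = [tokenizer(t) for t in texts]  # un-padded, variable-length encodings

    loader = DataLoader(features, batch_size=2, collate_fn=collator)
    batch = next(iter(loader))  # input_ids now padded to a common length
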
96c3329f19 Fix #6428 (#6437) 2020-08-12 08:47:30 -04:00
a8db954cda Activate check on the CI (#6427)
* Activate check on the CI

* Fix repo inconsistencies

* Don't document too much
2020-08-12 08:42:14 -04:00
34fabe1697 Move prediction_loss_only to TrainingArguments (#6426) 2020-08-12 08:03:45 -04:00
e9c3031463 Fixes to make life easier with the nlp library (#6423)
* allow using tokenizer.pad as a collate_fn in pytorch

* allow using tokenizer.pad as a collate_fn in pytorch

* Add documentation and tests

* Make attention mask the right shape

* Better test

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-08-12 08:00:56 -04:00
87b359439f [test] replace capsys with the more refined CaptureStderr/CaptureStdout (#6422)
* replace capsys with the more refined CaptureStderr/CaptureStdout

* Update examples/seq2seq/test_seq2seq_examples.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-12 07:54:28 -04:00
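
A sketch of the capture helpers in use; the .out/.err attribute names are as I recall them, not verified against the PR:

    import sys

    from transformers.testing_utils import CaptureStderr, CaptureStdout

    with CaptureStdout() as cs:
        print("logged to stdout")
    assert "logged to stdout" in cs.out

    with CaptureStderr() as ce:
        print("warning", file=sys.stderr)
    assert "warning" in ce.err
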
ac5bcf236e Fix FFN dropout in TFAlbertLayer, and split dropout in TFAlbertAttent… (#4323)
* Fix FFN dropout in TFAlbertLayer, and split dropout in TFAlbertAttention into two separate dropout layers.

* Same dropout fixes for PyTorch.
2020-08-12 07:52:42 -04:00
4ffea5ce2f Disabled pabee test (#6431) 2020-08-12 02:52:50 -04:00
155288f04b [model_card] rohanrajpal/bert-base-codemixed-uncased-sentiment (#6324)
* Create README.md

* Update model_cards/rohanrajpal/bert-base-codemixed-uncased-sentiment/README.md

* Update model_cards/rohanrajpal/bert-base-codemixed-uncased-sentiment/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-11 18:38:18 -04:00
4e6245fc7e Create model card T5-base fine-tuned on event2Mind for Intent Prediction (#6412) 2020-08-11 18:35:27 -04:00
46e3a0a6ec Create README.md (#6381) 2020-08-11 18:34:11 -04:00
31dfde7429 Create README.md (#6378) 2020-08-11 18:32:37 -04:00
25e29150a2 Add metadata to be indexed properly (#6380) 2020-08-11 18:32:29 -04:00
471be5f279 Change metadata to be indexed correctly (#6379) 2020-08-11 18:32:18 -04:00
42ee0bc63d Create README.md (#6346)
* Create README.md

* add results on SAIL dataset

* Update model_cards/rohanrajpal/bert-base-multilingual-codemixed-cased-sentiment/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-11 18:31:34 -04:00
3f071c4b6e [examples] add pytest dependency (#6425) 2020-08-11 17:58:09 -04:00
ece0903e11 lr_schedulers: add get_polynomial_decay_schedule_with_warmup (#6361)
* [wip] add get_polynomial_decay_schedule_with_warmup

* style

* add assert

* change lr_end to a much smaller default number

* check for exact equality

* [model_cards] electra-base-turkish-cased-ner (#6350)

* for electra-base-turkish-cased-ner

* Add metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Temporarily de-activate TPU CI

* Update modeling_tf_utils.py (#6372)

fix typo: ckeckpoint->checkpoint

* the test now works again (#6371)

* correct pl link in readme (#6364)

* refactor almost identical tests (#6339)

* refactor almost identical tests

* important to add a clear assert error message

* make the assert error even more descriptive than the original bt

* Small docfile fixes (#6328)

* Patch models (#6326)

* TFAlbertFor{TokenClassification, MultipleChoice}

* Patch models

* BERT and TF BERT info



* Update check_repo

* Ci GitHub caching (#6382)

* Cache Github Actions CI

* Remove useless file

* Colab button (#6389)

* Add colab button

* Add colab link for tutorials

* Fix links for open in colab (#6391)

* Update src/transformers/optimization.py

consistently use lr_end=1e-7 default

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* [wip] add get_polynomial_decay_schedule_with_warmup

* style

* add assert

* change lr_end to a much smaller default number

* check for exact equality

* Update src/transformers/optimization.py

consistently use lr_end=1e-7 default

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* remove dup (leftover from merge)

* convert the test into the new refactored format

* stick to using the current_step as is, without ++

Co-authored-by: M. Yusuf Sarıgöz <yusufsarigoz@gmail.com>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Alexander Measure <ameasure@gmail.com>
Co-authored-by: Rohit Gupta <rohitgr1998@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-08-11 17:56:41 -04:00
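
A sketch tying together the defaults settled on above (lr_end=1e-7 and power=1.0, at which the schedule reduces to linear decay); the optimizer setup is illustrative:

    import torch
    from transformers import get_polynomial_decay_schedule_with_warmup

    params = [torch.nn.Parameter(torch.zeros(2))]
    optimizer = torch.optim.AdamW(params, lr=5e-5)
    scheduler = get_polynomial_decay_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=1000,
        lr_end=1e-7,  # default discussed in the PR
        power=1.0,    # with power=1.0 this is plain linear decay
    )
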
6c87b73d6b Create README.md (#6386)
* Create README.md

* Update README.md
2020-08-11 16:56:51 -04:00
0203d6517f [pl] restore lr logging behavior for glue, ner examples (#6314) 2020-08-11 16:27:11 -04:00
be1520d3a3 rename prepare_translation_batch -> prepare_seq2seq_batch (#6103) 2020-08-11 15:57:07 -04:00
66fa8ceaea PegasusForConditionalGeneration (torch version) (#6340)
Co-authored-by: Jingqing  Zhang <jingqing.zhang15@imperial.ac.uk>
2020-08-11 14:31:23 -04:00
f6cb0f806e [s2s] wmt download script use less ram (#6405) 2020-08-11 12:04:17 -04:00
7c6a085ebf pl version: examples/requirements.txt is single source of truth (#6309) 2020-08-11 10:58:54 -04:00
1d1d5bec1b Create Model Card File (#6357) 2020-08-11 10:36:15 -04:00
00ce881c07 Create README.md (#6413)
* Create README.md

Model card for https://huggingface.co/akhooli/gpt2-small-arabic

* Update model_cards/akhooli/gpt2-small-arabic/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-11 10:35:31 -04:00
3ae30787b5 switch Hindi-BERT to S3 README (#6396) 2020-08-11 10:34:22 -04:00
824e651e17 Create README.md (#6397)
* Create README.md

* Update model_cards/akhooli/gpt2-small-arabic-poetry/README.md

* Update model_cards/akhooli/gpt2-small-arabic-poetry/README.md

* Update model_cards/akhooli/gpt2-small-arabic-poetry/README.md

* Update model_cards/akhooli/gpt2-small-arabic-poetry/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-11 09:03:23 -04:00
404782912a [Performance improvement] "Bad tokens ids" optimization (#6064)
* Optimized banned token masking

* Avoid duplicate EOS masking if in bad_words_id

* Updated mask generation to handle empty banned token list

* Addition of unit tests for the updated bad_words_ids masking

* Updated timeout handling in `test_postprocess_next_token_scores_large_bad_words_list` unit test

* Updated timeout handling in `test_postprocess_next_token_scores_large_bad_words_list` unit test (timeout does not work on Windows)

* Moving Marian import to the test context to allow TF only environments to run

* Moving imports to torch_available test

* Updated operations device and test

* Updated operations device and test

* Added docstring and comment for in-place scores modification

* Moving test to own test_generation_utils, use of lighter models for testing

* removed unneded imports in test_modeling_common

* revert formatting change for ModelTesterMixin

* Updated caching, simplified eos token id test, removed unnecessary @require_torch

* formatting compliance
2020-08-11 05:56:40 -04:00
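
For context, the code path being optimized serves generate()'s bad_words_ids argument; a usage sketch with a placeholder model and banned phrase:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # Token-id sequences that must never appear in the generated output.
    bad_words_ids = [tokenizer(" ugly").input_ids]

    inputs = tokenizer("The weather today is", return_tensors="pt")
    out = model.generate(inputs.input_ids, max_length=20, bad_words_ids=bad_words_ids)
    print(tokenizer.decode(out[0]))
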
87e124c245 Warn if debug requested without TPU fixes (#6308) (#6390)
* Warn if debug requested without TPU fixes (#6308)
Check whether a PyTorch-compatible TPU is available before attempting to print TPU metrics after training has completed. This way, users who apply `--debug` without reading the documentation aren't surprised by a stack trace.

* Style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-11 05:31:26 -04:00
cdf1f7edb2 Fix tokenizer saving and loading error (#6026)
* fix tokenizer saving and loading bugs when adding AddedToken to additional special tokens

* Add tokenizer test

* Style

* Style 2

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-11 04:49:16 -04:00
83984a61c6 testing utils: capturing std streams context manager (#6231)
* testing utils: capturing std streams context manager

* style

* missing import

* add the origin of this code
2020-08-11 03:56:47 -04:00
f6c0680d36 add pl_glue example test (#6034)
* add pl_glue example test

* for now just test that it runs, next validate results of eval or predict?

* complete the run_pl_glue test to validate the actual outcome

* worked on my machine, CI gets less accuracy - trying higher epochs

* match run_pl.sh hparms

* more epochs?

* trying higher lr

* for now just test that the script runs to a completion

* correct the comment

* if cuda is available, add --fp16 --gpus=1 to cover more bases

* style
2020-08-11 03:16:52 -04:00
b25cec13c5 Feed forward chunking (#6024)
* Chunked feed forward for Bert

This is an initial implementation to test applying feed forward chunking for BERT.
Will need additional modifications based on output and benchmark results.

* Black and cleanup

* Feed forward chunking in BertLayer class.

* Isort

* add chunking for all models

* fix docs

* Fix typo

Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-08-11 03:12:45 -04:00
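
The mechanism is exposed through a helper; a sketch, with the caveat that the argument order shown matches later releases (early versions took chunk_size first, and the import path has moved between modules over time):

    import torch
    from transformers.modeling_utils import apply_chunking_to_forward

    ff = torch.nn.Linear(64, 64)

    def forward_chunk(hidden_states):
        return ff(hidden_states)

    hidden_states = torch.randn(2, 128, 64)  # (batch, seq_len, hidden)
    # Run the feed-forward over 32-position slices of the sequence dimension,
    # lowering peak memory at roughly the same total compute.
    out = apply_chunking_to_forward(forward_chunk, 32, 1, hidden_states)
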
8a3db6b303 Add TPU testing once again 2020-08-11 08:49:37 +02:00
f65ac1faf2 Add missing docker arg for TPU CI. (#6393) 2020-08-11 02:48:49 -04:00
b9ecd92ee4 [s2s] Script to save wmt data to disk (#6403) 2020-08-10 22:49:39 -04:00
00bb0b25ed TF Longformer (#5764)
* improve names and tests longformer

* more and better tests for longformer

* add first tf test

* finalize tf basic op functions

* fix merge

* tf shape test passes

* narrow down discrepancies

* make longformer local attn tf work

* correct tf longformer

* add first global attn function

* add more global longformer func

* advance tf longformer

* finish global attn

* upload big model

* finish all tests

* correct false any statement

* fix common tests

* make all tests pass except keras save load

* fix some tests

* fix torch test import

* finish tests

* fix test

* fix torch tf tests

* add docs

* finish docs

* Update src/transformers/modeling_longformer.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/modeling_tf_longformer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply Lysandres suggestions

* reverse to assert statement because function will fail otherwise

* applying sylvains recommendations

* Update src/transformers/modeling_longformer.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Update src/transformers/modeling_tf_longformer.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-10 23:25:06 +02:00
3425936643 [EncoderDecoderModel] add a add_cross_attention boolean to config (#6377)
* correct encoder decoder model

* Apply suggestions from code review

* apply sylvains suggestions
2020-08-10 19:46:48 +02:00
06bc347c97 Fix links for open in colab (#6391) 2020-08-10 11:16:17 -04:00
3e0fe3cf5c Colab button (#6389)
* Add colab button

* Add colab link for tutorials
2020-08-10 11:12:29 -04:00
79588e6fdb Ci GitHub caching (#6382)
* Cache Github Actions CI

* Remove useless file
2020-08-10 10:39:31 -04:00
b99098abc7 Patch models (#6326)
* TFAlbertFor{TokenClassification, MultipleChoice}

* Patch models

* BERT and TF BERT info



* Update check_repo
2020-08-10 10:39:17 -04:00
6028ed92bd Small docfile fixes (#6328) 2020-08-10 05:37:12 -04:00
1429b920d4 refactor almost identical tests (#6339)
* refactor almost identical tests

* important to add a clear assert error message

* make the assert error even more descriptive than the original bt
2020-08-10 05:31:20 -04:00
35eb96de4d correct pl link in readme (#6364) 2020-08-10 03:08:46 -04:00
0830e79512 the test now works again (#6371) 2020-08-10 02:55:52 -04:00
3a556b0fb7 Update modeling_tf_utils.py (#6372)
fix typo: ckeckpoint->checkpoint
2020-08-10 02:55:11 -04:00
1bbc54a87c Temporarily de-activate TPU CI 2020-08-10 08:11:40 +02:00
6e8a38568e [model_cards] electra-base-turkish-cased-ner (#6350)
* for electra-base-turkish-cased-ner

* Add metadata

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-08-09 03:39:51 -04:00
9a5ef83748 [s2s] fix --gpus clarg collision (#6358) 2020-08-08 21:51:37 -04:00
1aec991643 [GPT2] Correct typo in docs (#6352) 2020-08-08 20:37:29 +02:00
9f57e39f71 Add notebook on fine-tuning and interpreting Electra (#6321)
Co-authored-by: eliska <3648991+elisans@users.noreply.github.com>
2020-08-08 11:47:33 +02:00
9bed355449 [s2s] fix label_smoothed_nll_loss (#6344) 2020-08-08 04:21:12 -04:00
99f73bcc71 [s2s] tiny QOL improvement: run_eval prints scores (#6341) 2020-08-08 02:45:55 -04:00
322dffc6c9 remove a TODO item to use a tiny model (#6338)
As discussed with @sshleifer, removing this TODO to switch to a tiny model, since it won't be able to test the results of the evaluation (i.e., the results would be meaningless).
2020-08-07 21:30:39 -04:00
1f8e826518 [CI] Self-scheduled runner also pins torch (#6332) 2020-08-07 18:40:21 -04:00
1b8a7ffcfd Add setup for TPU CI to run every hour. (#6219)
* Add setup for TPU CI to run every hour.

* Re-organize config.yml

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-07 11:17:07 -04:00
6695450a23 [examples] consistently use --gpus, instead of --n_gpu (#6315) 2020-08-07 10:36:32 -04:00
0e36e51515 Fix the tests for Electra (#6284)
* Fix the tests for Electra

* Apply style
2020-08-07 09:30:57 -04:00
6ba540b747 Add a script to check all models are tested and documented (#6298)
* Add a script to check all models are tested and documented

* Apply suggestions from code review

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>

* Address comments

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-08-07 09:18:37 -04:00
e1638dce16 fix the slow tests doc (#6167)
remove unnecessary duplication wrt `RUN_SLOW=yes`
2020-08-07 09:17:32 -04:00
7e9861f7f4 dehate-bert Model Card (#6248)
Added citation and paper links.
2020-08-07 17:51:03 +08:00
f6df6d98dd dehate-bert Model Card (#6249)
Added citation and paper links.
2020-08-07 17:48:38 +08:00
26691ecba6 dehate-bert Model Card (#6250)
Added citation and paper links.
2020-08-07 17:48:09 +08:00
60657b295c dehate-bert Model Card (#6251)
Added citation and paper links.
2020-08-07 17:47:42 +08:00
7218261991 dehate-bert Model Card (#6252)
Added citation and paper links.
2020-08-07 17:47:26 +08:00
396d227cd4 dehate-bert Model Card (#6253)
Added citation and paper links.
2020-08-07 17:47:04 +08:00
8be260f18a dehate-bert Model Card (#6254)
Added citation and paper links.
2020-08-07 17:46:27 +08:00
dce7278cdf dehate-bert Model Card (#6255)
Added citation and paper links.
2020-08-07 17:45:52 +08:00
3be2d04884 fix consistency CrossEntropyLoss in modeling_bart (#6265) 2020-08-07 17:44:28 +08:00
c72f9c90a1 Remove --no-cache-dir from github CI 2020-08-07 09:07:22 +02:00
0d9328f2ef Patch GPU failures (#6281)
* Pin to 1.5.0

* Patch XLM GPU test
2020-08-07 02:58:15 -04:00
80a0676a51 CI dependency wheel caching (#6287)
* Single workflow cache test

Remove cache dir, re-trigger cache

Only pip archives

Not sudo when pip

* All workflow cache

Remove no-cache-dir instruction

Remove last sudo occurrences

v0.3
2020-08-07 02:48:59 -04:00
175cd45e13 fix the shuffle argument usage and the default (#6307) 2020-08-06 20:32:28 -04:00
ffceef2042 [Fix] text-classification PL example (#6027)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-06 15:46:43 -04:00
eb2bd8d6eb Remove redundant line in run_pl_glue.py (#6305) 2020-08-06 15:43:45 -04:00
118ecfd427 fix for pytorch < 1.6 (#6300) 2020-08-06 21:14:46 +02:00
2804fff839 [s2s]Use prepare_translation_batch for Marian finetuning (#6293)
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-06 14:58:38 -04:00
2f2aa0c89c added n_inner argument to gpt2 config (#6296) 2020-08-06 17:47:32 +02:00
0a0d53dcf8 Update model card (#6290)
Add links to RuPERTa models fine-tuned on Spanish SQUAD datasets
2020-08-06 11:42:43 -04:00
b923871bb7 Adds comet_ml to the list of auto-experiment loggers (#6176)
* Support for Comet.ml

* Need to import comet first

* Log this model, not the one in the backprop step

* Log args as hyperparameters; use framework to allow fine control

* Log hyperparameters with context

* Apply black formatting

* isort fix integrations

* isort fix __init__

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/trainer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/trainer_tf.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Address review comments

* Style + Quality, remove Tensorboard import test

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-06 11:31:30 -04:00
d5bc32ce92 Add strip_accents to basic BertTokenizer. (#6280)
* Add strip_accents to basic tokenizer

* Add tests for strip_accents.

* fix style with black

* Fix strip_accents test

* empty commit to trigger CI

* Improved strip_accents check

* Add code quality with is not False
2020-08-06 18:52:28 +08:00
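
A usage sketch; the checkpoint name is a placeholder, chosen because cased German models are where keeping accents matters:

    from transformers import BertTokenizer

    # strip_accents=None (the default) follows the lowercase setting;
    # False keeps umlauts etc. intact, True always strips accents.
    tokenizer = BertTokenizer.from_pretrained("bert-base-german-cased", strip_accents=False)
    print(tokenizer.tokenize("Schönes Wetter heute"))
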
31da35cc89 Create README.md (#6273)
I am adding a descriptive README.md file to my recently uploaded twitter classification model: shrugging-grace/tweetclassifier.
2020-08-05 12:36:24 -04:00
a8bdba232f Create README.md for uploaded classifier (#6272)
I am adding a descriptive README.md file to my recently uploaded twitter classification model: shrugging-grace/tweetclassifier.
2020-08-05 12:27:46 -04:00
a23a535c10 added t5 bahasa summarization readme (#6269) 2020-08-05 12:27:27 -04:00
c67d1a0259 Tf model outputs (#6247)
* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten

* TF outputs and test on BERT

* Albert to DistilBert

* All remaining TF models except T5

* Documentation

* One file forgotten

* Add new models and fix issues

* Quality improvements

* Add T5

* A bit of cleanup

* Fix for slow tests

* Style
2020-08-05 11:34:39 -04:00
bd0eab351a Trainer + wandb quality of life logging tweaks (#6241)
* added `name` argument for wandb logging, also logging model config with trainer arguments

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* added tf, post-review changes

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-08-05 09:05:52 -04:00
33966811bd Add SequenceClassification and MultipleChoice TF models to Electra (#6227)
* Add SequenceClassification and MultipleChoice TF models to Electra

* Apply style

* Add summary_proj_to_labels to Electra config

* Finally mirroring the PT version of these models

* Apply style

* Fix Electra test
2020-08-05 09:04:27 -04:00
376c02e9a9 [WIP] lightning_base: support --lr_scheduler with multiple possibilities (#6232)
* support --lr_scheduler with multiple possibilities

* correct the error message

* add a note about supported schedulers

* cleanup

* cleanup2

* needs the argument default

* style

* add another assert in the test

* implement requested changes

* cleanups

* fix relative import

* cleanup
2020-08-05 09:01:17 -04:00
d89acd07cc fix (#6257) 2020-08-05 07:37:57 -04:00
24c5a6e351 Update optimization.py (#6261) 2020-08-05 07:34:57 -04:00
ed6b8f3128 Update to match renamed attributes in fairseq master (#5972)
* Update to match renamed attributes in fairseq master

RobertaModel no longer has model.encoder and args.num_classes attributes as of 5/28/20.

* Quality

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-08-05 07:23:55 -04:00
d9149f00d1 Update README.md (#6201) 2020-08-04 17:44:14 -04:00
ddfdbb86c1 Update README.md (#6200) 2020-08-04 17:44:05 -04:00
4f67955662 Update README.md (#6199) 2020-08-04 17:43:48 -04:00
869ec441c9 Update README.md (#6198) 2020-08-04 17:43:38 -04:00
5177dca634 Create README.md (#6123) 2020-08-04 17:42:53 -04:00
3f30ebe6ca Create README.md (#6075) 2020-08-04 17:41:23 -04:00
aa7c22a283 Update Model Card (#6246)
Added citation and paper links.
2020-08-04 17:40:47 -04:00
972535ea74 fix zero shot pipeline docs (#6245) 2020-08-04 16:37:49 -04:00
5920a37a4c Add license info to German Bert models (#6242)
* Add xlm-r QA model card

* Add tags

* Add license info to german bert
2020-08-04 13:40:49 -04:00
6c9ba1d8fc [Reformer] Make random seed generator available on random seed and not on model device (#6244)
* improve if else statement random seeds

* Apply suggestions from code review

* Update src/transformers/modeling_reformer.py
2020-08-04 13:22:43 -04:00
d5b0a0e235 mBART Conversion script (#6230) 2020-08-04 09:53:51 -04:00
268bf34630 typo (#6225) 2020-08-04 09:31:49 -04:00
7f65daa2e1 fix reformer fp16 (#6237) 2020-08-04 13:02:25 +02:00
7ea9b2db37 Encoder decoder config docs (#6195)
* Adding docs for how to load encoder_decoder pretrained model with individual config objects

* Adding docs for loading encoder_decoder config from pretrained folder

* Fixing  W293 blank line contains whitespace

* Update src/transformers/modeling_encoder_decoder.py

* Update src/transformers/modeling_encoder_decoder.py

* Update src/transformers/modeling_encoder_decoder.py

* Apply suggestions from code review

model file should only show examples for how to load save model

* Update src/transformers/configuration_encoder_decoder.py

* Update src/transformers/configuration_encoder_decoder.py

* fix space

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-08-04 09:23:28 +02:00
1d5c3a3d96 Test with --no-cache-dir (#6235) 2020-08-04 03:20:19 -04:00
6730ecdd3c Remove redundant coverage (#6224) 2020-08-04 02:59:21 -04:00
5deed37f9f cleanup torch unittests (#6196)
* improve unit tests

this is a sample of one test according to the request in https://github.com/huggingface/transformers/issues/5973
before I apply it to the rest

* batch 1

* batch 2

* batch 3

* batch 4

* batch 5

* style

* non-tf template

* last deletion of check_loss_output
2020-08-04 02:42:56 -04:00
b390a5672a Make the order of additional special tokens deterministic (#5704)
* Make the order of additional special tokens deterministic regardless of hash seeds

* Fix
2020-08-04 02:38:30 -04:00
d740351f7d Upgrade pip when doing CI (#6234)
* Upgrade pip when doing CI

* Don't forget Github CI
2020-08-04 02:37:12 -04:00
57eb1cb68d [s2s] Document better mbart finetuning command (#6229)
* Document better MT command

* improve multigpu command
2020-08-03 18:22:31 -04:00
0513f8d275 correct label extraction + add note on discrepancies on trained MNLI model and HANS (#6221) 2020-08-03 15:02:51 -04:00
3c289fb38c Remove outdated BERT tips (#6217)
* Remove out-dated BERT tips

* Update modeling_outputs.py

* Update bert.rst

* Update bert.rst
2020-08-04 01:17:56 +08:00
e4920c92d6 Doc pipelines (#6175)
* Init work on pipelines doc

* Work in progress

* Work in progress

* Doc pipelines

* Rm unwanted default

* Apply suggestions from code review

Lysandre comments

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-08-03 11:44:46 -04:00
b6b2f2270f s2s: fix LR logging, remove some dead code. (#6205) 2020-08-03 10:36:26 -04:00
06f1692b02 Fix _shift_right function in TFT5PreTrainedModel (#6214) 2020-08-03 16:21:23 +02:00
0b41867357 fix labels (#6213) 2020-08-03 10:19:35 -04:00
cedc547e7e Adds train_batch_size, eval_batch_size, and n_gpu to to_sanitized_dict output for logging. (#5331)
* Adds train_batch_size, eval_batch_size, and n_gpu to to_sanitized_dict() output

* Update wandb config logging to use to_sanitized_dict

* removed n_gpu from sanitized dict

* fix quality check errors
2020-08-03 09:00:39 -04:00
9996f697e3 Fix saved model creation (#5468)
* Fix TF Serving when output_hidden_states and output_attentions are True

* Add tests for saved model creation + bug fix for multiple choices models

* remove unused import

* Fix the input for several layers

* Fix test

* Fix conflict printing

* Apply style

* Fix XLM and Flaubert for TensorFlow

* Apply style

* Fix TF check version

* Apply style

* Trigger CI
2020-08-03 08:10:40 -04:00
5a0dac53bf Empty assert hunt (#6056)
* Fixed empty asserts

* black-reformatted stragglers in templates

* More code quality checks

* Update src/transformers/convert_marian_to_pytorch.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/convert_marian_to_pytorch.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* removed unused line as per @sshleifer

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-03 10:19:03 +02:00
16c2240164 Add script to convert tf2.x checkpoint to PyTorch (#5791)
* Add script to convert tf2.x checkpoint to pytorch

The script converts the newer TF2.x checkpoints (as published on their official GitHub: https://github.com/tensorflow/models/tree/master/official/nlp/bert) to PyTorch.

* rename file in order to stay consistent with naming convention
2020-08-03 03:53:38 -04:00
82a0e2b67e Fix docstring for BertTokenizerFast (#6185)
- remove duplicate doc-entry for tokenize_chinese_chars
- add doc for strip_accents and wordpieces_prefix
2020-08-02 15:58:26 +08:00
d8dbf3b75d [s2s] clean up + doc (#6184)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-08-01 14:51:07 -04:00
a39dfe4fb1 Fixed typo in Longformer (#6180) 2020-08-01 18:20:48 +08:00
8edfaaa81b bart-large-mnli-yahoo-answers model card (#6133)
* Add bart-large-mnli-yahoo-answers model card

* Add examples

* Add widget example

* Rm bart tag

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-31 10:56:32 -04:00
d951c14ae4 Model output test (#6155)
* Use return_dict=True in all tests

* Formatting
2020-07-31 09:44:37 -04:00
86caab1e0b Harmonize both Trainers API (#6157)
* Harmonize both Trainers API

* Fix test

* main_prcess -> process_zero
2020-07-31 09:43:23 -04:00
603cd81a01 readme m3hrdadfi/albert-fa-base-v2 (#6153)
* readme m3hrdadfi/albert-fa-base-v2

model_card readme for m3hrdadfi/albert-fa-base-v2

* Update model_cards/m3hrdadfi/albert-fa-base-v2/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-31 06:19:06 -04:00
838dc06ff5 parse arguments from dict (#4869)
* add parse_dict to parse arguments from dict

* add unit test for parse_dict
2020-07-31 04:44:23 -04:00
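
A sketch of the new entry point; the dataclass fields here are illustrative:

    from dataclasses import dataclass

    from transformers import HfArgumentParser

    @dataclass
    class RunArgs:
        model_name: str
        learning_rate: float = 5e-5

    parser = HfArgumentParser(RunArgs)
    (run_args,) = parser.parse_dict({"model_name": "bert-base-uncased", "learning_rate": 3e-5})
    print(run_args.learning_rate)
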
cf3cf304ca Replace mecab-python3 with fugashi for Japanese tokenization (#6086)
* Replace mecab-python3 with fugashi

This replaces mecab-python3 with fugashi for Japanese tokenization. I am
the maintainer of both projects.

Both projects are MeCab wrappers, so the underlying C++ code is the
same. fugashi is the newer wrapper and doesn't use SWIG, so for basic
use of the MeCab API it's easier to use.

This code ensures the use of a version of ipadic installed via pip,
which should make versioning and tracking down issues easier.

fugashi has wheels for Windows, OSX, and Linux, which will help with
issues with installing old versions of mecab-python3 on Windows.
Compared to mecab-python3, because fugashi doesn't use SWIG, it doesn't
require a C++ runtime to be installed on Windows.

In adding this change I removed some code dealing with `cursor`,
`token_start`, and `token_end` variables. These variables didn't seem to
be used for anything; it is unclear to me why they were there.

I ran the tests and they passed, though I couldn't figure out how to run
the slow tests (`--runslow` gave an error) and didn't try testing with
Tensorflow.

* Style fix

* Remove unused variable

Forgot to delete this...

* Adapt doc with install instructions

* Fix typo

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-07-31 04:41:14 -04:00
f250beb8aa enable easy checkout switch (#5645)
* enable easy checkout switch

allow having multiple repository checkouts without needing to remember to rerun 'pip install -e .[dev]' when switching between checkouts and running tests.

* make isort happy

* examples needs one too
2020-07-31 04:34:46 -04:00
7d50af4b02 Create README.md (#6169) 2020-07-31 04:28:35 -04:00
0034a1d248 Add Pytorch Native AMP support in Trainer (#6151)
* fixed type; add Pytorch Native CUDA AMP support

* reverted commit on modeling_utils

* confirming to HF black formatting rule

* changed bool value of _use_apex

* scaler support for gradient clipping

* fix inplace operation of clip_grad_norm

* removed not while version comparison
2020-07-31 04:23:29 -04:00
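
Under the hood this is the standard torch.cuda.amp recipe (torch >= 1.6); a self-contained sketch of the pattern, including the unscale-before-clip detail the PR mentions (model, data, and clip value are illustrative):

    import torch

    model = torch.nn.Linear(10, 2).cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    scaler = torch.cuda.amp.GradScaler()

    batches = [(torch.randn(8, 10).cuda(), torch.randint(0, 2, (8,)).cuda())]
    for inputs, labels in batches:
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            loss = torch.nn.functional.cross_entropy(model(inputs), labels)
        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)  # so clip_grad_norm_ sees true gradient magnitudes
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()
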
7231f7b503 Enable ONNX/ONNXRuntime optimizations through converter script (#6131)
* Add onnxruntime transformers optimization support

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Optimization section in ONNX/ONNXRuntime documentation.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Improve note reference

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fixing imports order.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add warning about different level of optimization between torch and tf export.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Address @LysandreJik wording suggestion

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address @LysandreJik wording suggestion

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Always optimize model before quantization for maximum performances.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Address comments on the documentation.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Improve TensorFlow optimization message as suggested by @yufenglee

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed --optimize parameter

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Warn the user about current quantization limitation when model is larger than 2GB.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Trigger CI for last check

* Small change in print for the optimization section.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-31 09:45:13 +02:00
c0b93a1c7a correct the correction (#6163) 2020-07-30 18:00:02 -04:00
a2f6d521c1 typos (#6162)
* 2 small typos

* more typos

* correct path
2020-07-30 17:18:27 -04:00
f3065abdb8 Doc tokenizer (#6110)
* Start doc tokenizers

* Tokenizer documentation

* Start doc tokenizers

* Tokenizer documentation

* Formatting after rebase

* Formatting after merge

* Update docs/source/main_classes/tokenizer.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address comment

* Update src/transformers/tokenization_utils_base.py

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Address Thom's comments

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-07-30 14:51:19 -04:00
e642c78908 Addition of a DialoguePipeline (#5516)
* initial commit for pipeline implementation

Addition of input processing and history concatenation

* Conversation pipeline tested and working for single & multiple conversation inputs

* Added docstrings for dialogue pipeline

* Addition of dialogue pipeline integration tests

* Delete test_t5.py

* Fixed max code length

* Updated styling

* Fixed test broken by formatting tools

* Removed unused import

* Added unit test for DialoguePipeline

* Fixed Tensorflow compatibility

* Fixed multi-framework support using framework flag

* - Fixed docstring
- Added `min_length_for_response` as an initialization parameter
- Renamed `*args` to `conversations`, `conversations` being a `Conversation` or a `List[Conversation]`
- Updated truncation to truncate entire segments of conversations, instead of cutting in the middle of a user/bot input

* - renamed pipeline name from dialogue to conversational
- removed hardcoded default value of 1000 and use config.max_length instead
- added `append_response` and `set_history` method to the Conversation class to avoid direct fields mutation
- fixed bug in history truncation method

* - Updated ConversationalPipeline to accept only active conversations (otherwise a ValueError is raised)

* - Simplified input tensor conversion

* - Updated attention_mask value for Tensorflow compatibility

* - Updated last dialogue reference to conversational & fixed integration tests

* Fixed conflict with master

* Updates following review comments

* Updated formatting

* Added Conversation and ConversationalPipeline to the library __init__, addition of docstrings for Conversation, added both to the docs

* Update src/transformers/pipelines.py

Updated docsting following review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-07-30 14:11:39 -04:00
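
A usage sketch of the pipeline under its final name; the default model choice is left to the library:

    from transformers import Conversation, pipeline

    chat = pipeline("conversational")

    conversation = Conversation("Going to the movies tonight, any suggestions?")
    conversation = chat(conversation)
    print(conversation.generated_responses[-1])

    # Multi-turn: append the next user input and run the pipeline again.
    conversation.add_user_input("Is it an action movie?")
    conversation = chat(conversation)
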
ec0267475c Fix FlauBERT GPU test (#6142)
* Fix GPU test

* Remove legacy constructor
2020-07-30 11:11:48 -04:00
91cb95461e Switch from return_tuple to return_dict (#6138)
* Switch from return_tuple to return_dict

* Fix test

* [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)

* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests

* AutoModels


Tiny tweaks

* Style

* Final changes before merge

* Re-order for simpler review

* Final fixes

* Addressing @sgugger's comments

* Test MultipleChoice

* Rework TF trainer (#6038)

* Fully rework training/prediction loops

* fix method name

* Fix variable name

* Fix property name

* Fix scope

* Fix method name

* Fix tuple index

* Fix tuple index

* Fix indentation

* Fix variable name

* fix eval before log

* Add drop remainder for test dataset

* Fix step number + fix logging datetime

* fix eval loss value

* use global step instead of step + fix logging at step 0

* Fix logging datetime

* Fix global_step usage

* Fix breaking loop + logging datetime

* Fix step in prediction loop

* Fix step breaking

* Fix train/test loops

* Force TF at least 2.2 for the trainer

* Use assert_cardinality to facilitate the dataset size computation

* Log steps per epoch

* Make tfds compliant with TPU

* Make tfds compliant with TPU

* Use TF dataset enumerate instead of the Python one

* revert previous commit

* Fix data_dir

* Apply style

* rebase on master

* Address Sylvain's comments

* Address Sylvain's and Lysandre comments

* Trigger CI

* Remove unused import

* Switch from return_tuple to return_dict

* Fix test

* Add recent model

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Julien Plu <plu.julien@gmail.com>
2020-07-30 09:17:00 -04:00
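
The change in caller code, sketched with a placeholder model:

    from transformers import AutoModelForSequenceClassification, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

    inputs = tokenizer("Hello, world!", return_tensors="pt")
    outputs = model(**inputs, return_dict=True)  # replaces the former return_tuple flag
    logits = outputs.logits  # named access instead of positional tuple indexing
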
562b6369c4 Tf trainer cleanup (#6143)
* Clean up TFTrainer

* Add import

* Fix conflicts
2020-07-30 09:13:16 -04:00
c127d055e6 add another e.g. to avoid confusion (#6055) 2020-07-30 08:53:35 -04:00
d24ea708d7 Actually the extra_id tokens are from 0-99 and not from 1-100 (#5967)
a = tokenizer.encode("we got a <extra_id_99>", return_tensors='pt', add_special_tokens=True)
print(a)
>tensor([[   62,   530,     3,     9, 32000]])
a = tokenizer.encode("we got a <extra_id_100>", return_tensors='pt', add_special_tokens=True)
print(a)
>tensor([[   62,   530,     3,     9,     3,     2, 25666,   834,    23,    26,
           834,  2915,  3155]])
2020-07-30 06:13:29 -04:00
3212b8850d [s2s] add support for overriding config params (#6149) 2020-07-30 01:09:46 -04:00
54f9fbeff8 Rework TF trainer (#6038)
* Fully rework training/prediction loops

* fix method name

* Fix variable name

* Fix property name

* Fix scope

* Fix method name

* Fix tuple index

* Fix tuple index

* Fix indentation

* Fix variable name

* fix eval before log

* Add drop remainder for test dataset

* Fix step number + fix logging datetime

* fix eval loss value

* use global step instead of step + fix logging at step 0

* Fix logging datetime

* Fix global_step usage

* Fix breaking loop + logging datetime

* Fix step in prediction loop

* Fix step breaking

* Fix train/test loops

* Force TF at least 2.2 for the trainer

* Use assert_cardinality to facilitate the dataset size computation

* Log steps per epoch

* Make tfds compliant with TPU

* Make tfds compliant with TPU

* Use TF dataset enumerate instead of the Python one

* revert previous commit

* Fix data_dir

* Apply style

* rebase on master

* Address Sylvain's comments

* Address Sylvain's and Lysandre comments

* Trigger CI

* Remove unused import
2020-07-29 14:32:01 -04:00
3f94170a10 [WIP] Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleC… (#5614)
* Test TF Flaubert + Add {XLM, Flaubert}{TokenClassification, MultipleChoice} models and tests

* AutoModels


Tiny tweaks

* Style

* Final changes before merge

* Re-order for simpler review

* Final fixes

* Addressing @sgugger's comments

* Test MultipleChoice
2020-07-29 14:26:26 -04:00
8a8ae27617 Use google style to document properties (#6130)
* Use google style to document properties

* Update src/transformers/configuration_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-29 12:28:12 -04:00
fc64559c45 Fix TF CTRL model naming (#6134) 2020-07-29 12:20:00 -04:00
641b873c13 XLNet PLM Readme (#6121) 2020-07-29 11:38:15 -04:00
8d157c930b add deepset/xlm-roberta-large-squad2 model card (#6128)
* Add xlm-r QA model card

* Add tags
2020-07-29 17:34:16 +02:00
6c002853a6 Added capability to quantize a model while exporting through ONNX. (#6089)
* Added capability to quantize a model while exporting through ONNX.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

We do not support multiple extensions

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reformat files

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* More quality

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Ensure test_generate_identified_name compares the same object types

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added documentation everywhere on ONNX exporter

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use pathlib.Path instead of plain-old string

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use f-string everywhere

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use the correct parameters for black formatting

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use Python 3 super() style.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use packaging.version to ensure installed onnxruntime version match requirements

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fixing imports sorting order.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Missing raise(s)

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added quantization documentation

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix some spelling.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix bad list header format

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-07-29 13:21:29 +02:00
25de74ccfe Use FutureWarning to deprecate (#6111) 2020-07-29 05:20:53 -04:00
640550fc7a ONNX documentation (#5992)
* Move torchscript and add ONNX documentation under model_export

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Let's follow guidelines by the gurus: Renamed torchscript.rst to serialization.rst

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove previously introduced tree element

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* WIP doc

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* ONNX documentation

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix invalid link

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Improve spelling

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Final wording pass

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-07-29 11:02:35 +02:00
92f8ce2ed6 Fix deebert tests (#6102) 2020-07-28 18:30:16 -04:00
c49cd927f7 [Fix] position_ids tests again (#6100) 2020-07-28 18:29:35 -04:00
40796c5801 [fix] add bart to LM_MAPPING (#6099) 2020-07-28 18:29:18 -04:00
5abe50381a Fix #6096: MBartTokenizer's mask token (#6098) 2020-07-28 18:27:58 -04:00
b1c8b76907 Fix zero-shot pipeline single seq output shape (#6104) 2020-07-28 14:46:03 -04:00
06834bc332 Logs should not be hidden behind a logger.info (#6097) 2020-07-28 12:44:25 -04:00
dafa296c95 [s2s] Delete useless method, log tokens_per_batch (#6081) 2020-07-28 11:24:23 -04:00
dc4755c6d5 create model-card for lordtt13/emo-mobilebert (#6030) 2020-07-28 10:00:23 -04:00
28931f81b7 Fix #6092 (#6093)
* Fix #6092

* Format
2020-07-28 09:48:39 -04:00
5e97c82940 Create README.md (#6076) 2020-07-28 09:36:00 -04:00
54f49af4ae Add inference widget examples (#5825) 2020-07-28 09:14:00 -04:00
0206efb4cf Make all data collators accept dict (#6065)
* Make all data collators accept dict

* Style
2020-07-28 09:08:20 -04:00
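A small sketch of what "accept dict" buys, assuming the public `default_data_collator` helper: features can now be plain dicts rather than dataclass-style objects.

```python
from transformers import default_data_collator

features = [
    {"input_ids": [101, 7592, 102], "label": 0},
    {"input_ids": [101, 2088, 102], "label": 1},
]
batch = default_data_collator(features)  # dict of stacked tensors
print(batch["input_ids"].shape, batch["labels"].shape)  # torch.Size([2, 3]) torch.Size([2])
```
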
31a5486e42 github issue template suggests who to tag (#5790)
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Teven <teven.lescao@gmail.com>
2020-07-28 08:41:27 -04:00
f0c70085c2 link to README.md (#6068)
* add a link to README.md

* Update README.md
2020-07-28 20:34:58 +08:00
4f814fd587 [Model Card] camembert-base-squadFR-fquad-piaf (#6087) 2020-07-28 20:33:52 +08:00
3c7fbf35a6 MBART: support summarization tasks where max_src_len > max_tgt_len (#6003)
* MBART: support summarization tasks

* fix test

* Style

* add tokenizer test
2020-07-28 08:18:11 -04:00
842eb45606 New Community NB Add (#5824)
Signed-off-by: lordtt13 <thakurtanmay72@yahoo.com>
2020-07-28 04:25:12 -04:00
018d61fa24 Moving transformers package import statements to relative imports in some files (#5796)
* Moving `from transformers` import statements to relative imports in some files under src/

* Import order

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-28 04:19:17 -04:00
7214954db4 Should return a tuple for serialization (#6061) 2020-07-28 03:14:31 -04:00
7a68d40138 [s2s] Don't mention packed data in README (#6079) 2020-07-27 20:07:21 -04:00
b7345d22d0 [fix] no warning for position_ids buffer (#6063) 2020-07-27 20:00:44 -04:00
1e00ef681d [s2s] dont document packing because it hurts performance (#6077) 2020-07-27 18:26:00 -04:00
9d0d3a6645 Pin TF while we wait for a fix 2020-07-27 18:03:09 -04:00
769e6ba01f Create README.md (#6032)
Adding model card - readme
2020-07-27 16:25:37 -04:00
fd347e0da7 Add fire to setup.cfg to make isort happy (#6066) 2020-07-27 15:17:33 -04:00
11792d7826 CL util to convert models to fp16 before upload (#5953) 2020-07-27 12:21:25 -04:00
4302ace5bd [pack_dataset] don't sort before packing, only pack train (#5954) 2020-07-27 12:14:23 -04:00
c8bdf7f4ec Add new AutoModel classes in pipeline (#6062)
* use new AutoModel classes

* make style and quality
2020-07-27 11:50:08 -04:00
5779e5434d ✏️ Fix typo (#5734) 2020-07-27 10:55:15 -04:00
d1d15d6f2d [examples (seq2seq)] fix preparing decoder_input_ids for T5 (#5994) 2020-07-27 10:10:43 -04:00
3deffc1d67 Zero shot classification pipeline (#5760)
* add initial zero-shot pipeline

* change default args

* update default template

* add label string splitting

* add str labels support, remove nli from name

* style

* add input validation and working tf defaults

* tests

* quality check

* add docstring to __call__

* add slow tests

* Change truncation to only_first

also lower precision on tests for readability

* style
2020-07-27 09:42:58 -04:00
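A minimal usage sketch for the new pipeline, assuming the public `pipeline` factory (model and labels are illustrative):

```python
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
result = classifier(
    "The team shipped the new release on Friday.",
    candidate_labels=["software", "sports", "politics"],
)
print(result["labels"][0], result["scores"][0])  # highest-scoring label first
```
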
1246b20f6d Fix the return documentation rendering for all model outputs (#6022)
* Fix the return documentation rendering for all model outputs

* Formatting
2020-07-27 09:18:59 -04:00
3b64ad5d5c Remove unused file (#6023) 2020-07-27 08:31:24 -04:00
b9b11795cf Update model_summary.rst (#5737)
Add '-' to make the reference to Transformer-XL more accurate and formal.
2020-07-27 05:34:02 -04:00
b21993b362 Allow to set Adam beta1, beta2 in TrainingArgs (#5592)
* Add Adam beta1, beta2 to trainer

* Make style consistent
2020-07-27 05:31:37 -04:00
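A sketch of the new knobs, assuming the `adam_beta1`/`adam_beta2` names on `TrainingArguments`:

```python
from transformers import TrainingArguments

# Non-default AdamW betas; output_dir is the only required argument.
args = TrainingArguments(output_dir="out", adam_beta1=0.9, adam_beta2=0.98)
print(args.adam_beta1, args.adam_beta2)
```
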
7969e96f4a draft etalab QA model (#6040) 2020-07-27 05:15:08 -04:00
a9585fd107 Model card for Vamsi/T5_Paraphrase_Paws (#6037)
* Model card for Vamsi/T5_Paraphrase_Paws

* Update model_cards/Vamsi/T5_Paraphrase_Paws/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-27 05:12:45 -04:00
f7f03b22dc Update README.md of my model (#6042) 2020-07-26 23:31:49 +02:00
fb0589a03d don't complain about missing W&B when WANDB_DISABLED=true (#6036)
* don't complain about missing W&B when WANDB_DISABLED=true

* reformat to elif

* typo
2020-07-26 14:29:54 -04:00
daa5dd1202 add a summary report flag for run_examples on CI (#6035)
Currently, it's hard to tell which example tests were run on CI and which weren't. Adding the `-rA` flag to `pytest` will now include a summary like:

```
==================================================================== short test summary info =====================================================================
PASSED examples/test_examples.py::ExamplesTests::test_generation
PASSED examples/test_examples.py::ExamplesTests::test_run_glue
PASSED examples/test_examples.py::ExamplesTests::test_run_language_modeling
PASSED examples/test_examples.py::ExamplesTests::test_run_squad
FAILED examples/test_examples.py::ExamplesTests::test_run_pl_glue - AttributeError: 'Namespace' object has no attribute 'gpus'
============================================================ 1 failed, 4 passed, 8 warnings in 42.96s ============================================================
```
which makes it easier to validate whether a given example is covered by CI or not.
2020-07-26 14:09:14 -04:00
c69ea5efc4 [CI] Don't test apex (#6021) 2020-07-24 15:34:16 -04:00
a884b7fa38 Update the new model template (#6019) 2020-07-24 14:15:37 -04:00
295466aae6 [model_card] Sample input for rdenadai/BR_BERTo
cc @rdenadai
2020-07-24 14:14:10 -04:00
518361d69a Create model card for RuPERTa-base (#6016)
* Update README.md

* Update model_cards/mrm8488/RuPERTa-base/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-24 14:12:29 -04:00
bd51f0a7ab Create README.md (#5952) 2020-07-24 14:12:14 -04:00
87a779dfa8 Create README.md (#5951) 2020-07-24 14:12:09 -04:00
d115872b38 Create README.md (#6020)
* Create README.md

* Update README.md

* Update model_cards/rdenadai/BR_BERTo/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Update model_cards/rdenadai/BR_BERTo/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-24 14:11:14 -04:00
3996041d0a Fix question template (#6014) 2020-07-24 10:04:25 -04:00
778e635fc9 [model_cards] roberta-base-finetuned-yelp-polarity (#6009)
* [model_cards] roberta-base-finetuned-yelp-polarity

* Update model_cards/VictorSanh/roberta-base-finetuned-yelp-polarity/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-24 09:45:21 -04:00
614fef1691 Ensure OpenAI GPT position_ids is correctly initialized and registered at init. (#5773)
* Ensure OpenAI GPT position_ids is correctly initialized and registered as buffer at init.

This will make it compatible with TorchScript export.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix missing slice operator on the tensor data accessor.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Style.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fixed BertEmbedding position_ids buffer created at forward.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fixed MobileBertEmbedding position_ids buffer created at forward.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fixed XLM position_ids buffer created at forward.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-07-24 15:37:52 +02:00
3b44aa935a Model utils doc (#6005)
* Document TF modeling utils

* Document all model utils
2020-07-24 09:16:28 -04:00
a540405213 Fix commit hash for stable doc 2020-07-24 09:07:40 -04:00
fc0fe2a532 fix: model card readme clutter (#6008)
This removes the clutter line in the README.md of the `csarron/roberta-base-squad-v1` model card. It also fixes the results table.
2020-07-24 04:17:52 -04:00
f5b5c5bd7e Avoid unnecessary warnings when loading pretrained model (#5922)
* Avoid unnecessary warnings when loading pretrained model

* Fix test

* Add other keys to ignore

* keys_to_ignore_at_load -> authorized_missing_keys
2020-07-23 18:13:36 -04:00
29afb5764f Bert german dbmdz uncased sentence stsb (#6000)
* Describe usage of sentence model

* fix typo usage

* add use and description to readme

* fix typo in readme

* readme formatting

* add training procedure to readme

* description name and company

* readme formatting

* dataset training readme

* typo

* readme
2020-07-23 17:56:45 -04:00
2b5ef9706d Model cards: add roberta-base-squad-v1 and bert-base-uncased-squad-v1 (#6006)
* add: bert-base-uncased-squad-v1

* add: roberta-base-squad-v1
2020-07-23 17:53:47 -04:00
9827d666eb MbartTokenizer: do not hardcode vocab size (#5998) 2020-07-23 15:41:14 -04:00
6e16195510 Fix #5974 (#5999) 2020-07-23 13:51:29 -04:00
e168488a74 Cleanup Trainer and expose customization points (#5982)
* Clean up Trainer and expose customization points

* Formatting

* eval_step -> prediction_step
2020-07-23 12:05:41 -04:00
76f52324b1 add fine-tuned mobilebert squad v1 and squad v2 model cards (#5980)
* add mobilebert-uncased-squad-v2

* fix shell cmd, add creator info

* add mobilebert-uncased-squad-v1
2020-07-23 11:57:29 -04:00
7e251ae039 Create README.md (#5989) 2020-07-23 11:41:33 -04:00
33d7506ea1 Update doc of the model page (#5985) 2020-07-22 18:14:57 -04:00
c3206eef44 [test] partial coverage for train_mbart_enro_cc25.sh (#5976) 2020-07-22 14:34:49 -04:00
2c0da7803a minor doc fixes (#5831)
* minor doc fixes

correct superclass name and small grammar fixes

* correct the instance name in the error message

It appears to be `BaseTokenizer` from looking at:

`from tokenizers.implementations import BaseTokenizer as BaseTokenizerFast`

and not `Tokenizer` as it currently says.
2020-07-22 13:22:34 -04:00
feeb956a19 [docs] Add integration test example to copy pasta template (#5961)
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-22 12:48:38 -04:00
01116d3c5b T5 Model Cards (#5759)
* T5 Model Cards

* Fix paths

* Fix tags

* lang-en
2020-07-22 11:38:37 -04:00
896300177b Expose padding_strategy on squad processor to fix QA pipeline performance regression (#5932)
* Attempt to fix the way squad_convert_examples_to_features pads the elements for the QA pipeline.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Quality

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make the code easier to read and avoid multiple tests testing the same thing.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* missing enum value on truncation_strategy.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Rethinking for the easiest fix: expose the padding strategy on squad_convert_examples_to_features.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove unused imports.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-07-22 16:11:57 +02:00
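The regression above comes down to padding strategy; a sketch of the difference (tokenizer choice illustrative):

```python
from transformers import BertTokenizerFast

tok = BertTokenizerFast.from_pretrained("bert-base-uncased")
batch = ["short question?", "a somewhat longer question to pad against?"]

# "longest" pads only up to the longest sequence in the batch...
longest = tok(batch, padding="longest")
# ...while "max_length" always pads every sample to max_length tokens.
fixed = tok(batch, padding="max_length", max_length=384)

print(len(longest["input_ids"][0]), len(fixed["input_ids"][0]))  # e.g. 12 vs 384
```
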
ae67b2439f [CI] Install examples/requirements.txt (#5956) 2020-07-21 21:07:48 -04:00
e714412fe6 Update doc to new model outputs (#5946)
* Update doc to new model outputs

* Fix outputs in quicktour
2020-07-21 18:13:55 -04:00
ddd40b3211 [CI] self-scheduled runner tests examples/ (#5927) 2020-07-21 17:01:07 -04:00
9dab39feea seq2seq/run_eval.py can take decoder_start_token_id (#5949) 2020-07-21 16:58:45 -04:00
5b193b39b0 [examples/seq2seq]: add --label_smoothing option (#5919) 2020-07-21 16:51:39 -04:00
95d1962b9c [Doc] explaining romanian postprocessing for MBART BLEU hacking (#5943) 2020-07-21 14:12:48 -04:00
604a2355dc Create README.md (#5876) 2020-07-21 13:28:22 -04:00
77c718edef Create README.md (#5873) 2020-07-21 13:28:06 -04:00
325b277db9 Create README.md (#5874) 2020-07-21 13:27:30 -04:00
d15be2216c Create README.md (#5879) 2020-07-21 13:27:13 -04:00
f3e23dd90a Create README.md (#5878) 2020-07-21 13:20:47 -04:00
8b01d15c05 Create README.md (#5877) 2020-07-21 13:20:43 -04:00
05bddf304e Create README.md (#5875) 2020-07-21 13:20:32 -04:00
783a0c7ee9 Create README.md (#5872) 2020-07-21 13:20:21 -04:00
e7844d60c2 Create README.md (#5871) 2020-07-21 13:19:48 -04:00
b1ee69763c Create README.md (#5864) 2020-07-21 13:15:07 -04:00
5f809e4976 Update README.md (#5857)
Add nlp dataset used
2020-07-21 13:14:27 -04:00
4215f59c99 Update README.md (#5856)
Add dataset used as it is now part of nlp package
2020-07-21 13:11:08 -04:00
1d72460d55 Add ComVE model cards (#5884)
* Add ComVE model cards

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-21 12:54:29 -04:00
ccbf74a685 typos in seq2seq/readme (#5937) 2020-07-21 09:44:59 -04:00
d32279438a Created model card for my extreme summarization model (#5839)
* Created model card for my extreme summarization model

* Update model_cards/yuvraj/xSumm/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-21 03:54:57 -04:00
abf5c56e9d Created model card for my summarization model (#5838)
* Created model card for my summarization model

* Update model_cards/yuvraj/summarizer-cnndm/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-21 03:54:14 -04:00
d73baeebc5 Create README.md (#5921)
- Maybe the result of this query answers the question you asked some days ago @julien-c ;-)
2020-07-21 03:52:52 -04:00
50acfc8717 Create README.md (#5924) 2020-07-21 03:41:37 -04:00
7249533404 Create README.md (#5920) 2020-07-21 03:31:42 -04:00
4781afd045 Clarify arg class (#5916) 2020-07-20 19:47:06 -04:00
8e0bcb56ec DataParallel fix: multi gpu evaluation (#5926)
The DataParallel training was fixed in https://github.com/huggingface/transformers/pull/5733; this commit also fixes the evaluation. That's more convenient when the user enables both `do_train` and `do_eval`.
2020-07-20 17:54:08 -04:00
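A torch-only sketch of the idea (not the Trainer's actual code): the same `nn.DataParallel` wrapper has to serve the evaluation loop, not just training.

```python
import torch
import torch.nn as nn

model = nn.Linear(8, 2)
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)  # wrap once, reuse for train *and* eval

model.eval()
with torch.no_grad():
    logits = model(torch.randn(4, 8))  # eval forward goes through the same wrapper
print(logits.shape)  # torch.Size([4, 2])
```
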
a20969170b Add AlbertForPretraining to doc (#5914) 2020-07-20 17:53:21 -04:00
f1a4e06f1f [Fix] seq2seq pack_dataset.py actually packs (#5913)
Huge MT speedup!
2020-07-20 15:18:26 -04:00
32883b310b Improve doc of use_cache (#5912)
* Improve doc of use_cache

* Update src/transformers/configuration_xlnet.py

Co-authored-by: Teven <teven.lescao@gmail.com>

Co-authored-by: Teven <teven.lescao@gmail.com>
2020-07-20 11:50:41 -04:00
9ccb45a263 Update gpt2-README.md 2020-07-20 11:40:33 -04:00
f19751117d Create gpt2-medium-README.md 2020-07-20 10:47:42 -04:00
511523672b Create gpt2-large-README.md 2020-07-20 10:47:27 -04:00
182c611934 Update gpt2-README.md 2020-07-20 10:47:11 -04:00
a9ae27cd0f add link to write with transformers to model card 2020-07-20 10:46:10 -04:00
01c40db4f8 [cleanup] squad processor (#5868) 2020-07-20 10:44:10 -04:00
35cb101eae DataParallel fixes (#5733)
* DataParallel fixes:

1. switched to a more precise check:

```diff
-        if self.args.n_gpu > 1:
+        if isinstance(model, nn.DataParallel):
```

2. fix tests - require the same fixup under DataParallel as the training module

* another fix
2020-07-20 09:29:12 -04:00
290b6e18ac Trainer support for iterabledataset (#5834)
* Don't pass sampler for iterable dataset

* Added check for test and eval dataloaders.

* Formatting

* Don't pass sampler for iterable dataset

* Added check for test and eval dataloaders.

* Formatting

* Cleaner if nesting.

* Added test for trainer and iterable dataset

* Formatting for test

* Fixed import when torch is available only.

* Added require torch decorator to helper class

* Moved dataset class inside unittest

* Removed nested if and changed model in test

* Checking torch availability for IterableDataset
2020-07-20 09:07:37 -04:00
82dd96cae7 [model_cards] Dataset ids are case-sensitive
cc @lhoestq @thomwolf

Also cc'ing model author @nreimers => Model pages now properly link to the dataset pages (and in the future, eval results, etc.)
2020-07-20 12:47:28 +02:00
b01a8844a9 Create README.md (#5813) 2020-07-20 04:06:42 -04:00
223bad242d fix typo in (#5893) 2020-07-20 03:53:03 -04:00
d441f8d29d fix typo in training_args_tf.py (#5894) 2020-07-20 03:48:22 -04:00
09a2f40684 Seq2SeqDataset uses linecache to save memory by @Pradhy729 (#5792)
Co-authored-by: Pradhy729 <49659913+Pradhy729@users.noreply.github.com>
2020-07-18 13:57:33 -04:00
4b506a37e3 Xlnet outputs (#5883)
Slightly breaking change, changes functionality for `use_cache` in XLNet: if use_cache is True and mem_len is 0 or None (which is the case in the base model config), the model behaves like GPT-2 and returns mems to be used as past in generation. At training time `use_cache` is overridden and always True.
2020-07-18 17:33:13 +02:00
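A sketch of the described behavior, assuming the `mems` output field and forward argument of the XLNet models (checkpoint illustrative; the exact caching kwargs changed across versions):

```python
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs)

mems = out.mems  # hidden-state cache; may be None depending on config flags
if mems is not None:
    next_ids = tokenizer(" cute", return_tensors="pt").input_ids[:, :1]
    with torch.no_grad():
        out2 = model(input_ids=next_ids, mems=mems)  # reuse cache like GPT-2's `past`
```
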
a55809241f Revert "Xlnet outputs (#5881)" (#5882)
This reverts commit 13be4872123094c37eb5fab939b38967b0ad2cd0.
2020-07-18 17:15:40 +02:00
13be487212 Xlnet outputs (#5881)
Slightly breaking change, changes functionality for `use_cache` in XLNet: if use_cache is True and mem_len is 0 or None (which is the case in the base model config), the model behaves like GPT-2 and returns mems to be used as past in generation. At training time `use_cache` is overridden and always True.
2020-07-18 16:53:29 +02:00
eae6d8d14f Update tokenizers to 0.8.1.rc to fix Mac OS X issues (#5867) 2020-07-18 08:20:11 -04:00
dad5e12e54 [seq2seq] distillation.py accepts trainer arguments (#5865) 2020-07-18 07:43:57 -04:00
ba2400189b [seq2seq] MAX_LEN env var for MT commands (#5837) 2020-07-17 22:51:31 -04:00
529850ae7b Lightning Updates for v0.8.5 (#5798)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-07-17 22:43:06 -04:00
615be03f9d Revert "XLNet use_cache refactor (#5770)" (#5854)
This reverts commit 0b2da0e592fcb3a361494509031f91e1b700e4ad.
2020-07-17 20:33:44 +02:00
0b2da0e592 XLNet use_cache refactor (#5770)
Slightly breaking change, changes functionality for `use_cache` in XLNet: if use_cache is True and mem_len is 0 or None (which is the case in the base model config), the model behaves like GPT-2 and returns mems to be used as past in generation. At training time `use_cache` is overridden and always True.
2020-07-17 20:24:16 +02:00
9750e1300c Create README.md (#5847) 2020-07-17 14:03:53 -04:00
1bca4fbd39 [model_card] Fix metadata 2020-07-17 13:55:37 -04:00
a9d56a675a Added model card for neuraly/bert-base-italian-cased-sentiment (#5845)
* Added model card for neuraly/bert-base-italian-cased-sentiment

* Update model_cards/neuraly/bert-base-italian-cased-sentiment/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Gianpy15 <g.dipietro@neuraly.ai>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-17 13:50:49 -04:00
12f14710ce [Model card] Bert2Bert
Add Rouge2 results
2020-07-17 18:22:05 +02:00
9d37c56bab [Reformer] - Cache hidden states and buckets to speed up inference (#5578)
* fix merge rebase

* add intermediate reformer code

* save intermediate caching results

* save intermediate

* save intermediate results

* save intermediate

* upload next step

* fix generate tests

* make tests work

* add named tuple output

* Apply suggestions from code review

* fix use_cache for False case

* fix tensor to gpu

* fix tensor to gpu

* refactor

* refactor and make style
2020-07-17 16:17:42 +02:00
0b6c255a95 [Model card] Bert2Bert (#5841)
* Create README.md

* Update README.md

* Update README.md

* Update README.md
2020-07-17 11:41:56 +02:00
3d9556a72b [cleanups] make Marian save as Marian (#5830) 2020-07-17 02:54:25 -04:00
e238e3d55a [seq2seq] Don't copy self.source in sortishsampler (#5818) 2020-07-17 01:53:25 -04:00
2e4624b415 language tag addition on albert-mongolian (#5828)
* language tag addition on albert-mongolian

* Update model_cards/bayartsogt/albert-mongolian/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-17 01:40:38 -04:00
d088d744ad Create README.md (#5821) 2020-07-16 15:18:31 -04:00
233072fc1e dv-wave (#5823) 2020-07-16 15:13:51 -04:00
283500ff9f [seq2seq] pack_dataset.py rewrites dataset in max_tokens format (#5819) 2020-07-16 14:06:49 -04:00
c45d7a707d Update README.md (#5812)
Fix missing "-" in metadata
2020-07-16 10:25:50 -04:00
057411c56a fix longformer slow down (#5811) 2020-07-16 16:19:37 +02:00
89a78be51f fix benchmark for longformer (#5808) 2020-07-16 15:15:10 +02:00
aefc0c0429 fix benchmark non standard model (#5801) 2020-07-16 12:13:10 +02:00
8ce610bc96 Update README.md (#5789) 2020-07-16 05:26:17 -04:00
6b6d035d8f [model_card] illuin/lepetit 2020-07-16 03:50:47 -04:00
d1f74b9aff ADD ERNIE model (#5763)
* ERNIE model card

* Update Readme.md

* Update Readme.md

* Update Readme.md

* Rename Readme.md to README.md

* Update README.md

* Update Readme.md

* Update README.md

* Rename Readme.md to README.md

* Update Readme.md

* Update Readme.md

* Rename Readme.md to README.md

* Update and rename Readme.md to README.md

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-07-16 11:03:05 +08:00
3b924fabee Create distilbert squad tags 2020-07-15 17:59:06 -04:00
067814102c fix readme 2020-07-15 17:50:46 -04:00
d179fd69ca test readme change 2020-07-15 17:48:22 -04:00
63761614eb Update README.md (#5776)
Add cherry picked example for the widget

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-15 16:19:21 -04:00
221e23c6c1 Create README.md (#5781)
* Create README.md

* Update model_cards/mrm8488/RoBasquERTa/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-15 16:17:25 -04:00
d4cda29af1 Create README.md (#5782)
* Create README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-15 16:17:19 -04:00
62ec28ce4f [model_cards] Fix pierreguillou/gpt2-small-portuguese 2020-07-15 22:14:52 +02:00
a946724bbf metadata (#5758)
* metadata

* Update model_cards/pierreguillou/gpt2-small-portuguese/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-15 16:13:28 -04:00
015dc51fe3 [model_card] bert-portuguese: add language meta
cc @rodrigonogueira4 @abiocapsouza @robertoalotufo

Also cc @piegu

Thank you :)
2020-07-15 21:25:52 +02:00
1a647abf0b [fix] check code quality (#5772) 2020-07-15 14:59:38 -04:00
b23d3a5ad4 [model_cards] Switch all languages codes to ISO-639-{1,2,3} 2020-07-15 18:59:20 +02:00
d533c7e9b9 [fix] T5 ONNX test: model.to(torch_device) (#5769)
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-07-15 10:11:22 -04:00
d0486c8bc2 [cleanup] T5 test, warnings (#5761) 2020-07-15 08:23:22 -04:00
ec0a945cf9 [AutoModels] Fix config params handling of all PT and TF AutoModels (#5665)
* fix auto model causal lm

* leverage given functionality

* apply unused kwargs to all auto models
2020-07-15 09:51:14 +02:00
8ab565a4be [model_card] Fix syntax 2020-07-14 22:27:07 +02:00
92dc959224 Update README.md (#5752) 2020-07-14 15:48:59 -04:00
baf93b02c4 Update README.md (#5696) 2020-07-14 12:51:57 -04:00
5d178954c9 tiny ppl doc typo fix (#5751) 2020-07-14 10:39:44 -06:00
ac921f0385 RuPERTa model card (#5743)
* Customize inference widget input

* Update model_cards/mrm8488/RuPERTa-base/README.md

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-07-14 22:58:45 +08:00
21c1fe5290 RuDR-BERT model card (#5698) 2020-07-14 22:51:53 +08:00
2db1cc807b Norod78/hewiki-articles-distilGPT2py-il model card (#5735)
Model card for hewiki-articles-distilGPT2py-il
A tiny GPT2 model for generating Hebrew text
2020-07-14 22:50:44 +08:00
dae244ad89 GPorTuguese-2 model card (#5744) 2020-07-14 22:48:52 +08:00
b2505f7db7 Cleanup bart caching logic (#5640) 2020-07-14 06:13:05 -04:00
838950ee44 [fix] mbart_en_ro_generate test now identical to fairseq (#5731) 2020-07-14 06:12:24 -04:00
4d5a8d6557 docs(wandb): explain how to use W&B integration (#5607)
* docs(wandb): explain how to use W&B integration

fix #5262

* Also mention TensorBoard

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-14 05:12:33 -04:00
cd30f98fd2 doc: fix apparent copy-paste error in docstring (#5626) 2020-07-14 09:47:41 +02:00
f867000f56 [Reformer classification head] Implement the reformer model classification head for text classification (#5198)
* Reformer model head classification implementation for text classification

* Reformat the reformer model classification code

* PR review comments, and test case implementation for reformer for classification head changes

* CI/CD reformer for classification head test import error fix

* CI/CD test case implementation  added ReformerForSequenceClassification to all_model_classes

* Code formatting- fixed

* Normal test cases added for reformer classification head

* Fix test cases implementation for the reformer classification head

* removed token_type_id parameter from the reformer classification head

* fixed the test case for reformer classification head

* merge conflict with master fixed

* merge conflict, changed reformer classification to accept the choice_label parameter added in latest code

* refactored the the reformer classification head test code

* reformer classification head, common transform test cases fixed

* final set of the review comment, rearranging the reformer classes and docstring add to classification forward method

* fixed the compilation error and text case fix for reformer classification head

* Apply suggestions from code review

Remove unnecessary dup

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-07-14 09:16:22 +02:00
f0bda06f43 Update tokenization_t5.py (#5717)
Minor doc fix.
2020-07-14 00:02:03 -04:00
c3c61ea017 [Fix] github actions CI by reverting #5138 (#5686) 2020-07-13 17:12:18 -04:00
45addfe96d FlaubertForTokenClassification (#5644)
* implement FlaubertForTokenClassification as a subclass of XLMForTokenClassification

* fix mapping order

* add the doc

* add common tests
2020-07-13 14:59:53 -04:00
7096e47513 [Longformer] fix longformer global attention output (#5659)
* fix longformer global attention output

* fix multi gpu problem

* replace -10000 with 0

* better comment

* make attention output equal local and global

* Update src/transformers/modeling_longformer.py
2020-07-13 17:23:22 +02:00
ce374ba877 Fix Trainer in DataParallel setting (#5685)
* Fix Trainer in DataParallel setting

* Fix typo

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-07-13 08:37:38 -04:00
0a19a49dfe doc improvements (#5688) 2020-07-13 18:10:17 +08:00
443b0cad96 rename the function to match the rest of the test convention (#5692) 2020-07-13 18:09:49 +08:00
74843695eb Added first description of the model (#5672)
Added general description, information about the tags and also some example usage code.
2020-07-13 02:53:48 -04:00
0befb51327 Pipeline model type check (#5679)
* Add model type check for pipelines

* Add model type check for pipelines

* rename func

* Fix the init parameters

* Fix format

* rollback unnecessary refactor
2020-07-12 12:34:21 +08:00
dc31a72f50 Add Microsoft's CodeBERT (#5683)
* Add Microsoft's CodeBERT

* link style

* single modal

* unused import
2020-07-11 21:37:30 +08:00
7fad617dc1 Document model outputs (#5673)
* Document model outputs

* Update docs/source/main_classes/output.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-10 17:31:02 -04:00
df983b7483 Deprecate old past arguments (#5671) 2020-07-10 17:25:52 -04:00
cdf4cd7068 [squad] add version tag to squad cache (#5669) 2020-07-10 16:34:21 -04:00
223084e42b Add Reformer to notebooks 2020-07-10 18:34:25 +02:00
201d23f285 Update The Big Table of Tasks
Co-Authored-By: Suraj Patil <surajp815@gmail.com>
Co-Authored-By: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-07-10 18:07:29 +02:00
82f7bbbd93 Update README.md (#5617)
* Update README.md

* Update README.md
2020-07-10 11:43:27 -04:00
bf497376ee Create README.md (#5572) 2020-07-10 11:42:49 -04:00
3653d01f2a Create README.md for electra-base-squad2 (#5574) 2020-07-10 11:39:44 -04:00
aa69c81f29 Add freshly trained base version (#5621) 2020-07-10 11:39:04 -04:00
227e0a406d Fixed use of memories in XLNet (caching for language generation + warning when loading improper memoryless model) (#5632)
* Pytorch gpu => cpu proper device

* Memoryless XLNet warning + fixed memories during generation

* Revert "Pytorch gpu => cpu proper device"

This reverts commit 93489b36

* made black happy

* TF generation with memories

* dim => axis

* added padding_text to TF XL models

* Added comment, added TF
2020-07-10 17:38:36 +02:00
3b7b646563 Create README.md (#5638) 2020-07-10 11:38:23 -04:00
0039b965db Create model card (#5655)
Create model card for T5-small fine-tuned on SQUAD v2
2020-07-10 11:38:11 -04:00
46982d612f Create README.md - Model card (#5657)
Model card for sentence-transformers/bert-base-nli-cls-token
2020-07-10 11:38:03 -04:00
c483803d1b Create README.md - Model card (#5658)
Model card for sentence-transformers/bert-base-nli-max-tokens
2020-07-10 11:37:56 -04:00
edfd82f5ff Change model outputs types to self-document outputs (#5438)
* [WIP] Proposal for model outputs

* All Bert models

* Make CI green maybe?

* Fix ONNX test

* Isolate ModelOutput from pt and tf

* Formatting

* Add Electra models

* Auto-generate docstrings from outputs

* Add TF outputs

* Add some BERT models

* Revert TF side

* Remove last traces of TF changes

* Fail with a clear error message

* Add Albert and work through Bart

* Add CTRL and DistilBert

* Formatting

* Progress on Bart

* Renames and finish Bart

* Formatting

* Fix last test

* Add DPR

* Finish Electra and add FlauBERT

* Add GPT2

* Add Longformer

* Add MMBT

* Add MobileBert

* Add GPT

* Formatting

* Add Reformer

* Add Roberta

* Add T5

* Add Transformer XL

* Fix test

* Add XLM + fix XLMForTokenClassification

* Style + XLMRoberta

* Add XLNet

* Formatting

* Add doc of return_tuple arg
2020-07-10 11:36:53 -04:00
fa265230a2 Create Model card for RoBERTa-hindi-guj-san (#5661) 2020-07-10 11:34:23 -04:00
b2747af543 Improvements to PretrainedConfig documentation (#5642)
* Update PretrainedConfig doc

* Formatting

* Small fixes

* Forgotten args and more cleanup
2020-07-10 10:31:47 -04:00
bfacb2e34f [model_card] BART for ELI5
cc @yjernite
2020-07-10 08:10:24 -04:00
2e6bb0e9c3 Create README.md (#5652) 2020-07-10 05:41:10 -04:00
552e4591f5 [model_card] Add meta + fix link to image
(hotlinking to image works on GitHub but not on external sites)

cc @bashartalafha
2020-07-10 05:07:33 -04:00
02a0b43014 Fixed TextGenerationPipeline on torch + GPU (#5629)
* Pytorch gpu => cpu proper device

* Memoryless XLNet warning + fixed memories during generation

* Revert "Memoryless XLNet warning + fixed memories during generation"

This reverts commit 3d3251ff

* Took the operations on the generated_sequence out of the ensure_device scope
2020-07-09 16:29:32 -04:00
760f726e51 Add forum link in the docs (#5637) 2020-07-09 15:13:22 -04:00
bfeaae2235 fix 404 (#5616) 2020-07-09 15:12:29 -04:00
b25f7802de Should check that torch TPU is available (#5636) 2020-07-09 13:54:32 -04:00
3cc23eee06 More explicit error when failing to tensorize overflowing tokens (#5633) 2020-07-09 13:35:21 -04:00
b9d8af07e6 Update stable doc 2020-07-09 11:06:23 -04:00
1158e56551 Correct extension (#5631) 2020-07-09 11:03:07 -04:00
5c82bf6831 Update stable doc 2020-07-09 10:16:13 -04:00
0533cf4706 Test XLA examples (#5583)
* Test XLA examples

* Style

* Using `require_torch_tpu`

* Style

* No need for pytest
2020-07-09 09:19:19 -04:00
3bd55199cd QA pipeline BART compatible (#5496)
* Ensure padding and question cannot have higher probs than context.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Add bart to the list of tokenizers adding two <sep> tokens for squad_convert_example_to_feature

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Format.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @patrickvonplaten comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @patrickvonplaten comments about masking non-context element when generating the answer.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing @sshleifer comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Make sure we mask CLS after handling impossible answers

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Mask in the correct vectors ...

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-07-09 15:11:40 +02:00
fa5423b169 doc fixes (#5613) 2020-07-08 19:52:44 -04:00
7d0ef00420 Add newly trained calbert-tiny-uncased (#5599)
* Create README.md

Add newly trained `calbert-tiny-uncased` (complete rewrite with SentencePiece)

* Add Exbert link

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-08 17:54:51 -04:00
0cc4eae0e6 Fix Inconsistent NER Grouping (Pipeline) (#4987)
* Add B I handling to grouping

* Add fix to include separate entity as last token

* move last_idx definition outside loop

* Use first entity in entity group as reference for entity type

* Add test cases

* Take out extra class accidentally added

* Return tf ner grouped test to original

* Take out redundant last entity

* Get last_idx safely

Co-authored-by: ColleterVi <36503688+ColleterVi@users.noreply.github.com>

* Fix first entity comment

* Create separate functions for group_sub_entities and group_entities (splitting call method to testable functions)

* Take out unnecessary last_idx

* Remove additional forward pass test

* Move token classification basic tests to separate class

* Move token classification basic tests back to monocolumninputtestcase

* Move base ner tests to nerpipelinetests

* Take out unused kwargs

* Add back mandatory_keys argument

* Add unitary tests for group_entities in _test_ner_pipeline

* Fix last entity handling

* Fix grouping fucntion used

* Add typing to group_sub_entities and group_entities

Co-authored-by: ColleterVi <36503688+ColleterVi@users.noreply.github.com>
2020-07-08 16:18:17 -04:00
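A usage sketch for the grouping being fixed, assuming the `grouped_entities` flag of that era (newer releases expose this as `aggregation_strategy`):

```python
from transformers import pipeline

ner = pipeline("ner", grouped_entities=True)  # merge B-/I- pieces into spans
for ent in ner("Hugging Face is based in New York City."):
    print(ent["entity_group"], ent["word"])
```
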
82ce8488bb create model cards for qg models (#5610) 2020-07-08 16:08:56 -04:00
d6b6ab11f0 Create README.md (#5601) 2020-07-08 16:07:48 -04:00
40d98ebf50 Update benchmark notebook (#5603)
* Created with Colaboratory

* delete old file
2020-07-08 16:03:59 +02:00
281e394889 Update question template (#5585) 2020-07-08 08:46:35 -04:00
f82a2a5e8e [Benchmark] Add benchmarks for TF Training (#5594)
* tf_train

* adapt timing for tpu

* fix timing

* fix timing

* fix timing

* fix timing

* update notebook

* add tests
2020-07-08 12:11:09 +02:00
cfbb982974 Add DeeBERT (entropy-based early exiting for *BERT) (#5477)
* Add deebert code

* Add readme of deebert

* Add test for deebert

Update test for Deebert

* Update DeeBert (README, class names, function refactoring); remove requirements.txt

* Format update

* Update test

* Update readme and model init methods
2020-07-08 08:17:59 +08:00
b4b33fdf25 Guide to fixed-length model perplexity evaluation (#5449)
* add first draft ppl guide

* upload imgs

* expand on strides

* ref typo

* rm superfluous past var

* add tokenization disclaimer
2020-07-07 16:04:15 -06:00
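A condensed sketch of the strided evaluation the guide describes (window and stride values illustrative):

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

model = GPT2LMHeadModel.from_pretrained("gpt2").eval()
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

input_ids = tokenizer("some long evaluation text " * 200, return_tensors="pt").input_ids

max_length, stride = 512, 256
nlls, prev_end = [], 0
for begin in range(0, input_ids.size(1), stride):
    end = min(begin + max_length, input_ids.size(1))
    trg_len = end - prev_end            # only score tokens not already scored
    target_ids = input_ids[:, begin:end].clone()
    target_ids[:, :-trg_len] = -100     # mask the overlapping context
    with torch.no_grad():
        loss = model(input_ids[:, begin:end], labels=target_ids).loss
    nlls.append(loss * trg_len)
    prev_end = end
    if end == input_ids.size(1):
        break

print(torch.exp(torch.stack(nlls).sum() / prev_end))  # fixed-length perplexity
```
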
fde217c679 readme for benchmark (#5363) 2020-07-07 23:21:23 +02:00
d6eab53058 mbart.prepare_translation_batch: pass through kwargs (#5581) 2020-07-07 13:46:05 -04:00
353b8f1e7a Add mbart-large-cc25, support translation finetuning (#5129)
improve unittests for finetuning, especially w.r.t. testing frozen parameters
fix freeze_embeds for T5
add streamlit setup.cfg
2020-07-07 13:23:01 -04:00
141492448b Create xlm-roberta-large-finetuned-conll03-german-README.md
cc @BramVanroy
2020-07-07 13:15:10 -04:00
4dc65591b5 [Almost all TF models] TF clean up: add missing CLM / MLM loss; fix T5 naming and keras compile (#5395)
* add first version of clm tf

* make style

* add more tests for bert

* update tf clm loss

* fix tests

* correct tf ner script

* add mlm loss

* delete bogus file

* clean tf auto model + add tests

* finish adding clm loss everywhere

* fix training in distilbert

* fix flake8

* save intermediate

* fix tf t5 naming

* remove prints

* finish up

* up

* fix tf gpt2

* fix new test utils import

* fix flake8

* keep backward compatibility

* Update src/transformers/modeling_tf_albert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_electra.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_roberta.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_mobilebert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_bert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/modeling_tf_distilbert.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply sylvains suggestions

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-07-07 18:15:53 +02:00
33e43edddc [docs] fix model_doc links in model summary (#5566)
* fix model_doc links

* update model links
2020-07-07 11:06:12 -04:00
4fedc1256c Fix tests imports dpr (#5576)
* fix test imports

* fix max_length

* style

* fix tests
2020-07-07 16:35:12 +02:00
d4886173b2 [Bart] enable test_torchscript, update test_tie_weights (#5457)
* Passing all but one torchscript test

* Style

* move comment

* remove unneeded assert
2020-07-07 10:06:48 -04:00
e49393c361 [examples] Add trainer support for question-answering (#4829)
* add SquadDataset

* add DataCollatorForQuestionAnswering

* update __init__

* add run_squad with  trainer

* add DataCollatorForQuestionAnswering in __init__

* pass data_collator to trainer

* doc tweak

* Update run_squad_trainer.py

* Update __init__.py

* Update __init__.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-07-07 08:57:08 -04:00
fbd8792195 Add DPR model (#5279)
* beginning of dpr modeling

* wip

* implement forward

* remove biencoder + better init weights

* export dpr model to embed model for nlp lib

* add new api

* remove old code

* make style

* fix dumb typo

* don't load bert weights

* docs

* docs

* style

* move the `k` parameter

* fix init_weights

* add pretrained configs

* minor

* update config names

* style

* better config

* style

* clean code based on PR comments

* change Dpr to DPR

* fix config

* switch encoder config to a dict

* style

* inheritance -> composition

* add messages in assert statements

* add dpr reader tokenizer

* one tokenizer per model

* fix base_model_prefix

* fix imports

* typo

* add convert script

* docs

* change tokenizers conf names

* style

* change tokenizers conf names

* minor

* minor

* fix wrong names

* minor

* remove unused convert functions

* rename convert script

* use return_tensors in tokenizers

* remove n_questions dim

* move generate logic to tokenizer

* style

* add docs

* docs

* quality

* docs

* add tests

* style

* add tokenization tests

* DPR full tests

* Stay true to the attention mask building

* update docs

* missing param in bert input docs

* docs

* style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-07-07 08:56:12 -04:00
d2a9399115 Update model card (#5491) 2020-07-07 18:43:49 +08:00
2e653d89d7 Update model card (#5492) 2020-07-07 18:43:34 +08:00
beaf60e589 bert-turkish-text-classification model card (#5493) 2020-07-07 18:43:09 +08:00
e6eba8419c electra-small-finetuned-squadv1 model card (#5430)
* Create model card

Create model card for electra-small-discriminator finetuned on SQUAD v1.1

* Set right model path in code example
2020-07-07 18:41:42 +08:00
43b7ad5df5 ukr-roberta-base model card (#5514) 2020-07-07 18:40:23 +08:00
87aa857d7e roberta-base-1B-1-finetuned-squadv1 model card (#5515) 2020-07-07 18:39:09 +08:00
c7d96b60e4 zuBERTa model card (#5536)
* Create README

* Update README.md

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-07-07 18:38:15 +08:00
b95dfcf110 roberta-base-1B-1-finetuned-squadv2 model card (#5523) 2020-07-07 18:33:42 +08:00
6912265711 Make T5 compatible with ONNX (#5518)
* Default decoder inputs to encoder ones for T5 if neither are specified.

* Fixing typo, now all tests are passing.

* Changing einsum to operations supported by onnx

* Adding a test to ensure T5 can be exported to onnx op>9

* Modified test for onnx export to make it faster

* Styling changes.

* Styling changes.

* Changing notation for matrix multiplication

Co-authored-by: Abel Riboulot <tkai@protomail.com>
2020-07-07 11:32:29 +02:00
989ae326b5 [Reformer] Adapt Reformer MaskedLM Attn mask (#5560)
* fix attention mask

* fix slow test

* refactor attn masks

* fix fp16 generate test
2020-07-07 10:48:06 +02:00
3dcb748e31 Added data collator for permutation (XLNet) language modeling and related calls (#5522)
* Added data collator for XLNet language modeling and related calls

Added DataCollatorForXLNetLanguageModeling in data/data_collator.py
to generate necessary inputs for language modeling training with
XLNetLMHeadModel. Also added related arguments, logic and calls in
examples/language-modeling/run_language_modeling.py.

Resolves: #4739, #2008 (partially)

* Changed name to `DataCollatorForPermutationLanguageModeling`

Changed the name of `DataCollatorForXLNetLanguageModeling` to the more general `DataCollatorForPermutationLanguageModeling`.
Removed the `--mlm` flag requirement for the new collator and defined a separate `--plm_probability` flag for its use.
CTRL uses a CLM loss just like GPT and GPT-2, so it should work out of the box with this script (provided `past` is taken care of
similarly to `mems` for XLNet).
Changed calls and imports appropriately.

* Added detailed comments, changed variable names

Added more detailed comments to `DataCollatorForPermutationLanguageModeling` in `data/data_collator.py` to explain working. Also cleaned up variable names and made them more informative.

* Added tests for new data collator

Added tests in `tests/test_trainer.py` for DataCollatorForPermutationLanguageModeling based on those in DataCollatorForLanguageModeling. A specific test has been added to check for odd-length sequences.

* Fixed styling issues
2020-07-07 10:17:37 +02:00
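A usage sketch for the new collator, assuming the final class name and the even-length requirement mentioned above:

```python
import torch
from transformers import XLNetTokenizer, DataCollatorForPermutationLanguageModeling

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
collator = DataCollatorForPermutationLanguageModeling(
    tokenizer=tokenizer, plm_probability=1 / 6, max_span_length=5
)

# Token-id sequences must have an even length (see the odd-length test above).
examples = [torch.randint(5, 1000, (10,)) for _ in range(2)]
batch = collator(examples)
print(sorted(batch))  # ['input_ids', 'labels', 'perm_mask', 'target_mapping']
```
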
1d2332861f Post v3.0.2 release commit 2020-07-06 18:56:47 -04:00
b0892fa0e8 Release: v3.0.2 2020-07-06 18:49:44 -04:00
f1e2e423ab Fix fast tokenizers too (#5562) 2020-07-06 18:45:01 -04:00
5787e4c159 Various tokenizers fixes (#5558)
* BertTokenizerFast - Do not specify strip_accents by default

* Bump tokenizers to new version

* Add test for AddedToken serialization
2020-07-06 18:27:53 -04:00
21f28c34b7 Fix #5507 (#5559)
* Fix #5507

* Fix formatting
2020-07-06 17:26:48 -04:00
9d9b872b66 The add_space_before_punct_symbol is only for TransfoXL (#5549) 2020-07-06 12:17:05 -04:00
d6b0b9d451 GPT2 tokenizer should not output token type IDs (#5546)
* GPT2 tokenizer should not output token type IDs

* Same for OpenAIGPT
2020-07-06 11:33:57 -04:00
7833b21a5a Fix #5544 (#5551) 2020-07-06 11:22:24 -04:00
c473484087 Fix the tokenization warning noted in #5505 (#5550)
* fix warning

* style and quality
2020-07-06 11:15:25 -04:00
1bbc28bee7 Imports organization 2020-07-06 10:27:10 -04:00
1bc13697b1 Update convert_pytorch_checkpoint_to_tf2.py (#5531)
fixed ImportError: cannot import name 'hf_bucket_url'
2020-07-06 09:55:10 -04:00
b2309cc6bf Typo fix in training doc (#5495) 2020-07-06 09:15:22 -04:00
7ecff0ccbb Fix typo in training (#5510) 2020-07-06 09:14:57 -04:00
58cca47c16 [cleanup] TF T5 tests only init t5-base once. (#5410) 2020-07-03 14:27:49 -04:00
991172922f better error message (#5497) 2020-07-03 19:25:25 +02:00
b58a15a31e unpinning specific git versions in setup.py 2020-07-03 17:38:39 +02:00
fedabcd154 Release: 3.0.1 2020-07-03 17:02:44 +02:00
17ade127b9 Exposing prepare_for_model for both slow & fast tokenizers (#5479)
* Exposing prepare_for_model for both slow & fast tokenizers

* Update method signature

* The traditional style commit

* Hide the warnings behind the verbose flag

* update default truncation strategy and prepare_for_model

* fix tests and prepare_for_models methods

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-07-03 16:51:21 +02:00
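A sketch of the now-shared entry point (tokenizer choice illustrative):

```python
from transformers import BertTokenizer, BertTokenizerFast

for cls in (BertTokenizer, BertTokenizerFast):
    tok = cls.from_pretrained("bert-base-uncased")
    ids = tok.convert_tokens_to_ids(tok.tokenize("Hello world"))
    # prepare_for_model adds special tokens, truncates and pads, with the
    # same signature on slow and fast tokenizers after this change.
    enc = tok.prepare_for_model(ids, max_length=8, padding="max_length", truncation=True)
    print(cls.__name__, enc["input_ids"])
```
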
814ed7ee76 Create model card (#5396)
Create model card for electricidad-small (Spanish Electra) fine-tuned on SQUAD-esv1
2020-07-03 08:29:09 -04:00
49281ac939 grammar corrections and train data update (#5448)
- fixed grammar and spelling
- added an intro
- updated Training data references
2020-07-03 08:25:57 -04:00
97355339f6 Update upstream (#5456) 2020-07-03 08:16:27 -04:00
55b932a818 Create model card (#5464)
Create model card for electra-small-discriminator fine-tuned on SQUAD v2.0
2020-07-03 06:19:49 -04:00
21cd8c4086 QA Pipelines fixes (#5429)
* Make the QA pipeline support models with more than 2 outputs, such as BART, assuming start/end are the first two outputs.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* When using the new padding/truncation paradigm, setting padding="max_length" + max_length=X actually pads the input up to max_length.

This results in every sample going through the QA pipeline being of size 384 whatever the actual input size is, making the overall pipeline very slow.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Mask padding & question before applying softmax. Softmax has been refactored to operate in log space for speed and stability.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Format.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Use PaddingStrategy.LONGEST instead of DO_NOT_PAD

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Revert "When using the new padding/truncation paradigm setting padding="max_length" + max_length=X actually pads the input up to max_length."

This reverts commit 1b00a9a2

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Trigger CI after unattended failure

* Trigger CI
2020-07-03 10:29:20 +02:00
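A small torch-only illustration of the log-space point above (shapes illustrative, not the pipeline's actual code):

```python
import torch

start_logits, end_logits = torch.randn(384), torch.randn(384)

# Naive: multiply start/end probabilities (products of small numbers underflow).
p = torch.softmax(start_logits, -1)[:, None] * torch.softmax(end_logits, -1)[None, :]

# Log space: add log-probabilities and exponentiate once at the end.
log_p = torch.log_softmax(start_logits, -1)[:, None] + torch.log_softmax(end_logits, -1)[None, :]

assert torch.allclose(p, log_p.exp(), atol=1e-6)
```
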
8438bab38e Fix roberta model ordering for TFAutoModel (#5414) 2020-07-02 19:23:55 -04:00
6b735a7253 Tokenizer summary (#5467)
* Work on tokenizer summary

* Finish tutorial

* Link to it

* Apply suggestions from code review

Co-authored-by: Anthony MOI <xn1t0x@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Add vocab definition

Co-authored-by: Anthony MOI <xn1t0x@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-02 17:07:42 -04:00
ef0e9d806c Update: ElectraDiscriminatorPredictions forward. (#5471)
`ElectraDiscriminatorPredictions.forward` should not need `attention_mask`.
2020-07-02 13:57:33 -04:00
13a8588f2d Create model card (#5432)
Create model card for electra-base-discriminator fine-tuned on SQUAD v1.1
2020-07-02 10:16:30 -04:00
a0a6387a0d [model_cards] roberta-large-mnli: fix sep_token 2020-07-02 10:04:02 -04:00
215db688da Create roberta-large-mnli-README.md 2020-07-02 09:43:54 -04:00
69d313e808 Bans SentencePiece 0.1.92 (#5418) 2020-07-02 09:23:00 -04:00
84e56669af Fix typo in glossary (#5466) 2020-07-02 09:19:33 -04:00
c6a510c6fa Fixing missing arguments for TransfoXL tokenizer when using TextGenerationPipeline (#5465)
* overriding `_parse_and_tokenize` in `TextGenerationPipeline` to allow for TransfoXL tokenizer arguments
2020-07-02 13:53:33 +02:00
6726416e4a Changed expected_output_ids in TransfoXL generation test (#5462)
* Changed expected_output_ids in TransfoXL generation test to match #4826 generation PR.

* making black happy

* making isort happy
2020-07-02 11:56:44 +02:00
812def00c9 fix use of mems in Transformer-XL (#4826)
Fixed duplicated memory use in Transformer-XL generation, leading to bad predictions and poor performance.
2020-07-02 11:19:07 +02:00
306f1a2695 Add Reformer MLM notebook (#5450)
* Add Reformer MLM notebook

* Update notebooks/README.md
2020-07-02 00:20:49 +02:00
d16e36c7e5 [Reformer] Add Masked LM Reformer (#5426)
* fix conflicts

* fix

* happy rebasing
2020-07-01 22:43:18 +02:00
f4323dbf8c Don't discard entity_group when the token is the last in the sequence. (#5439)
Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-07-01 20:30:42 +02:00
35befd9ce3 Fix tensor label type inference in default collator (#5250)
* allow tensor label inputs to default collator

* replace try/except with type check
2020-07-01 10:40:14 -06:00
fe81f7d12c finish reformer qa head (#5433) 2020-07-01 12:27:14 -04:00
d697b6ca75 [Longformer] Major Refactor (#5219)
* refactor naming

* add small slow test

* refactor

* refactor naming

* rename selected to extra

* big global attention refactor

* make style

* refactor naming

* save intermed

* refactor functions

* finish function refactor

* fix tests

* fix longformer

* fix longformer

* fix longformer

* fix all tests but one

* finish longformer

* address sams and izs comments

* fix transpose
2020-07-01 17:43:32 +02:00
e0d58ddb65 [fix] Marian tests import (#5442) 2020-07-01 11:42:22 -04:00
608d5a7c44 Raises PipelineException on FillMaskPipeline when there are != 1 mask_token in the input (#5389)
* Added PipelineException

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fill-mask pipeline raises exception when more than one mask_token detected.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Put everything in a function.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Added tests on pipeline fill-mask when input has != 1 mask_token

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Fix numel() computation for TF

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Addressing PR comments.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Remove function typing to avoid import on specific framework.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Quality.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Retry typing with @julien-c tip.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Quality².

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Simplify fill-mask mask_token checking.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* Trigger CI
2020-07-01 17:27:47 +02:00
6c55e9fc32 Fix dropdown bug in searches (#5440)
* Trigger CI

* Fix dropdown bug in searches
2020-07-01 11:02:59 -04:00
734a28a767 Clean up diffs in Trainer/TFTrainer (#5417)
* Cleanup and unify Trainer/TFTrainer

* Forgot to adapt TFTrainingArgs

* In tf scripts n_gpu -> n_replicas

* Update src/transformers/training_args.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comments

* Formatting

* Fix typo

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-07-01 11:00:20 -04:00
43cb03a93d MarianTokenizer.prepare_translation_batch uses new tokenizer API (#5182) 2020-07-01 10:32:50 -04:00
13deb95a40 Move tests/utils.py -> transformers/testing_utils.py (#5350) 2020-07-01 10:31:17 -04:00
9c219305f5 Trigger CI 2020-07-01 10:22:50 -04:00
64e3d966b1 Add support for past states (#5399)
* Add support for past states

* Style and forgotten self

* You mean, documenting is not enough? I have to actually add it too?

* Add memory support during evaluation

* Fix tests in eval and add TF support

* No need to change this line anymore
2020-07-01 08:11:55 -04:00
4ade7491f4 Fix examples titles and optimization doc page (#5408) 2020-07-01 08:11:25 -04:00
d60d231ea4 Create README.md (#5422)
* Create README.md

* Update model_cards/MoseliMotsoehli/TswanaBert/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-07-01 05:01:51 -04:00
298bdab18a Create model card for schmidek/electra-small-cased (#5400) 2020-07-01 04:01:56 -04:00
fcf0652460 Fix TensorFlow dataset generator (#4881)
* fix TensorFlow generator

* Better features handling

* Apply style

* Apply style

* Fix squad as well

* Apply style

* Better factorization of TF Tensors creation
2020-06-30 19:49:11 -04:00
501040fd30 In the run_ner.py example, give the optional label arg a default value (#5326)
Otherwise, if label is not specified, the following error occurs:

	Traceback (most recent call last):
	  File "run_ner.py", line 303, in <module>
	    main()
	  File "run_ner.py", line 101, in main
	    model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
	  File "/home/user/anaconda3/envs/bert/lib/python3.7/site-packages/transformers/hf_argparser.py", line 159, in parse_json_file
	    obj = dtype(**inputs)
	TypeError: __init__() missing 1 required positional argument: 'labels'
2020-06-30 19:45:35 -04:00
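
The fix boils down to giving the optional field a default so HfArgumentParser can construct the dataclass without it; a sketch under that assumption (field name and help text illustrative):

```python
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataTrainingArguments:
    # With a default, omitting "labels" from the JSON config no longer
    # raises the TypeError shown above.
    labels: Optional[str] = field(
        default=None,
        metadata={"help": "Path to a file containing all labels."},
    )
```
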
b45e65efa0 Avoid deprecation warning for F.tanh (#5413) 2020-06-30 16:41:43 -04:00
23231c0f78 [GH Runner] fix yaml indent (#5412) 2020-06-30 16:17:12 -04:00
ac61114592 [CI] gh runner doesn't use -v, cats new result (#5409) 2020-06-30 16:12:14 -04:00
27a7fe7a8d examples/seq2seq: never override $WANDB_PROJECT (#5407) 2020-06-30 15:29:13 -04:00
32d2031458 [fix] slow fill_mask test failure (#5406) 2020-06-30 15:28:15 -04:00
80aa4b8aa6 [CI] GH-runner stores artifacts like CircleCI (#5318) 2020-06-30 15:01:53 -04:00
87716a6d07 Documentation for the Trainer API (#5383)
* Documentation for the Trainer API

* Address review comments

* Address comments
2020-06-30 11:43:43 -04:00
c4d4e8bdbd Move GenerationMixin to separate file (#5254)
* separate_generation_code

* isort

* renamed

* rename_files

* move_shapelit
2020-06-30 10:42:08 -04:00
90d13954c4 Repin versions 2020-06-30 09:16:36 -04:00
0607b88945 How to share model cards with the CLI (#5374)
* How to share model cards

* Switch the two options

* Fix bad copy/cut

* Julien's suggestion
2020-06-30 08:59:32 -04:00
331d8d2936 Upload DistilBART artwork (#5394) 2020-06-30 18:11:11 +08:00
09e841490c Model Card Fixing (#5369)
- Fix missing ```-``` in language meta
- T5 pic uploaded to a more permanent place
2020-06-30 18:02:24 +08:00
4c5bed192a Model Card Fixing (#5373)
- T5 pic uploaded to a more permanent place
2020-06-30 18:01:45 +08:00
02509d4b06 Model Card Fixing (#5371)
- Model pic uploaded to a more permanent place
2020-06-30 18:01:11 +08:00
79f0118c72 Model Card Fixing (#5370)
- Fix missing ```-``` in language meta
- T5 pic uploaded to a more permanent place
2020-06-30 18:00:29 +08:00
9a473f1e43 Update Bertabs example to work again (#5355)
* Fix the 'Attempted relative import with no known parent package' bug when using the bertabs example. Also change the model used from bertabs-finetuned-cnndm, since it no longer seems to be accessible

* Update run_summarization.py

Co-authored-by: Kevin Canwen Xu <canwenxu@126.com>
2020-06-30 14:05:01 +08:00
7f60e93ac5 Mention openAI model card and merge content (#5378)
* Mention openAI model card and merge content

* Fix sentence
2020-06-29 18:27:36 -04:00
482a5993c2 Fix model card folder name so that it is consistent with model hub (#5368)
* Merge upstream

* Merge upstream

* Add generate.py link

* Merge upstream

* Merge upstream

* Fix folder name
2020-06-29 12:54:30 -04:00
97f24303e8 Add link to file and fix typos in model card (#5367)
* Merge upstream

* Merge upstream

* Add generate.py link
2020-06-29 11:34:52 -04:00
b9ee87f5c7 Doc for v3.0.0 (#5366)
* Doc for v3.0.0

* Update docs/source/_static/js/custom.js

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/_static/js/custom.js

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-06-29 11:08:54 -04:00
b62ca59527 Release: v3.0.0 2020-06-29 10:40:13 -04:00
a316a6aaa8 [seq2seq docs] Move evaluation down, fix typo (#5365) 2020-06-29 10:36:04 -04:00
4bcc35cd69 [Docs] Benchmark docs (#5360)
* first doc version

* add benchmark docs

* fix typos

* improve README

* Update docs/source/benchmarks.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* fix naming and docs

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-29 16:08:57 +02:00
482c9178d3 Pin mecab for now (#5362) 2020-06-29 09:51:13 -04:00
2513fe0d02 added subtitle for recent contributors in readme (#5130) 2020-06-29 09:05:08 -04:00
30245c0c60 Fix table format for test results (#5357) 2020-06-29 09:02:33 -04:00
c34010551a Create model card (#5356) 2020-06-29 09:01:55 -04:00
01aa0b8527 Create README.md (#5353) 2020-06-29 08:58:30 -04:00
96907367f1 arxiv-ai-gpt2 model card (#5337)
* Add model card and generation script for model arxiv_ai_gpt2

* Update arxiv-ai-gpt2 model card

Remove unnecessary lines

* Delete code in model cards
2020-06-29 08:53:20 -04:00
3cdf8b7ec2 Create model card for asafaya/bert-mini-arabic (#5352)
* Create README.md

* Update model_cards/asafaya/bert-mini-arabic/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-29 08:41:41 -04:00
9db1f41604 Create README.md (#5351) 2020-06-29 08:36:00 -04:00
c950fef545 [docs] Small tweaks to #5323 2020-06-29 14:24:33 +02:00
4544f906e2 model cards for roberta and bert-multilingual (#5324)
* More model cards (cc @myleott)

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-29 05:06:05 -04:00
92671532e7 More model cards 2020-06-29 10:58:54 +02:00
9209d36f93 Added a model card README.md for my pretrained model. (#5325)
* Create README.md

* Removed unnecessary link from README.md

* Update README.md
2020-06-29 16:29:14 +08:00
7cb52f53ef Fix LR decay in TF Trainer (#5269)
* Recover old PR

* Apply style

* Trigger CI
2020-06-29 14:38:32 +08:00
321c05abab Model cards for finance-koelectra models (#5313)
* Add finance-koelectra readme card

* Add finance-koelectra readme card

* Add finance-koelectra readme card

* Add finance-koelectra readme card
2020-06-29 13:47:44 +08:00
28a690a80e [mBART] skip broken forward pass test, stronger integration test (#5327) 2020-06-28 15:08:28 -04:00
45e26125de save_pretrained: mkdir(exist_ok=True) (#5258)
* all save_pretrained methods mkdir if not os.path.exists
2020-06-28 14:53:47 -04:00
12dfbd4f7a [examples] fix example links (#5344) 2020-06-28 12:54:54 -04:00
98109464c1 clean reformer reverse sort (#5343) 2020-06-28 14:32:25 +02:00
1af58c0706 New model sharing tutorial (#5323) 2020-06-27 11:10:02 -04:00
efae6645e2 Fix xxx_length behavior when using XLNet in pipeline (#5319) 2020-06-27 11:09:51 -04:00
393b8dc09a examples/seq2seq/run_eval.py fixes and docs (#5322) 2020-06-26 19:20:43 -04:00
5543b30aa6 [pl_examples] default warmup steps=0 (#5316) 2020-06-26 15:03:41 -04:00
bf0d12c220 CircleCI stores cleaner output at test_outputs.txt (#5291) 2020-06-26 13:59:31 -04:00
601d4d699c [tokenizers] Updates data processors, docstring, examples and model cards to the new API (#5308)
* remove references to old API in docstring - update data processors

* style

* fix tests - better type checking error messages

* better type checking

* include awesome fix by @LysandreJik for #5310

* updated doc and examples
2020-06-26 19:48:14 +02:00
fd405e9a93 Add BART-base modeling and configuration (#5315) 2020-06-27 00:53:10 +08:00
798dbff6a7 [pipelines] Change summarization default to distilbart-cnn-12-6 (#5289) 2020-06-26 11:43:23 -04:00
834b6884c5 Add benchmark notebook (#5312)
* add notebook

* Created with Colaboratory

* move notebook to correct folder

* correct link

* correct filename

* correct filename

* better name
2020-06-26 17:38:13 +02:00
08c9607c3d [Generation] fix docs for decoder_input_ids (#5306)
* fix docs

* Update src/transformers/modeling_utils.py

* Update src/transformers/modeling_tf_utils.py

* Update src/transformers/modeling_tf_utils.py

* Update src/transformers/modeling_utils.py

* Update src/transformers/modeling_tf_utils.py

* Update src/transformers/modeling_utils.py
2020-06-26 16:58:11 +02:00
79a82cc06a [Benchmarks] improve Example Plotter (#5245)
* improve plotting

* better labels

* fix time plot
2020-06-26 15:00:14 +02:00
88d7f96e33 Gpt2 model card (#5283)
* Bert base model card

* Add metadata

* Adapt examples

* GPT2 model card

* Remove the BERT model card

* Change language code
2020-06-26 08:08:31 -04:00
fc5bce9e60 Bert base model card (#5276)
* Bert base model card

* Add metadata

* Adapt examples

* Comment on text generation

* Update model_cards/bert-base-uncased-README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-26 08:01:19 -04:00
135791e8ef Add pad_to_multiple_of on tokenizers (reimport) (#5054)
* Add new parameter `pad_to_multiple_of` on tokenizers.

* unittest for pad_to_multiple_of

* Add .name when logging enum.

* Fix missing .items() on dict in tests.

* Add special check + warning if the tokenizer doesn't have proper pad_token.

* Use the correct logger format specifier.

* Ensure tokenizers with no pad_token do not modify the underlying padding strategy.

* Skip test if tokenizer doesn't have pad_token

* Fix RobertaTokenizer on empty input

* Format.

Signed-off-by: Morgan Funtowicz <funtowiczmo@gmail.com>

* fix and update to a simpler API

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-06-26 11:55:57 +02:00
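
Typical intended use of the new parameter, as a hedged sketch (checkpoint name illustrative): rounding padded lengths up to a multiple of 8 yields tensor-core-friendly shapes.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(
    ["a short sentence", "a somewhat longer example sentence"],
    padding=True,
    pad_to_multiple_of=8,  # padded length rounded up to a multiple of 8
)
assert len(batch["input_ids"][0]) % 8 == 0
```
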
7cc15bdd96 Closes #5218 2020-06-25 18:19:21 -04:00
2ffef0d0c7 Training & fine-tuning quickstart (#5034)
* add initial fine-tuning guide

* split code blocks to smaller segments

* fix up trainer section of fine-tune doc

* a few last typos

* Update usage -> task summary link

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-06-25 15:11:11 -06:00
364a5ae1f0 Refactor Code samples; Test code samples (#5036)
* Refactor code samples

* Test docstrings

* Style

* Tokenization examples

* Run rest of tests

* First step to testing source docs

* Style and BART comment

* Test the remainder of the code samples

* Style

* let to const

* Formatting fixes

* Ready for merge

* Fix fixture + Style

* Fix last tests

* Update docs/source/quicktour.rst

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Addressing @sgugger's comments + Fix MobileBERT in TF

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2020-06-25 16:46:00 -04:00
315f464b0a [tokenizers] Several small improvements and bug fixes (#5287)
* avoid recursion in id checks for fast tokenizers

* better typings and fix #5232

* align slow and fast tokenizers behaviors for Roberta and GPT2

* style and quality

* fix tests - improve typings
2020-06-25 22:17:14 +02:00
24f46ea3f3 Remove links for all docs (#5280) 2020-06-25 11:45:05 -04:00
27cf1d97f0 [Tokenization] Fix #5181 - make #5155 more explicit - move back the default logging level in tests to WARNING (#5252)
* fix-5181

Padding to the max sequence length while truncating to another length was wrong on slow tokenizers

* clean up and fix #5155

* fix XLM test

* Fix tests for Transfo-XL

* logging only above WARNING in tests

* switch slow tokenizer tests to @slow

* fix Marian truncation tokenization test

* style and quality

* make the test a lot faster by limiting the sequence length used in tests
2020-06-25 17:24:28 +02:00
e008d520bb [examples/seq2seq] more README improvements (#5274) 2020-06-25 10:13:01 -04:00
6a495cae00 [model_cards] Example of how to specify inputs for the widget 2020-06-25 15:58:25 +02:00
0e1fce3c01 Fix convert_graph_to_onnx (#5230) 2020-06-25 08:17:02 +02:00
5543efd5cc Create README.md (#5259) 2020-06-25 01:56:07 -04:00
40457bcebb examples/seq2seq supports translation (#5202) 2020-06-24 23:58:11 -04:00
d12ceb48ba Tokenization tutorial (#5257)
* All done

* Link to the tutorial

* Typo fixes

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Add mention of the return_xxx args

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-06-24 18:43:20 -04:00
7ac9110711 Add more tests on tokenizers serialization - fix bugs (#5056)
* update tests for fast tokenizers + fix small bug in saving/loading

* better tests on serialization

* fixing serialization

* comment cleanup
2020-06-24 21:53:08 +02:00
0148c262e7 Fix first test (#5255) 2020-06-24 15:16:04 -04:00
70c1e1d2d5 Use master _static (#5253)
* Use _static from master everywhere

* Copy to existing too
2020-06-24 15:06:14 -04:00
4965aee064 [HANS] Fix label_list for RoBERTa/BART (class flipping) (#5196)
* fix weirdness in roberta/bart for mnli trained checkpoints

* black compliance

* isort code check
2020-06-24 14:38:15 -04:00
fc24a93e64 [HfApi] Add support for pipeline_tag 2020-06-24 16:54:00 +00:00
0a3d0e02c5 Replace labels with -100 to skip loss calc (#4718) 2020-06-24 12:14:50 -04:00
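
-100 is the default ignore_index of PyTorch's cross-entropy loss, so masking labels this way excludes those positions from the loss; a minimal sketch:

```python
import torch

input_ids = torch.tensor([[101, 2023, 2003, 102, 0, 0]])  # 0 = padding
labels = input_ids.clone()
labels[labels == 0] = -100  # padding positions now contribute nothing to the loss
```
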
6894b486d0 Fix version controller links (for realsies) (#5251) 2020-06-24 12:13:43 -04:00
1121ce9f98 Model cards for Hate-speech-CNERG models (#5236)
* Add dehatebert-mono-arabic readme card

* Update dehatebert-mono-arabic model card

* model cards for Hate-speech-CNERG models
2020-06-24 11:41:08 -04:00
cf10d4cfdd Cleaning TensorFlow models (#5229)
* Cleaning TensorFlow models

Update all classes


style

* Don't average loss
2020-06-24 11:37:20 -04:00
609e0c583f Fix links (#5248) 2020-06-24 11:35:55 -04:00
c9163a8d5a delay decay schedule until the end of warmup (#4940) 2020-06-24 11:18:29 -04:00
f216b60671 Fix deploy doc (#5246)
* Try with the same command

* Try like this
2020-06-24 10:59:06 -04:00
49f6e7a3c6 Add some prints to debug (#5244) 2020-06-24 10:37:01 -04:00
c2a26ec8a6 [Use cache] Align logic of use_cache with output_attentions and output_hidden_states (#5194)
* fix use cache

* add bart use cache

* fix bart

* finish bart
2020-06-24 16:09:17 +02:00
64c393ee74 Don't recreate old docs (#5243) 2020-06-24 09:59:07 -04:00
b29683736a fix print in benchmark (#5242) 2020-06-24 15:58:49 +02:00
9fe09cec76 [Benchmark] Extend Benchmark to all model type extensions (#5241)
* add benchmark for all kinds of models

* improved import

* delete bogus files

* make style
2020-06-24 15:11:42 +02:00
7c41057d50 Add hugs (#5225) 2020-06-24 07:56:14 -04:00
5e85b324ec Use the script in utils (#5224) 2020-06-24 07:55:58 -04:00
5e31a98ab7 Create README.md (#5108)
* Create README.md

* Update model_cards/a-ware/roberta-large-squad-classification/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-24 04:45:51 -04:00
033124e5f8 Update README.md (#5199)
Fix/add information in README.md
2020-06-24 04:42:46 -04:00
7ca6627ec3 Create README.md (#5217)
electra_large_discriminator_squad2_512 Question Answering LM
2020-06-24 04:40:50 -04:00
54e9ce785d Fix PABEE division by zero error (#5233)
* Fix PABEE division by zero error

* patience=0 by default
2020-06-24 16:10:36 +08:00
9022ef021a Only put tensors on a device (#5223)
* Only put tensors on a device

* Type hint and unpack list comprehension
2020-06-23 17:30:17 -04:00
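
A sketch of the idea behind #5223 (not the exact Trainer code): only tensor values in the inputs dict are moved to the device; everything else passes through untouched.

```python
import torch

def prepare_inputs(inputs: dict, device: torch.device) -> dict:
    # Strings, ints, or None values stay as-is; only tensors move.
    return {
        k: v.to(device) if isinstance(v, torch.Tensor) else v
        for k, v in inputs.items()
    }
```
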
173528e368 Add version control menu (#5222)
* Add version control menu

* Constify things

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-23 17:05:12 -04:00
76e5af4cfd [pl_examples] revert deletion of optimizer_step (#5227) 2020-06-23 16:40:45 -04:00
c01480bba3 [file_utils] Type user-agent 2020-06-23 18:31:13 +02:00
58918c76f4 [bart] add config.extra_pos_embeddings to facilitate reuse (#5190) 2020-06-23 11:35:42 -04:00
b28b537131 More clear error message in the use-case of #5169 (#5184) 2020-06-23 13:37:29 +02:00
11fdde0271 Tokenizers API developments (#5103)
* Add return lengths

* make pad a bit more flexible so it can be used as collate_fn

* check all kwargs sent to encoding method are known

* fixing kwargs in encodings

* New AddedToken class in python

This class lets you specify specific tokenization behaviors for some special tokens. Used in particular for GPT-2 and RoBERTa, to control how whitespace is stripped around special tokens.

* style and quality

* switched to the huggingface tokenizers library for AddedTokens

* up to tokenizer 0.8.0-rc3 - update API to use AddedToken state

* style and quality

* do not raise an error on additional or unused kwargs for tokenize() but only a warning

* transfo-xl pretrained model requires torch

* Update src/transformers/tokenization_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-23 13:36:57 +02:00
1ae132a07d [Reformer] Axial Pos Emb Improve mem usage reformer (#5209)
* improve mem handling

* improve mem for pos ax encodings
2020-06-23 10:49:18 +02:00
5144104070 [fix] remove unused import (#5206) 2020-06-22 23:39:04 -04:00
0d158e38c9 [fix] mobilebert had wrong path, causing slow test failure (#5205) 2020-06-22 23:31:36 -04:00
f5c2a122e3 Upgrade examples to pl=0.8.1(#5146) 2020-06-22 20:40:10 -04:00
06b60c8b05 [Modelcard] bart-squadv2 (#5011)
* [Modelcard] bart-squadv2

* Update README.md

* Update README.md
2020-06-22 18:40:19 -04:00
35e0687256 Create README.md (#5013) 2020-06-22 18:40:00 -04:00
22d2c8ea2f Create README.md for finetuned BERT model (#5009)
* Create README.md

* changes in model usage section

* minor changes in output visualization

* minor errata in readme
2020-06-22 18:39:29 -04:00
2589505693 Add model card for StackOBERTflow-comments-small (#5008)
* Create README.md

* Update README.md
2020-06-22 18:39:22 -04:00
d8c26ed139 Specify dataset used for crossvalidation (#5175) 2020-06-22 18:26:12 -04:00
a34fb91d54 Create README.md (#5149) 2020-06-22 18:00:53 -04:00
ffabcf5249 Create README.md (#5160) 2020-06-22 17:59:54 -04:00
3363a19b12 Create README.md (#5152)
* Create README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-22 17:59:33 -04:00
0cca61925c Add link to new community notebook (optimization) (#5195)
* Add link to new community notebook (optimization)

related to https://github.com/huggingface/transformers/issues/4842#event-3469184635

This notebook is about benchmarking model training with/without dynamic padding optimization. 
https://github.com/ELS-RD/transformers-notebook 

Using dynamic padding on MNLI provides a **4.7 times training time reduction**, with the max pad length set to 512. The effect is strong because few examples in this dataset exceed 400 tokens. In real life it will depend on the dataset, but it always brings an improvement and, after more than 20 experiments listed in this [article](https://towardsdatascience.com/divide-hugging-face-transformers-training-time-by-2-or-more-21bf7129db9e?source=friends_link&sk=10a45a0ace94b3255643d81b6475f409), it seems not to hurt performance.

Following advice from @patrickvonplaten I do the PR myself :-)

* Update notebooks/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-06-22 23:47:33 +02:00
1c5cd8e5f5 Add README.md (nyu-mll) (#5174)
* nyu-mll: roberta on smaller datasets

* Update README.md

* Update README.md

Co-authored-by: Alex Warstadt <alexwarstadt@gmail.com>
2020-06-22 17:24:27 -04:00
c439752482 Switch master/stable doc and add older releases (#5193) 2020-06-22 16:38:53 -04:00
417e492f1e Quick tour (#5145)
* Quicktour part 1

* Update

* All done

* Typos

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>

* Address comments in quick tour

* Update docs/source/quicktour.rst

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update from feedback

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-22 16:08:09 -04:00
75e1eed8d1 Cleaner warning when loading pretrained models (#4557)
* Cleaner warning when loading pretrained models

This makes the logging messages more explicit when using the various `from_pretrained` methods. It also emits these messages as `logging.warning`, because this is a common source of silent mistakes.

* Update src/transformers/modeling_utils.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Update src/transformers/modeling_utils.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* style and quality

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-22 21:58:47 +02:00
4e741efa92 Have documentation fail on warning (#5189)
* Have documentation fail on warning

* Force ci failure

* Revert "Force ci failure"

This reverts commit f0a4666ec2eb4cd00a4da48af3357defc63324a0.
2020-06-22 15:49:50 -04:00
1262495a91 Add TF auto model to the docs + fix sphinx warnings (#5187) 2020-06-22 14:43:52 -04:00
88429c57bc Create README.md (#5165) 2020-06-22 13:49:14 -04:00
76ee9c8bc9 Create README.md (#5107)
* Create README.md

@julien-c check out that dataset meta tag is right

* Fix typo

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-22 13:47:30 -04:00
bf493d5569 Model card for t5-base-finetuned-emotion (recognition) (#5179) 2020-06-22 13:45:45 -04:00
e9ef21175e improve doc (#5185) 2020-06-22 19:00:11 +02:00
ebc36108dc [tokenizers] Fix #5081 and improve backward compatibility (#5125)
* fix #5081 and improve backward compatibility (slightly)

* add nlp to setup.cfg - style and quality

* align default to previous default

* remove test that doesn't generalize
2020-06-22 17:25:43 +02:00
d2a7c86dc3 Check if text is set to avoid IndexError (#4209)
Fix for https://github.com/huggingface/transformers/issues/3809
2020-06-22 11:09:05 -04:00
90f4b24520 Add support for gradient checkpointing in BERT (#4659)
* add support for gradient checkpointing in BERT

* fix unit tests

* isort

* black

* workaround for `torch.utils.checkpoint.checkpoint` not accepting bool

* Revert "workaround for `torch.utils.checkpoint.checkpoint` not accepting bool"

This reverts commit 5eb68bb804f5ffbfc7ba13c45a47717f72d04574.

* workaround for `torch.utils.checkpoint.checkpoint` not accepting bool

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-22 10:47:14 -04:00
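
Roughly how the new flag is switched on (a sketch; checkpoint name illustrative): the model recomputes activations in the backward pass, trading extra compute for memory.

```python
from transformers import BertConfig, BertModel

config = BertConfig.from_pretrained("bert-base-uncased", gradient_checkpointing=True)
model = BertModel.from_pretrained("bert-base-uncased", config=config)
```
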
f4e1f02210 Output hidden states (#4978)
* Configure all models to use output_hidden_states as an argument passed to forward()

* Pass all tests

* Remove cast_bool_to_primitive in TF Flaubert model

* correct tf xlnet

* add pytorch test

* add tf test

* Fix broken tests

* Configure all models to use output_hidden_states as an argument passed to forward()

* Pass all tests

* Remove cast_bool_to_primitive in TF Flaubert model

* correct tf xlnet

* add pytorch test

* add tf test

* Fix broken tests

* Refactor output_hidden_states for mobilebert

* Reset and remerge to master

Co-authored-by: Joseph Liu <joseph.liu@coinflex.com>
Co-authored-by: patrickvonplaten <patrick.v.platen@gmail.com>
2020-06-22 10:10:45 -04:00
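
A sketch of the resulting per-call usage (checkpoint name illustrative), instead of fixing the flag in the config:

```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("hidden states on demand", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
hidden_states = outputs[-1]  # one tensor per layer, plus the embedding output
```
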
866a8ccabb Add model cards for Microsoft's MiniLM (#5178)
* Add model cards for Microsoft's MiniLM

* XLMRobertaTokenizer

* format

* Add thumbnail

* finishing up
2020-06-22 21:48:14 +08:00
b99ad457f4 Added feature to move added tokens in vocabulary for Transformer-XL (#4953)
* Fixed resize_token_embeddings for transfo_xl model

* Fixed resize_token_embeddings for transfo_xl.

Added custom methods to TransfoXLPreTrainedModel for resizing layers of
the AdaptiveEmbedding.

* Updated docstring

* Fixed resizing of cutoffs; added check for new size of embedding layer.

* Added test for resize_token_embeddings

* Fixed code quality

* Fixed unchanged cutoffs in model.config

* Added feature to move added tokens in tokenizer.

* Fixed code quality

* Added feature to move added tokens in tokenizer.

* Fixed code quality

* Fixed docstring; renamed sym to token.

Co-authored-by: Rafael Weingartner <rweingartner.its-b2015@fh-salzburg.ac.at>
2020-06-22 15:40:52 +02:00
eb0ca71ef6 Update glossary (#5148)
* Update glossary

* Update docs/source/glossary.rst

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-06-22 08:30:49 -04:00
fa0be6d761 Benchmarks (#4912)
* finish benchmark

* fix isort

* fix setup cfg

* retab

* fix time measuring of tf graph mode

* fix tf cuda

* clean code

* better error message
2020-06-22 12:06:56 +02:00
18a0150bfa fix bart doc (#5132)
2020-06-22 10:58:28 +02:00
3fe75c7f70 Fixing docs for Encoder Decoder Config (#5171) 2020-06-22 10:51:17 +02:00
59345cc87f Typo (#5147) 2020-06-22 10:49:23 +02:00
bc3a0c0607 [examples] fixes arguments for summarization finetune scripts (#5157)
Authored-by: i.boytsov <i.boytsov@MAC867.local>
2020-06-21 11:51:21 -04:00
68e19f1c22 Fix typo in root README (#5073) 2020-06-20 23:00:04 +08:00
c0c577cf8f Fix PABEE's result table (#5158) 2020-06-20 22:56:39 +08:00
aa6a29bc25 SummarizationPipeline: init required task name (#5086)
* SummarizationPipeline: init required task name

* Update src/transformers/pipelines.py

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>

* Apply suggestions from code review

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-06-20 03:16:30 -04:00
2fd28d4363 Add BERT Loses Patience (Patience-based Early Exit) (#5078)
* Add BERT Loses Patience (Patience-based Early Exit)

* update model archive

* update format

* sort import

* flake8

* Add results

* full results

* align the table

* refactor to inherit

* default per gpu eval = 1

* Formatting

* Formatting

* isort

* modify readme

* Add check

* Fix format

* Fix format

* Doc strings

* ALBERT & BERT for sequence classification don't inherit from the original anymore

* Remove incorrect comments

* Remove incorrect comments

* Remove incorrect comments

* Sync up with new code

* Sync up with new code

* Add a test

* Add a test

* Add a test

* Add a test

* Add a test

* Add a test

* Finishing up!
2020-06-20 13:41:46 +08:00
f1679d7c48 Fix dropout in TFMobileBert (#5150) 2020-06-20 13:21:19 +08:00
5ed94b2312 Update note to avoid confusion (#5131) 2020-06-20 10:13:34 +08:00
d97b4176e5 Correct device assignment 2020-06-19 21:58:28 -04:00
9a3f91088c Add MobileBert (#4901)
* Add MobileBert

* Quality + Conversion script

* style

* Update src/transformers/modeling_mobilebert.py

* Links to S3

* Style

* TFMobileBert

Slight fixes to the pytorch MobileBert
Style

* MobileBertForMaskedLM (PT + TF)

* MobileBertForNextSentencePrediction (PT + TF)

* MobileBertFor{MultipleChoice, TokenClassification} (PT + TF)

* Tests + Auto

* Doc

* Tests

* Addressing @sgugger's comments

* Adressing @patrickvonplaten's comments

* Style

* Style

* Integration test

* style

* Model card

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-19 16:38:36 -04:00
f45e873910 [bart-mnli] Fix class flipping bug (#5141) 2020-06-19 13:33:24 -04:00
e33929ef1e Fix in Reformer Config documentation (#5138) 2020-06-19 15:41:31 +02:00
84be482f66 AutoTokenizer supports mbart-large-en-ro (#5121) 2020-06-18 20:47:37 -04:00
2db1e2f415 [cleanup] remove redundant code in SummarizationDataset (#5119) 2020-06-18 20:34:48 -04:00
5f721ad6e4 Fix #5114 (#5122) 2020-06-18 19:20:04 -04:00
a258982af3 Add missing arg in 02-transformers notebook (#5085)
* Add missing arg when creating model

* Fix typos

* Remove from_tf flag when creating model
2020-06-18 19:04:04 -04:00
32e94cff64 tf add resize_token_embeddings method (#4351)
* resize token embeddings

* add tokens

* add tokens

* add tokens

* add t5 token method

* add t5 token method

* add t5 token method

* typo

* debugging input

* debugging input

* debug

* debug

* debug

* trying to set embedding tokens properly

* set embeddings for generation head too

* set embeddings for generation head too

* debugging

* debugging

* enable generation

* add base method

* add base method

* add base method

* return logits in the main call

* reverting to generation

* revert back

* set embeddings for the bert main layer

* description

* fix conflicts

* logging

* set base model as self

* refactor

* tf_bert add method

* tf_bert add method

* tf_bert add method

* tf_bert add method

* tf_bert add method

* tf_bert add method

* tf_bert add method

* tf_bert add method

* v0

* v0

* finalize

* final

* black

* add tests

* revert back the emb call

* comments

* comments

* add the second test

* add vocab size config

* add tf models

* add tf models. add common tests

* remove model specific embedding tests

* stylish

* remove files

* stylez

* Update src/transformers/modeling_tf_transfo_xl.py

change the error.

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* adding unchanged weight test

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-18 18:41:26 -04:00
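
After this PR the TF models mirror the PyTorch workflow; a sketch (checkpoint and token illustrative):

```python
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = TFBertModel.from_pretrained("bert-base-uncased")

tokenizer.add_tokens(["<new_token>"])
model.resize_token_embeddings(len(tokenizer))  # grow embeddings to the new vocab
```
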
973433260e Pin sphinx-rtd-theme (#5128) 2020-06-18 18:07:59 -04:00
8a377c3d6e [fix] Move _adjust_logits above postprocess to fix Marian.generate (#5126) 2020-06-18 18:06:27 -04:00
3d3e605aff [cleanup] generate_beam_search comments (#5115) 2020-06-18 16:30:24 -04:00
ca2d0f98c4 ElectraForMultipleChoice (#4954)
* add ElectraForMultipleChoice

* add  test_for_multiple_choice

* add ElectraForMultipleChoice in auto model

* add ElectraForMultipleChoice in all_model_classes

* add SequenceSummary related parameters

* get rid pooler, use SequenceSummary instead

* add electra multiple choice test

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-06-18 14:59:35 -04:00
279d8e24f7 support local_files_only option for tf models (#5116) 2020-06-18 13:47:05 -04:00
355954ffca Create distilbert-base-uncased-distilled-squad-README.md 2020-06-18 05:17:45 -04:00
18177a1a60 lm_labels => labels (#5080) 2020-06-18 09:16:29 +02:00
efeb75b805 Remove misleading comment
closes #4958
2020-06-17 18:24:35 -04:00
bb154ac50c Fixing TPU training by disabling wandb.watch gradients logging for TPU (#4926) 2020-06-17 18:04:11 -04:00
fb6cccb863 fix qa example (#4929) 2020-06-17 17:54:16 -04:00
38bba9cdd5 Fix deprecation warnings due to invalid escape sequences. (#4924) 2020-06-17 17:46:58 -04:00
f1a3d03741 add pandas to setup.cfg (#5093) 2020-06-17 16:39:17 -04:00
90c833870c [MarianTokenizer] Switch to sacremoses for punc normalization (#5092) 2020-06-17 16:31:05 -04:00
049e14f0e3 very minor spelling correction in script command (#5090)
actual script name - counts_parameters.py
2020-06-17 16:08:43 -04:00
20fa828984 Make default_data_collator more flexible and deprecate old behavior (#5060)
* Make default_data_collator more flexible

* Accept tensors for all features

* Document code

* Refactor

* Formatting
2020-06-17 15:24:51 -04:00
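
A sketch of the more flexible behavior (feature values illustrative): plain lists and scalars are batched into tensors, and the special "label" key is renamed to "labels".

```python
from transformers.data.data_collator import default_data_collator

features = [
    {"input_ids": [101, 7592, 102], "label": 1},
    {"input_ids": [101, 2088, 102], "label": 0},
]
batch = default_data_collator(features)
print(batch["input_ids"].shape, batch["labels"])  # torch.Size([2, 3]) tensor([1, 0])
```
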
5e06963394 Some changes to simplify the generation function (#5031)
* moving logits post-processing out of beam search

* moving logits post-processing out of beam search

* first step cache

* fix_Encoder_Decoder

* patrick_version_postprocess

* add_keyword_arg
2020-06-17 14:48:06 -04:00
204ebc25e6 Update installation page and add contributing to the doc (#5084)
* Update installation page and add contributing to the doc

* Remove mention of symlinks
2020-06-17 14:01:10 -04:00
043f9f51f9 [examples] SummarizationModule improvements (#4951) 2020-06-17 13:51:34 -04:00
cd40f6564e Add header and fix command (#5082) 2020-06-17 11:45:05 -04:00
70bc3ead4f [TextClassificationPipeline] Hotfix: make json serializable 2020-06-17 15:09:27 +00:00
7291ea0bff Reorganize documentation (#5064)
* Reorganize topics and add all models
2020-06-17 07:55:20 -04:00
e4aaa45805 Update pipeline examples to doctest syntax (#5030) 2020-06-16 18:14:58 -04:00
011cc0be51 Fix all sphynx warnings (#5068) 2020-06-16 16:50:02 -04:00
af497b5672 Typo (#5069) 2020-06-16 16:46:20 -04:00
49c5202522 Eli5 examples (#4968)
* add eli5 examples

* add dense query script

* query_di

* merging

* merging

* add_utils

* adds nearest neighbor wikipedia

* batch queries

* training_retriever

* new notebooks

* moved retriever training script

* finished wiki40b

* max_len_fix

* train_s2s

* retriever_batch_checkpointing

* cleanup

* merge

* dim_fix

* fix_indexer

* fix_wiki40b_snippets

* fix_embed_for_r

* fp32 index

* fix_sparse_q

* joint_training

* remove obsolete datasets

* add_passage_nn_results

* add_passage_nn_results

* add_batch_nn

* add_batch_nn

* add_data_scripts

* notebook

* notebook

* notebook

* fix_multi_gpu

* add_app

* full_caching

* full_caching

* notebook

* sparse_done

* images

* notebook

* add_image_gif

* with_Gif

* add_contr_image

* notebook

* notebook

* notebook

* train_functions

* notebook

* min_retrieval_length

* pandas_option

* notebook

* min_retrieval_length

* notebook

* notebook

* eval_Retriever

* notebook

* images

* notebook

* add_example

* add_example

* notebook

* fireworks

* notebook

* notebook

* joe's notebook comments

* app_update

* notebook

* notebook_link

* captions

* notebook

* adding RetriBert model

* add RetriBert to Auto

* change AutoLMHead to AutoSeq2Seq

* notebook downloads from hf models

* style_black

* style_black

* app_update

* app_update

* fix_app_update

* style

* style

* isort

* Delete WikiELI5training.ipynb

* Delete evaluate_eli5.py

* Delete WikiELI5explore.ipynb

* Delete ExploreWikiELI5Support.html

* Delete explainlikeimfive.py

* Delete wiki_snippets.py

* children before parent

* children before parent

* style_black

* style_black_only

* isort

* isort_new

* Update src/transformers/modeling_retribert.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* typo fixes

* app_without_asset

* cleanup

* Delete ELI5animation.gif

* Delete ELI5contrastive.svg

* Delete ELI5wiki_index.svg

* Delete choco_bis.svg

* Delete fireworks.gif

* Delete huggingface_logo.jpg

* Delete huggingface_logo.svg

* Delete Long_Form_Question_Answering_with_ELI5_and_Wikipedia.ipynb

* Delete eli5_app.py

* Delete eli5_utils.py

* readme

* Update README.md

* unused imports

* moved_info

* default_beam

* ftuned model

* disclaimer

* Update src/transformers/modeling_retribert.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* black

* add_doc

* names

* isort_Examples

* isort_Examples

* Add doc to index

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-06-16 16:36:58 -04:00
c3e607496c [cleanup] examples test_run_squad uses tiny model (#5059) 2020-06-16 14:06:45 -04:00
439aa1d6e9 Remove old section + caching in install (#5027) 2020-06-16 13:03:41 -04:00
3d495c61ef Fix marian tokenizer save pretrained (#5043) 2020-06-16 09:48:19 -04:00
d5477baf7d Convert hans to Trainer (#5025)
* Convert hans to Trainer

* Tick box
2020-06-16 08:06:31 -04:00
c852036b4a [cleanup] Hoist ModelTester objects to top level (#4939)
Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-06-16 08:03:43 -04:00
0c55a384f8 Add reference to NLP dataset (#5028)
* Add reference to NLP dataset

* Update README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-16 04:19:09 -04:00
0946d1209d Add reference to NLP (package) dataset (#5029)
* Add reference to NLP (package) dataset

* Update README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-16 04:17:46 -04:00
edcb3ac59a refactor(wandb): consolidate import (#5044) 2020-06-16 03:40:43 -04:00
9e03364999 Ability to pickle/unpickle BatchEncoding pickle (reimport) (#5039)
* Added is_fast property on BatchEncoding to indicate if the object comes from a Fast Tokenizer.

* Added __get_state__() & __set_state__() to be pickable.

* Correct tokens() return type from List[int] to List[str]

* Added unittest for BatchEncoding pickle/unpickle

* Added unittest for BatchEncoding is_fast

* More careful checking on BatchEncoding unpickle tests.

* Formatting.

* is_fast should assertTrue on Rust tokenizers.

* Ensure tensorflow has correct way of checking array_equal

* More formatting.
2020-06-16 09:25:25 +02:00
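
A sketch of the round-trip this enables (checkpoint illustrative), e.g. for multiprocessing data pipelines:

```python
import pickle
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer("pickle me")
restored = pickle.loads(pickle.dumps(encoding))
assert restored["input_ids"] == encoding["input_ids"]
```
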
f9f8a5312e Add DistilBertForMultipleChoice (#5032)
* Add `DistilBertForMultipleChoice`
2020-06-15 18:31:41 -04:00
36434220fc [HUGE] Refactoring tokenizers backend - padding - truncation - pre-tokenized pipeline - fast tokenizers - tests (#4510)
* Use tokenizers pre-tokenized pipeline

* failing pretokenized test

* Fix is_pretokenized in python

* add pretokenized tests

* style and quality

* better tests for batched pretokenized inputs

* tokenizers clean up - new padding_strategy - split the files

* [HUGE] refactoring tokenizers - padding - truncation - tests

* style and quality

* bump up required tokenizers version to 0.8.0-rc1

* switched padding/truncation API - simpler better backward compat

* updating tests for custom tokenizers

* style and quality - tests on pad

* fix QA pipeline

* fix backward compatibility for max_length only

* style and quality

* Various clean-ups - add verbose

* fix tests

* update docstrings

* Fix tests

* Docs reformatted

* __call__ method documented

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-06-15 17:12:51 -04:00
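
A sketch of the refactored API this introduces (checkpoint illustrative): one tokenizer call with explicit padding and truncation strategies, replacing the old pad_to_max_length flag.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(
    ["first sentence", "a second, noticeably longer sentence"],
    padding="max_length",  # or True / "longest" / False
    truncation=True,
    max_length=32,
)
```
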
ebba39e4e1 [Bart] Question Answering Model is added to tests (#5024)
* fix test

* Update tests/test_modeling_common.py

* Update tests/test_modeling_common.py
2020-06-15 22:50:09 +02:00
bbad4c6989 Add position_ids (#5021) 2020-06-15 15:50:17 -04:00
1bf4098e03 feat(TFTrainer): improve logging (#4946)
* feat(tftrainer): improve logging

* fix(trainer): consider case with evaluation only

* refactor(tftrainer): address comments

* refactor(tftrainer): move self.epoch_logging to __init__
2020-06-15 14:06:17 -04:00
7b5a1e7d51 Fix importing transformers on Windows (#4997) 2020-06-15 19:36:57 +02:00
a9f1fc6c94 Add bart-base (#5014) 2020-06-15 13:29:26 -04:00
7b685f5229 Increase pipeline support for ONNX export. (#5005)
* Increase pipeline support for ONNX export.

* Style.
2020-06-15 19:13:58 +02:00
1affde2f10 Make DataCollator a callable (#5015)
* Make DataCollator a callable

* Update src/transformers/data/data_collator.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-15 11:58:33 -04:00
f7c93b3cee Possible fix to make AMP work with DDP in the trainer (#4728)
* manually set device in trainer args

* check if current device is cuda before set_device

* Explicitly set GPU ID when using single GPU

This addresses https://github.com/huggingface/transformers/issues/4657#issuecomment-642228099
2020-06-15 10:10:26 -04:00
66bcfbb130 Create README.md (#4975)
* Create README.md

* Update model_cards/ipuneetrathore/bert-base-cased-finetuned-finBERT/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-15 08:43:50 -04:00
d812e6d76e NER: fix construction of input examples for RoBERTa (#4943)
* utils_ner: do not add extra sep token for RoBERTa model

* run_pl_ner: do not add extra sep token for RoBERTa model
2020-06-15 08:30:40 -04:00
ebab096e86 [model card] model card for bart-large-finetuned-squadv1 (#4977)
* [model card] model card for bart-large-finetuned-squadv1

* add metadata link to the dataset
2020-06-15 05:39:41 -04:00
9ad36ad57f Improve ONNX logging (#4999)
* Improve ONNX export logging to give more information about the generated graph.

* Correctly handle input and output in the logging.
2020-06-15 11:04:51 +02:00
9931f817b7 fix (#4976) 2020-06-14 21:36:14 +02:00
9208f57b16 BartTokenizerFast (#4878) 2020-06-14 13:04:49 -04:00
403d309857 Hans data (#4854)
* Update hans data to be able to use Trainer

* Fixes

* Deal with tokenizers that don't have token_ids

* Clean up things

* Simplify data use

* Fix the input dict

* Formatting + proper path in README
2020-06-13 09:35:13 -04:00
ca5e1cdf8e model_cards: we can now tag datasets
see corresponding model pages to see how it's rendered
2020-06-12 23:19:07 +02:00
e93ccb3290 BartForQuestionAnswering (#4908) 2020-06-12 15:47:57 -04:00
538531cde5 Add AlbertForMultipleChoice (#4959)
* Add AlbertForMultipleChoice

* Make up to date and add all models to common tests
2020-06-12 14:20:19 -04:00
fe24139702 Create README.md (#4865) 2020-06-12 09:03:43 -04:00
9aa219a1fe Create README.md (#4872) 2020-06-12 09:03:13 -04:00
86578bb04c [AutoModel] Split AutoModelWithLMHead into clm, mlm, encoder-decoder (#4933)
* first commit

* add new auto models

* better naming

* fix bert automodel

* fix automodel for pretraining

* add models to init

* fix name typo

* fix typo

* better naming

* future warning instead of deprecation warning
2020-06-12 10:01:49 +02:00
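
The split means choosing the auto class by LM type; a sketch (checkpoint illustrative):

```python
from transformers import (
    AutoModelForCausalLM,   # GPT-2-style causal LMs
    AutoModelForMaskedLM,   # BERT-style masked LMs
    AutoModelForSeq2SeqLM,  # T5/BART-style encoder-decoders
)

model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
```
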
5620033115 [mbart] Fix fp16 testing logic (#4949) 2020-06-11 22:11:34 -04:00
473808da0d update mvmt-pruning/saving_prunebert (updating torch to 1.5) 2020-06-11 19:42:45 +00:00
caf3746678 fix indentation issue (#4941) 2020-06-11 21:28:01 +02:00
6293eb04df [Model card] model card for electra-base QA model (#4936) 2020-06-11 13:16:34 -04:00
08b59d10e5 MBartTokenizer:add language codes (#3776) 2020-06-11 13:02:33 -04:00
20451195f0 Support multiple choice in tf common model tests (#4920)
* Support multiple choice in tf common model tests

* Add the input_embeds test
2020-06-11 10:31:26 -04:00
699541c4b3 TFTrainer: Add dataloader_drop_last (#4925) 2020-06-11 02:11:22 -04:00
e80d6c689b Fix resize_token_embeddings for Transformer-XL (#4759)
* Fixed resize_token_embeddings for transfo_xl model

* Fixed resize_token_embeddings for transfo_xl.

Added custom methods to TransfoXLPreTrainedModel for resizing layers of
the AdaptiveEmbedding.

* Updated docstring

* Fixed resizing of cutoffs; added check for new size of embedding layer.

* Added test for resize_token_embeddings

* Fixed code quality

* Fixed unchanged cutoffs in model.config

Co-authored-by: Rafael Weingartner <rweingartner.its-b2015@fh-salzburg.ac.at>
2020-06-10 19:03:06 -04:00
d541938c48 Make multiple choice models work with input_embeds (#4921) 2020-06-10 18:38:34 -04:00
1e2631d6f8 Split LMBert model in two (#4874)
* Split LMBert model in two

* Fix example

* Remove lm_labels

* Adapt tests, refactor prepare_for_generation

* Fix merge

* Hide BertLMHeadModel
2020-06-10 18:26:42 -04:00
f6da8b2200 check type before logging in trainer to ensure values are scalars (#4883)
* check type before logging to ensure it's a scalar

* log when Trainer attempts to add a non-scalar value using TensorboardX's writer.add_scalar so we know what kinds of fixes are appropriate

* black it

* rephrase log message to clarify attribute was dropped

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-10 18:25:55 -04:00
1c986f42ff Create README.md (#4871) 2020-06-10 17:29:41 -04:00
3ae2e86baf Run a single wandb instance per TPU run (#4851)
* Run a single wandb instance per TPU run

* wandb: self.is_world_master

* make style

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-10 16:28:18 -04:00
466aa57a45 Don't init TPU device twice (#4916) 2020-06-10 15:53:15 -04:00
ef2dcdccaa ElectraForQuestionAnswering (#4913)
* ElectraForQuestionAnswering

* udate __init__

* add test for electra qa model

* add ElectraForQuestionAnswering in auto models

* add ElectraForQuestionAnswering in all_model_classes

* fix outputs, input_ids defaults to None

* add ElectraForQuestionAnswering in docs

* remove commented line
2020-06-10 15:17:52 -04:00
5d63ca6c38 [ctrl] fix pruning of MultiHeadAttention (#4904) 2020-06-10 14:06:55 -04:00
4e10acb3e5 Add more models to common tests (#4910) 2020-06-10 13:19:53 -04:00
3b3619a327 [All models] fix docs after adding output attentions to all forward functions (#4909)
* fix doc

* add format file

* add output attentions to all docs

* add also for bart

* fix naming

* re-add doc to config
2020-06-10 18:10:59 +02:00
ac99217e92 Fix the CI (#4903)
* Fix CI
2020-06-10 09:26:06 -04:00
0a375f5abd Deal with multiple choice in common tests (#4886)
* Deal with multiple choice in common tests
2020-06-10 08:10:20 -04:00
e8db8b845a Remove unused arguments in Multiple Choice example (#4853)
* Remove unused arguments

* Formatting

* Remove second todo comment
2020-06-09 20:05:09 -04:00
29c36e9f36 run_pplm.py bug fix (#4867)
`is_leaf` may become `False` after the `.to(device=device)` call.
2020-06-09 19:14:27 -04:00
13aa174112 uninstalled wandb raises AttributeError 2020-06-09 18:50:56 -04:00
6e603cb789 [All models] Extend config.output_attentions with output_attentions function arguments (#4538)
* DOC: Replace instances of ``config.output_attentions`` with function argument ``output_attentions``

* DOC: Apply Black Formatting

* Fix errors where output_attentions was undefined

* Remove output_attentions in classes per review

* Fix regressions on tests having `output_attention`

* Fix further regressions in tests relating to `output_attentions`

Ensure proper propagation of `output_attentions` as a function parameter
to all model subclasses

* Fix more regressions in `test_output_attentions`

* Fix issues with BertEncoder

* Rename related variables to `output_attentions`

* fix pytorch tests

* fix bert and gpt2 tf

* Fix most TF tests for `test_output_attentions`

* Fix linter errors and more TF tests

* fix conflicts

* DOC: Apply Black Formatting

* Fix errors where output_attentions was undefined

* Remove output_attentions in classes per review

* Fix regressions on tests having `output_attention`

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* fix pytorch tests

* fix conflicts

* fix conflicts

* Fix linter errors and more TF tests

* fix tf tests

* make style

* fix isort

* improve output_attentions

* improve tensorflow

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-06-09 23:39:06 +02:00
f90bc44d9a [examples] Cleanup summarization docs (#4876) 2020-06-09 17:38:28 -04:00
2cfb947f59 [Benchmark] add tpu and torchscript for benchmark (#4850)
* add tpu and torchscript for benchmark

* fix name in tests

* "fix email"

* make style

* better log message for tpu

* add more print and info for tpu

* allow possibility to print tpu metrics

* correct cpu usage

* fix test for non-install

* remove bogus file

* include psutil in testing

* run a couple of times before tracing in torchscript

* do not allow tpu memory tracing for now

* make style

* add torchscript to env

* better name for torch tpu

Co-authored-by: Patrick von Platen <patrick@huggingface.co>
2020-06-09 23:12:43 +02:00
f0340b3031 Removes from the of the parent of TFRobertaClassificationHead (#4884)
Co-authored-by: Hamza Harkous <harkous@google.com>
2020-06-09 16:14:01 -04:00
02e5f79662 [examples] consolidate summarization examples (#4837) 2020-06-09 11:14:12 -04:00
9f5d5a531d Fix the __getattr__ method in BatchEncoding (#4772) 2020-06-09 09:44:00 +02:00
41a1d27cde Add XLMRobertaForQuestionAnswering (#4855)
* Add XLMRobertaForQuestionAnswering

* Formatting

* Make test happy
2020-06-08 21:22:37 -04:00
a139d1a160 [cleanup] consolidate some prune_heads logic (#4799) 2020-06-08 17:08:04 -04:00
4c7f564f9a fix (#4839) 2020-06-08 18:28:50 +02:00
37be3786cf Clean documentation (#4849)
* Clean documentation
2020-06-08 11:28:19 -04:00
42860e92a4 Turn off codecov patch for now 2020-06-08 09:47:13 -04:00
36dfc317b3 TF Checkpoints (#4831)
* Align checkpoint dir with the PT trainer

* Use args for max to keep checkpoints
2020-06-08 09:45:23 -04:00
439f1cab20 [Generate] beam search should generate without replacement (#4845)
* fix flaky beam search

* fix typo
2020-06-08 15:31:32 +02:00
c0554776de fix PR (#4810) 2020-06-08 15:31:12 +02:00
e817747941 Expose classes used in documentation (#4808)
* Expose classes used in documentation

* Format code
2020-06-08 08:14:32 -04:00
b6f365a8ed Updates args in tf squad example. (#4820)
Co-authored-by: Daniel Shan <daniel.shan@workday.com>
2020-06-08 05:36:09 -04:00
e33fdc93b4 Export PretrainedBartModel from __init__ (#4819) 2020-06-07 11:55:10 -04:00
c58e6c129a [marian tests ] pass device to pipeline (#4815) 2020-06-06 00:52:17 -04:00
ddf9a3dfc7 Updated path "cd examples/text-generation/pplm" (#4778)
https://github.com/huggingface/transformers/issues/4776
2020-06-05 21:16:48 -04:00
2d372a990b Explain how to preview the docs in a PR (#4795) 2020-06-05 20:47:02 -04:00
56d5d160cd Add model and doc badges (#4811)
* Add badges for models and docs
2020-06-05 18:45:42 -04:00
4ab7424597 [cleanup/marian] pipelines test and new kwarg (#4812) 2020-06-05 18:45:19 -04:00
875288b344 [isort] add matplotlib to known 3rd party dependencies (#4800) 2020-06-05 17:27:31 -04:00
8cca875569 [EncoderDecoderConfig] automatically set decoder config to decoder (#4809)
* automatically set decoder config to decoder

* add more tests
2020-06-05 23:16:37 +02:00
f1fe18465d Use labels to remove deprecation warnings (#4807) 2020-06-05 16:41:46 -04:00
5c0cfc2cf0 Add link to community models (#4804) 2020-06-05 15:29:20 -04:00
4dd5cf2207 Fix argument label (#4792)
* Fix argument label

* Fix test
2020-06-05 15:20:29 -04:00
3723f30a18 [cleanup] MarianTokenizer: delete unused constants (#4802) 2020-06-05 14:57:24 -04:00
acaa2e6267 Clean-up code (#4790) 2020-06-05 12:36:22 -04:00
fa661ce749 Add model summary (#4789)
* Add model summary

* Add link to pretrained models
2020-06-05 12:22:50 -04:00
79ab881eb1 No silent error when d_head already in the configuration (#4747)
* No silent error when d_head already in the configuration

* Update src/transformers/configuration_xlnet.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-05 12:01:43 -04:00
b9109f2de1 [doc] Make it clearer that text-generation does not involve training 2020-06-05 14:59:22 +02:00
ceaab8dd22 Add .vs to gitignore (#4774) 2020-06-05 07:56:11 -04:00
f9414f7553 Tensorflow improvements (#4530)
* Better None gradients handling

* Apply Style

* Apply Style

* Create a loss class per task to compute its respective loss

* Add loss classes to the ALBERT TF models

* Add loss classes to the BERT TF models

* Add question answering and multiple choice to TF Camembert

* Remove prints

* Add multiple choice model to TF DistilBERT + loss computation

* Add question answering model to TF Electra + loss computation

* Add token classification, question answering and multiple choice models to TF Flaubert

* Add multiple choice model to TF Roberta + loss computation

* Add multiple choice model to TF XLM + loss computation

* Add multiple choice and question answering models to TF XLM-Roberta

* Add multiple choice model to TF XLNet + loss computation

* Remove unused parameters

* Add task loss classes

* Reorder TF imports + add new model classes

* Add new model classes

* Bugfix in TF T5 model

* Bugfix for TF T5 tests

* Bugfix in TF T5 model

* Fix TF T5 model tests

* Fix T5 tests + some renaming

* Fix inheritance issue in the AutoX tests

* Add tests for TF Flaubert and TF XLM Roberta

* Add tests for TF Flaubert and TF XLM Roberta

* Remove unused piece of code in the TF trainer

* bugfix and remove unused code

* Bugfix for TF 2.2

* Apply Style

* Divide TFSequenceClassificationAndMultipleChoiceLoss into its two respective names

* Apply style

* Mirror the PT Trainer in the TF one: fp16, optimizers and tb_writer as class parameter and better dataset handling

* Fix TF optimizations tests and apply style

* Remove useless parameter

* Bugfix and apply style

* Fix TF Trainer prediction

* Now the TF models return the loss like their PyTorch counterparts

* Apply Style

* Ignore some tests output

* Take into account the SQuAD cls_index, p_mask and is_impossible parameters for the QuestionAnswering task models.

* Fix names for SQuAD data

* Apply Style

* Fix conflicts with 2.11 release

* Fix conflicts with 2.11

* Fix wrong name

* Add better documentation on the new create_optimizer function

* Fix isort

* logging_dir: use same default as PyTorch

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-04 19:45:53 -04:00
ccd26c2862 Create model card for tblard/allocine (#4775)
https://huggingface.co/tblard/tf-allocine
2020-06-04 19:15:07 -04:00
2a4b9e09c0 NER: Add new WNUT’17 example (#4681)
* ner: add preprocessing script for examples that splits longer sentences

* ner: example shell scripts use local preprocessing now

* ner: add new example section for WNUT’17 NER task. Remove old English CoNLL-03 results

* ner: satisfy black and isort
2020-06-04 19:13:17 -04:00
0e1869cc28 Add drop_last arg for data loader 2020-06-04 18:30:31 -04:00
48a05026de removed deprecated use of Variable api from pplm example 2020-06-04 18:07:49 -04:00
12d0eb5f3e Don't access pad_token_id if there is no pad_token (#4773) 2020-06-04 17:57:04 -04:00
17a88d3192 Create model card for T5-base fine-tuned for Sentiment Span Extraction (#4737) 2020-06-04 16:59:56 -04:00
fb52143cf6 Create README.md (#4743) 2020-06-04 16:59:37 -04:00
5f077a3445 Model Card for RoBERTa trained on Sanskrit (#4763)
* Model cad for SanBERTa

Model Card for RoBERTa trained on Sanskrit

* Model card for SanBERTa

model card for RoBERTa trained on Sanskrit
2020-06-04 16:58:40 -04:00
cd4e07a85e Add note about doc generation (#4770) 2020-06-04 13:43:14 -04:00
492b352ab6 Remove unnecessary model_type arg in example (#4771) 2020-06-04 13:41:24 -04:00
e645b9ab94 Codecov setup (#4768)
* Codecov setup

* Understanding codecov
2020-06-04 11:44:38 -04:00
2b8b6c929e [cleanup] PretrainedModel.generate: remove unused kwargs (#4761) 2020-06-04 08:13:52 -04:00
5bf9afbf35 Introduce a new tensor type for return_tensors on tokenizer for NumPy (#4585)
* Refactor tensor creation in tokenizers.

* Make sure to convert string to TensorType

* Refactor convert_to_tensors_

* Introduce numpy tensor creation

* Format

* Add unittest for TensorType creation from str

* sorting imports

* Added unittests for numpy tensor conversion.

* Do not use the in-place version of squeeze, as numpy doesn't provide such a feature.

* Added extra parameter prepend_batch_axis: bool on prepare_for_model.

* Ensure test_np_encode_plus_sent_to_model is not executed if encoder/decoder model.

* style.

* numpy tests require_torch for now while flax not merged.

* Hopefully will make flake8 happy.

* One more time 🎶
2020-06-04 06:57:01 +02:00
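
With the current API this looks roughly as follows (checkpoint illustrative); no deep-learning framework is needed to get array outputs:

```python
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tokenizer("NumPy output without torch or tf", return_tensors="np")
assert isinstance(enc["input_ids"], np.ndarray)
```
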
efae154929 never_split on slow tokenizers should not split (#4723)
* Ensure tokens in never_split are not split when using the basic tokenizer before wordpiece.

* never_split membership checks now use a set(), which is 10x faster for this operation.

* Use union to concatenate two sets.

* Updated docstring for never_split parameter.

* Avoid set.union() if never_split is None

* Added comments.

* Correct docstring format.
2020-06-03 16:48:28 -04:00
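
A sketch of the guaranteed behavior (token illustrative): entries in never_split survive basic tokenization intact before wordpiece runs.

```python
from transformers import BasicTokenizer

tok = BasicTokenizer(do_lower_case=True, never_split=["[CUSTOM]"])
print(tok.tokenize("A [CUSTOM] token"))  # ['a', '[CUSTOM]', 'token']
```
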
2e4de76231 Update encode documentation (#4751) 2020-06-03 16:30:59 -04:00
ed4df85572 fix beam search bug in tf as well (#4745) 2020-06-03 12:53:23 -04:00
1b5820a565 Unify label args (#4722)
* Deprecate masked_lm_labels argument

* Apply to all models

* Better error message
2020-06-03 09:36:26 -04:00
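
After the unification every model takes a single labels argument; a sketch of masked-LM usage (checkpoint illustrative):

```python
from transformers import BertForMaskedLM, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

inputs = tokenizer("Paris is the [MASK] of France.", return_tensors="pt")
labels = inputs["input_ids"].clone()
outputs = model(**inputs, labels=labels)  # masked_lm_labels is deprecated
loss = outputs[0]
```
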
3e5928c57d Adding notebooks for Fine Tuning [Community Notebook] (#4732)
* Added links to more community notebooks

Added links to 3 more community notebooks from the git repo: https://github.com/abhimishra91/transformers-tutorials
Different Transformers models are fine-tuned on datasets using PyTorch

* Update README.md

* Update README.md

* Update README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-06-03 11:07:26 +02:00
99207bd112 Pipelines: miscellanea of QoL improvements and small features... (#4632)
* [hf_api] Attach all unknown attributes for future-proof compatibility

* [Pipeline] NerPipeline is really a TokenClassificationPipeline

* modelcard.py: I don't think we need to force the download

* Remove config, tokenizer from SUPPORTED_TASKS as we're moving to one model = one weight + one tokenizer

* FillMaskPipeline: also output token in string form

* TextClassificationPipeline: option to return all scores, not just the argmax

* Update docs/source/main_classes/pipelines.rst
2020-06-03 03:51:31 -04:00
8ed47aa10b bert-small-cord19 model cards (#4730)
* Create README.md

* Create README.md

* Create README.md
2020-06-03 03:40:14 -04:00
9ca485734a [Reformer] Improved memory if input is shorter than chunk length (#4720)
* improve handling of short inputs for reformer

* correct typo in assert statement

* fix other tests
2020-06-02 23:08:39 +02:00
b231a413f5 Add cache_dir to save features in GLUE + Differentiate match/mismatch for MNLI metrics (#4621)
* GLUE task cleanup

* Enable writing the cache to cache_dir in case the dataset lives in a read-only filesystem.
* Differentiate match vs mismatch for MNLI metrics.

* Style

* Fix pytype

* Fix type

* Use cache_dir in mnli mismatch eval dataset

* Small Tweaks

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-02 13:40:14 -04:00
70f7423436 TFRobertaModelIntegrationTest requires tf (#4726) 2020-06-02 12:59:00 -04:00
d976ef262e Repin versions 2020-06-02 10:27:15 -04:00
b42586ea56 Fix CI after killing archive maps (#4724)
* 🐛 Fix model ids for BART and Flaubert
2020-06-02 10:21:09 -04:00
b43c78e5d3 Release: v2.11.0 2020-06-02 09:49:09 -04:00
d4c2cb402d Kill model archive maps (#4636)
* Kill model archive maps

* Fixup

* Also kill model_archive_map for MaskedBertPreTrainedModel

* Unhook config_archive_map

* Tokenizers: align with model id changes

* make style && make quality

* Fix CI
2020-06-02 09:39:33 -04:00
47a551d17b [pipeline] Tokenizer should not add special tokens for text generation (#4686)
* allow to not add special tokens

* remove print
2020-06-02 11:03:46 +02:00
f6d5046af1 Override get_vocab for fast tokenizer. (#4717) 2020-06-02 11:02:27 +02:00
88762a2f8c Specify PyTorch versions for examples (#4710) 2020-06-02 04:29:28 -04:00
d3ef14f931 Add community notebook for sentiment span extraction (#4700) 2020-06-02 09:59:53 +02:00
7677936316 Make docstring match args (#4711) 2020-06-01 15:22:51 -04:00
6449c494d0 close #4685 2020-06-01 12:57:52 -04:00
ec8717d5d8 [config] Ensure that id2label always takes precedence over num_labels 2020-06-01 16:54:55 +02:00
751a1e0890 [config] Ensure that id2label always takes precedence over num_labels
Fixes bug reported in https://github.com/huggingface/transformers/issues/4669

See #3967 for context
2020-06-01 16:25:56 +02:00
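A minimal sketch of what the precedence fix above means in practice: when a label map is given, the label count is derived from it rather than from a separate `num_labels` value.

```python
from transformers import BertConfig

# With an explicit label map, num_labels is derived from id2label,
# so the map is authoritative even if num_labels would disagree.
config = BertConfig(id2label={0: "NEGATIVE", 1: "NEUTRAL", 2: "POSITIVE"})
print(config.num_labels)  # 3
```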
ec62b7d953 Fix onnx export input names order (#4641)
* pass on tokenizer to pipeline

* order input names when convert to onnx

* update style

* remove unused imports

* make ordered inputs list needs to be mutable

* add test custom bert model

* remove unused imports
2020-06-01 16:12:48 +02:00
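The ordering fix matters because ONNX binds inputs positionally. A hedged, illustrative sketch of the idea (not the exact helper in the PR): sort the tokenizer's outputs by the model's forward() signature.

```python
import inspect

def ordered_onnx_inputs(model, encodings):
    """Order tokenizer outputs by the model's forward() signature, keeping
    only arguments forward() actually accepts (illustrative sketch; the
    function name is hypothetical)."""
    forward_params = inspect.signature(model.forward).parameters
    return [(name, encodings[name]) for name in forward_params if name in encodings]
```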
bf760c80b5 finish README 2020-06-01 09:23:31 -04:00
9d7d9b3ae0 weird import 2020-06-01 09:23:31 -04:00
2a3c88a659 Update examples/movement-pruning/README.md
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-01 09:23:31 -04:00
4ac462bfb8 Update examples/movement-pruning/README.md
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-06-01 09:23:31 -04:00
35fa0bbca0 clarify README 2020-06-01 09:23:31 -04:00
cc746a5020 flake8 compliance 2020-06-01 09:23:31 -04:00
b11386e158 less prints in saving prunebert 2020-06-01 09:23:31 -04:00
8b5d4003ab complete README 2020-06-01 09:23:31 -04:00
5c8e5b3709 complying with isort 2020-06-01 09:23:31 -04:00
db2a3b2e01 space 2020-06-01 09:23:31 -04:00
5f8f2d849a add floppy bert model notebook 2020-06-01 09:23:31 -04:00
b41948f5cd add requirements 2020-06-01 09:23:31 -04:00
fb8f4277b2 add scripts 2020-06-01 09:23:31 -04:00
d489a6d3d5 add masked_run_* 2020-06-01 09:23:31 -04:00
e4c07faf0a add sparsity modules 2020-06-01 09:23:31 -04:00
667003e447 Create README.md (#4665) 2020-06-01 08:29:09 -04:00
ed23f5909e HooshvareLab readme parsbert-armananer (#4666)
Readme for HooshvareLab/bert-base-parsbert-armananer-uncased
2020-06-01 08:28:43 -04:00
3750b9b0b0 HooshvareLab readme parsbert-peymaner (#4667)
Readme for HooshvareLab/bert-base-parsbert-peymaner-uncased
2020-06-01 08:28:25 -04:00
036c2c6b02 Update HooshvareLab/bert-base-parsbert-uncased (#4687)
Added mBERT comparison results on NER datasets.
2020-06-01 08:27:00 -04:00
74872c19d3 Create README.md (#4684) 2020-06-01 05:45:54 -04:00
0866669e75 [EncoderDecoder] Fix initialization and save/load bug (#4680)
* fix bug

* add more tests
2020-05-30 01:25:19 +02:00
6f82aea66b Include nlp notebook for model evaluation (#4676) 2020-05-29 19:38:56 +02:00
33b7532e69 Fix longformer attention mask type casting when using apex (#4574)
* Fix longformer attention mask casting when using apex

* remove extra type casting
2020-05-29 18:13:30 +02:00
56ee2560be [Longformer] Better handling of global attention mask vs local attention mask (#4672)
* better api

* improve automatic setting of global attention mask

* fix longformer bug

* fix global attention mask in test

* fix global attn mask flatten

* fix slow tests

* update docstring

* update docs and make more robust

* improve attention mask
2020-05-29 17:58:42 +02:00
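A minimal sketch of the global-vs-local split the PR above cleans up, assuming the `global_attention_mask` argument of the Longformer forward pass:

```python
import torch
from transformers import LongformerModel, LongformerTokenizer

tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("Why is the sky blue? " + "Some long context. " * 100,
                   return_tensors="pt")

# 1s mark tokens that attend globally (here: the question tokens);
# everything else keeps local windowed attention.
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, :8] = 1

outputs = model(**inputs, global_attention_mask=global_attention_mask)
```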
e2230ba77b Fix BERT example code for NSP and Multiple Choice (#3953)
Change the example code to use encode_plus since the token_type_id
wasn't being correctly set.
2020-05-29 11:55:55 -04:00
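The fix above boils down to using `encode_plus` so segment IDs are populated — a minimal sketch:

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# encode_plus builds token_type_ids for the sentence pair; the old
# encode-based snippets left them all zero, which breaks NSP-style inputs.
enc = tokenizer.encode_plus("The sky is blue.", "It rained all day.")
print(enc["token_type_ids"])  # 0s over segment A, 1s over segment B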
3a5d1ea2a5 Fix two bugs: 1. Index of test data of SST-2. 2. Label index of MNLI data. (#4546) 2020-05-29 11:12:24 -04:00
9c17256447 [Longformer] Multiple choice for longformer (#4645)
* add multiple choice for longformer

* add models to docs

* adapt docstring

* add test to longformer

* add longformer for mc in init and modeling auto

* fix tests
2020-05-29 13:46:08 +02:00
91487cbb8e [Longformer] fix model name in examples (#4653)
* fix longformer model names in examples

* a better name for the notebook
2020-05-29 13:12:35 +02:00
b5015a2a0f gpt2 typo (#4629)
* gpt2 typo

* Add files via upload
2020-05-28 16:44:43 -04:00
fe5cb1a1c8 Adding community notebook (#4642)
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-28 22:35:15 +02:00
aecaaf73a4 [Community notebooks] add longformer-for-qa notebook (#4652) 2020-05-28 22:27:22 +02:00
5e737018e1 Fix add_special_tokens on fast tokenizers (#4531) 2020-05-28 10:54:45 -04:00
e444648a30 LongformerForTokenClassification (#4638) 2020-05-28 12:48:18 +02:00
3cc2c2a150 add 2 colab notebooks (#4505)
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-28 11:18:16 +02:00
ef03ae874f [Longformer] more models + model cards (#4628)
* adding freeze roberta models

* model cards

* lint
2020-05-28 11:11:05 +02:00
96f57c9ccb [Benchmark] Memory benchmark utils (#4198)
* improve memory benchmarking

* correct typo

* fix current memory

* check torch memory allocated

* better pytorch function

* add total cached gpu memory

* add total gpu required

* improve torch gpu usage

* update memory usage

* finalize memory tracing

* save intermediate benchmark class

* fix conflict

* improve benchmark

* improve benchmark

* finalize

* make style

* improve benchmarking

* correct typo

* make train function more flexible

* fix csv save

* better repr of bytes

* better print

* fix __repr__ bug

* finish plot script

* rename plot file

* delete csv and small improvements

* fix in plot

* fix in plot

* correct usage of timeit

* remove redundant line

* remove redundant line

* fix bug

* add hf parser tests

* add versioning and platform info

* make style

* add gpu information

* ensure backward compatibility

* finish adding all tests

* Update src/transformers/benchmark/benchmark_args.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/benchmark/benchmark_args_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* delete csv files

* fix isort ordering

* add out of memory handling

* add better train memory handling

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-05-27 23:22:16 +02:00
ec4cdfdd05 LongformerForSequenceClassification (#4580)
* LongformerForSequenceClassification

* better naming x=>hidden_states, fix typo in doc

* Update src/transformers/modeling_longformer.py

* Update src/transformers/modeling_longformer.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-27 22:30:00 +02:00
4402879ee4 [Model Card] model card for longformer-base-4096-finetuned-squadv1 (#4625) 2020-05-27 18:48:03 +02:00
6a17688021 per_device instead of per_gpu/error thrown when argument unknown (#4618)
* per_device instead of per_gpu/error thrown when argument unknown

* [docs] Restore examples.md symlink

* Correct absolute links so that symlink to the doc works correctly

* Update src/transformers/hf_argparser.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

* Warning + reorder

* Docs

* Style

* not for squad

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-27 11:36:55 -04:00
1381b6d01d README for HooshvareLab (#4610)
HooshvareLab/bert-base-parsbert-uncased
2020-05-27 11:25:36 -04:00
5acb4edf25 Update version command when contributing (#4614) 2020-05-27 17:19:11 +02:00
842588c12f uncased readme (#4608)
Co-authored-by: kldarek <darekmail>
2020-05-27 09:50:04 -04:00
ac1a612179 Create README.md (#4607)
Model card for cased model
2020-05-27 09:36:20 -04:00
07797c4da4 [testing] LanguageModelGenerationTests require_tf or require_torch (#4616) 2020-05-27 09:10:26 -04:00
a9aa7456ac Add back --do_lower_case to uncased models (#4245)
The option `--do_lower_case` is currently required by the uncased models (e.g., bert-base-uncased, bert-large-uncased).

Results:
BERT-BASE without --do_lower_case:  'exact': 73.83, 'f1': 82.22
BERT-BASE with --do_lower_case:  'exact': 81.02, 'f1': 88.34
2020-05-26 21:13:07 -04:00
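The F1 gap above comes from vocabulary mismatch: without lowercasing, cased surface forms miss the uncased vocabulary. A minimal sketch of the effect (output shapes are indicative, not exact):

```python
from transformers import BertTokenizer

lower = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
cased = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=False)

print(lower.tokenize("Normandy"))  # matches the uncased vocab cleanly
print(cased.tokenize("Normandy"))  # cased input fragments badly, often down to [UNK]
```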
a801c7fd74 Creating a readme for ALBERT in Mongolian (#4603)
Here I am uploading a Mongolian masked language model (ALBERT) to the platform.
https://en.wikipedia.org/wiki/Mongolia
2020-05-26 16:54:42 -04:00
6458c0e268 updated model cards for both models at aubmindlab (#4604)
* updated aubmindlab/bert-base-arabert/ Model card

* updated aubmindlab/bert-base-arabertv01 model card
2020-05-26 16:52:43 -04:00
ea4e7a53fa Improve model card for Tereveni-AI/gpt2-124M-uk-fiction (#4582)
Add language metadata, training and evaluation corpora details.
Add example output. Fix inconsistent use of quotes.
2020-05-26 16:51:40 -04:00
937930dcae Create README.md (#4591) 2020-05-26 16:50:08 -04:00
bac1cc4dc1 Remove MD emojis (#4602) 2020-05-26 16:38:39 -04:00
003c477129 [GPT2, CTRL] Allow input of input_ids and past of variable length (#4581)
* revert convenience  method

* clean docs a bit
2020-05-26 19:43:58 +02:00
5ddd8d6531 Add BART fine-tuning summarization community notebook (#4539)
* adding BART summarization how-to community notebook

* Update notebooks/README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-26 16:43:41 +02:00
8cc6807e89 Make transformers-cli cross-platform (#4131)
* make transformers-cli cross-platform

Using "scripts" is a useful option in setup.py particularly when you want to get access to non-python scripts. However, in this case we want to have an entry point into some of our own Python scripts. To do this in a concise, cross-platfom way, we can use entry_points.console_scripts. This change is necessary to provide the CLI on different platforms, which "scripts" does not ensure. Usage remains the same, but the "transformers-cli" script has to be moved (be part of the library) and renamed (underscore + extension)

* make style & quality
2020-05-26 10:00:51 -04:00
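Roughly what the change above amounts to in setup.py — a sketch; the exact module path is an assumption, not copied from the PR:

```python
# setup.py (excerpt): console_scripts generates a native launcher on every
# platform, unlike the old "scripts" mechanism.
from setuptools import setup

setup(
    name="transformers",
    # ...
    entry_points={
        "console_scripts": [
            # module path below is approximate
            "transformers-cli=transformers.commands.transformers_cli:main",
        ]
    },
)
```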
c589eae2b8 [Longformer For Question Answering] Conversion script, doc, small fixes (#4593)
* add new longformer for question answering model

* add new config as well

* fix links

* fix links part 2
2020-05-26 14:58:47 +02:00
a163c9ca5b [T5] Fix Cross Attention position bias (#4499)
* fix

* fix1
2020-05-26 08:57:24 -04:00
1d69028989 fix (#4410) 2020-05-26 08:51:28 -04:00
b86e42e0ac [ci] fix 3 remaining slow GPU failures (#4584) 2020-05-25 19:20:50 -04:00
365d452d4d [ci] Slow GPU tests run daily (#4465) 2020-05-25 17:28:02 -04:00
3e3e552125 [Reformer] fix reformer num buckets (#4564)
* fix reformer num buckets

* fix

* adapt docs

* set num buckets in config
2020-05-25 16:04:45 -04:00
3dea40b858 fixing tokenization of extra_id symbols in T5Tokenizer. Related to issue 4021 (#4353) 2020-05-25 16:04:30 -04:00
5139733623 LongformerTokenizerFast (#4547) 2020-05-25 16:03:55 -04:00
c9c385c522 Updated the link to the paper (#4570)
It looks like the conference has changed the link to the paper.
2020-05-25 15:29:50 -04:00
adab7f8332 Add nn.Module as superclass (#4533) 2020-05-25 15:29:33 -04:00
8f7c1c7672 Create model card (#4578) 2020-05-25 15:28:30 -04:00
4c6b218056 Update README.md (#4556) 2020-05-25 15:12:23 -04:00
50d1ce411f add DistilBERT to supported models (#4558) 2020-05-25 14:50:45 -04:00
03d8527de0 Longformer for question answering (#4500)
* added LongformerForQuestionAnswering

* add LongformerForQuestionAnswering

* fix import for LongformerForMaskedLM

* add LongformerForQuestionAnswering

* hardcoded sep_token_id

* compute attention_mask if not provided

* combine global_attention_mask with attention_mask when provided

* update example in  docstring

* add assert error messages, better attention combine

* add test for longformerForQuestionAnswering

* typo

* cast global_attention_mask to long

* make style

* Update src/transformers/configuration_longformer.py

* Update src/transformers/configuration_longformer.py

* fix the code quality

* Merge branch 'longformer-for-question-answering' of https://github.com/patil-suraj/transformers into longformer-for-question-answering

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-25 18:43:36 +02:00
a34a9896ac DOC: Fix typos in modeling_auto (#4534) 2020-05-23 09:40:59 -04:00
e19b978151 Add Type Hints to modeling_utils.py Closes #3911 (#3948)
* Add Type Hints to modeling_utils.py Closes #3911

Add Type Hints to methods in `modeling_utils.py`

Note: The coverage isn't 100%. Mostly skipped internal methods.

* Reformat according to `black` and `isort`

* Use typing.Iterable instead of Sequence

* Parameterize Iterable by its generic type

* Use typing.Optional when None is the default value

* Adhere to style guideline

* Update src/transformers/modeling_utils.py

* Update src/transformers/modeling_utils.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-22 19:10:22 -04:00
996f393a86 Warn the user about max_len being on the path to be deprecated. (#4528)
* Warn the user about max_len being on the path to be deprecated.

* Ensure better backward compatibility when max_len is provided to a tokenizer.

* Make sure to override the parameter and not the actual instance value.

* Format & quality
2020-05-22 18:08:30 -04:00
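A sketch of the deprecation path above, against the v2.x tokenizer API of the time:

```python
from transformers import BertTokenizer

# Still accepted for backward compatibility, but now emits a deprecation
# warning; the value keeps overriding the instance attribute.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", max_len=512)
print(tokenizer.max_len)  # 512 (later releases renamed this model_max_length)
```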
0f6969b7e9 Better github link for Reformer Colab Notebook 2020-05-22 23:51:36 +02:00
ab44630db2 [Summarization Pipeline]: Fix default tokenizer (#4506)
* Fix pipelines defaults bug

* one liner

* style
2020-05-22 17:49:45 -04:00
2c1ebb8b50 Re-apply #4446 + add packaging dependency
As discussed w/ @lysandrejik

packaging is maintained by PyPA (the Python Packaging Authority), and should be lightweight and stable
2020-05-22 17:29:03 -04:00
e6aeb0d3e8 Style 2020-05-22 17:20:03 -04:00
95a26fcf2d link to paper was broken (#4526)
changed from https://https://arxiv.org/abs/2001.04451.pdf to https://arxiv.org/abs/2001.04451.pdf
2020-05-22 15:17:09 -04:00
89d795f180 Added huseinzol05/t5-small-bahasa-cased README.md (#4522) 2020-05-22 15:04:06 -04:00
35df911485 Fix convert_token_type_ids_from_sequences for fast tokenizers (#4503) 2020-05-22 12:45:10 -04:00
f7677e1623 [model_cards] bart-large-cnn
cc @sshleifer
2020-05-22 12:20:54 -04:00
12e6afe900 Add Reformer colab to community notebooks 2020-05-22 17:03:34 +02:00
ef22ba4836 Re-pin versions 2020-05-22 11:03:07 -04:00
10d72390c0 Revert #4446 Since it introduces a new dependency 2020-05-22 10:49:45 -04:00
e0db6bbd65 Release: v2.10.0 2020-05-22 10:37:44 -04:00
bd6e301832 added functionality for electra classification head (#4257)
* added functionality for electra classification head

* unneeded dropout

* Test ELECTRA for sequence classification

* Style

Co-authored-by: Frankie <frankie@frase.io>
Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-05-22 09:48:21 -04:00
a086527727 Unused Union should not be imported 2020-05-21 09:42:47 -04:00
9d2ce253de TPU hangs when saving optimizer/scheduler (#4467)
* TPU hangs when saving optimizer/scheduler

* Style

* ParallelLoader is not a DataLoader

* Style

* Addressing @julien-c's comments
2020-05-21 09:18:27 -04:00
49296533ca Adds predict stage for glue tasks, and generate result files which can be submitted to gluebenchmark.com (#4463)
* Adds a predict stage for GLUE tasks and generates result files which can be submitted to the gluebenchmark.com website.

* Use Split enum + always output the label name

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-21 09:17:44 -04:00
271bedb485 [examples] fix no grad in second pruning in run_bertology (#4479)
* fix no grad in second pruning and typo

* fix prune heads attention mismatch problem

* fix

* fix

* fix

* run make style

* run make style
2020-05-21 09:17:03 -04:00
865d4d595e [ci] Close #4481 2020-05-20 18:27:42 -04:00
a3af8e86cb Update test_trainer_distributed.py 2020-05-20 18:26:51 -04:00
eacea530c1 🚨 Remove warning of deprecation (#4477)
Remove warning of deprecated overload of addcdiv_

Fix #4451
2020-05-20 16:48:29 -04:00
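The warning above comes from PyTorch 1.5 deprecating the positional-value overload of `addcdiv_`. A runnable sketch of the before/after:

```python
import torch

param = torch.zeros(3)
exp_avg = torch.ones(3)
denom = torch.full((3,), 2.0)
step_size = 0.1

# Old, deprecated overload: param.addcdiv_(-step_size, exp_avg, denom)
# New keyword form, as applied in the optimizer:
param.addcdiv_(exp_avg, denom, value=-step_size)
print(param)  # tensor([-0.0500, -0.0500, -0.0500])
```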
fa2fbed3e5 Better None gradients handling in TF Trainer (#4469)
* Better None gradients handling

* Apply Style

* Apply Style
2020-05-20 16:46:21 -04:00
e708bb75bf Correct TF formatting to exclude LayerNorms from weight decay (#4448)
* Exclude LayerNorms from weight decay

* Include both formats of layer norm
2020-05-20 16:45:59 -04:00
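The TF fix above mirrors the familiar grouped-parameters pattern on the PyTorch side — a sketch with a stand-in module (real BERT parameter names contain "LayerNorm"; this toy module uses "norm"):

```python
import torch
from torch import nn

model = nn.TransformerEncoderLayer(d_model=16, nhead=2)  # stand-in model

# Parameters whose names match any of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm", "norm"]
grouped = [
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "weight_decay": 0.01},
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(grouped, lr=5e-5)
```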
49c06132df pass on tokenizer to pipeline (#4489) 2020-05-20 22:23:21 +02:00
cacb654c7f Add Fine-tune DialoGPT on new datasets notebook (#4473) 2020-05-20 16:17:52 -04:00
30a09f3827 Adjust german bert model card, add new model card (#4488) 2020-05-20 16:08:29 -04:00
14cb5b35fa Fix slow gpu tests lysandre (#4487)
* There is one missing key in BERT

* Correct device for CamemBERT model

* RoBERTa tokenization adding prefix space

* Style
2020-05-20 11:59:45 -04:00
6dc52c78d8 Create README.md (#4482) 2020-05-20 09:45:50 -04:00
ed5456daf4 Model card for RuPERTa-base fine-tuned for NER (#4466) 2020-05-20 09:45:24 -04:00
c76450e20c Model card for Tereveni-AI/gpt2-124M-uk-fiction (#4470)
Create model card for "Tereveni-AI/gpt2-124M-uk-fiction" model
2020-05-20 09:44:26 -04:00
9907dc523a add BERT trained from review corpus. (#4405)
* add model_cards for BERT trained on reviews.

* add link to repository.

* refine README.md for each review model
2020-05-20 09:42:35 -04:00
efbc1c5a9d [MarianTokenizer] implement save_vocabulary and other common methods (#4389) 2020-05-19 19:45:49 -04:00
956c4c4eb4 [gpu slow tests] fix mbart-large-enro gpu tests (#4472) 2020-05-19 19:45:31 -04:00
48c3a70b4e [Longformer] Docs and clean API (#4464)
* add longformer docs

* improve docs
2020-05-19 21:52:36 +02:00
aa925a52fa [Tests, GPU, SLOW] fix a bunch of GPU hardcoded tests in Pytorch (#4468)
* fix gpu slow tests in pytorch

* change model to device syntax
2020-05-19 21:35:04 +02:00
5856999a9f add T5 fine-tuning notebook [Community notebooks] (#4462)
* add T5 fine-tuning notebook [Community notebooks]

* Update README.md

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-05-19 18:26:28 +02:00
07dd7c2fd8 [cleanup] test_tokenization_common.py (#4390) 2020-05-19 10:46:55 -04:00
8f1d047148 Longformer (#4352)
* first commit

* bug fixes

* better examples

* undo padding

* remove wrong VOCAB_FILES_NAMES

* License

* make style

* make isort happy

* unit tests

* integration test

* make `black` happy by undoing `isort` changes!!

* lint

* no need for the padding value

* batch_size not bsz

* remove unused type casting

* seqlen not seq_len

* staticmethod

* `bert` self-attention instead of `n2`

* uint8 instead of bool + lints

* pad inputs_embeds using embeddings not a constant

* black

* unit test with padding

* fix unit tests

* remove redundant unit test

* upload model weights

* resolve todo

* simpler _mask_invalid_locations without lru_cache + backward compatible masked_fill_

* increase unittest coverage
2020-05-19 16:04:43 +02:00
31eedff5a0 Refactored the README.md file (#4427) 2020-05-19 09:56:24 -04:00
384f0eb2f9 Map optimizer to correct device after loading from checkpoint. (#4403)
* Map optimizer to correct device after loading from checkpoint.

* Make style test pass

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-18 23:16:05 -04:00
bf14ef75f1 [Trainer] move model to device before setting optimizer (#4450) 2020-05-18 23:13:33 -04:00
5e7fe8b585 Distributed eval: SequentialDistributedSampler + gather all results (#4243)
* Distributed eval: SequentialDistributedSampler + gather all results

* For consistency only write to disk from world_master

Close https://github.com/huggingface/transformers/issues/4272

* Working distributed eval

* Hook into scripts

* Fix #3721 again

* TPU.mesh_reduce: stay in tensor space

Thanks @jysohn23

* Just a small comment

* whitespace

* torch.hub: pip install packaging

* Add test scenarios
2020-05-18 22:02:39 -04:00
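The gather-all-results step above is essentially an all_gather followed by trimming the sampler's padding — a hedged sketch of the helper's shape:

```python
import torch
import torch.distributed as dist

def distributed_concat(tensor, num_total_examples):
    """Gather per-process prediction tensors from all ranks, then trim the
    padding the sequential sampler added to equalize shard sizes (sketch)."""
    output_tensors = [tensor.clone() for _ in range(dist.get_world_size())]
    dist.all_gather(output_tensors, tensor)
    concat = torch.cat(output_tensors, dim=0)
    return concat[:num_total_examples]
```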
4c06893610 Fix nn.DataParallel compatibility in PyTorch 1.5 (#4300)
* Test case for #3936

* multigpu tests pass on pytorch 1.4.0

* Fixup

* multigpu tests pass on pytorch 1.5.0

* Update src/transformers/modeling_utils.py

* Update src/transformers/modeling_utils.py

* rename multigpu to require_multigpu

* mode doc
2020-05-18 20:34:50 -04:00
9de4afa897 Make get_last_lr in trainer backward compatible (#4446)
* makes fetching the last learning rate in Trainer backward compatible

* split comment to multiple lines

* fixes black styling issue

* uses version to create a more explicit logic
2020-05-18 20:17:36 -04:00
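The version-based logic above roughly looks like this: `get_last_lr()` only exists from PyTorch 1.4 on, so older versions fall back to `get_lr()`.

```python
import torch
from packaging import version

def current_learning_rate(scheduler):
    # get_last_lr() was added in PyTorch 1.4; fall back to the older
    # get_lr() for earlier versions.
    if version.parse(torch.__version__) >= version.parse("1.4"):
        return scheduler.get_last_lr()[0]
    return scheduler.get_lr()[0]
```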
42e8fbfc51 Added model cards for Romanian BERT models (#4437)
* Create README.md

* Create README.md

* Update README.md

* Update README.md

* Apply suggestions from code review

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-18 18:48:56 -04:00
54065d68b8 added model card for german-sentiment-bert (#4435) 2020-05-18 18:44:41 -04:00
e28b7e2311 Create README.md (#4433) 2020-05-18 18:41:34 -04:00
09b933f19d Update README.md (model_card) (#4424)
- add a citation.
- modify the table of the BLUE benchmark.

The table of the first version was not displayed correctly on https://huggingface.co/seiya/oubiobert-base-uncased.
Could you please confirm that this fix will allow you to display it correctly?
2020-05-18 18:18:17 -04:00
235777ccc9 Modify example of usage (#4413)
I followed Google's usage example for its ELECTRA small model, but I saw it was not meaningful, so I created a better example.
2020-05-18 18:17:33 -04:00
9ddd3a6548 add model card for t5-base-squad (#4409)
* add model card for t5-base-squad

* Update model_cards/valhalla/t5-base-squad/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-18 18:17:14 -04:00
c5aa114392 Added README huseinzol05/t5-base-bahasa-cased (#4377)
* add bert bahasa readme

* update readme

* update readme

* added xlnet

* added tiny-bert and fix xlnet readme

* added albert base

* added albert tiny

* added electra model

* added gpt2 117m bahasa readme

* added gpt2 345m bahasa readme

* added t5-base-bahasa

* fix readme

* Update model_cards/huseinzol05/t5-base-bahasa-cased/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-18 18:10:23 -04:00
ca4a3f4da9 Adding optimizations block from ONNXRuntime. (#4431)
* Adding optimizations block from ONNXRuntime.

* Turn off external data format by default for PyTorch export.

* Correct the way use_external_format is passed through the cmdline args.
2020-05-18 20:32:33 +02:00
24538df919 [Community notebooks] General notebooks (#4441)
* Update README.md

* Update README.md

* Update README.md

* Update README.md
2020-05-18 20:23:57 +02:00
a699525d25 [test_pipelines] Mark tests > 10s @slow, small speedups (#4421) 2020-05-18 12:23:21 -04:00
d9ece8233d fix(run_language_modeling): use arg overwrite_cache (#4407) 2020-05-18 11:37:35 -04:00
d39bf0ac2d better naming in tf t5 (#4401) 2020-05-18 11:34:00 -04:00
590adb130b improve docstring (#4422) 2020-05-18 11:31:35 -04:00
026a5d0888 [T5 fp16] Fix fp16 in T5 (#4436)
* fix fp16 in t5

* make style

* refactor invert_attention_mask fn

* fix typo
2020-05-18 17:25:58 +02:00
fa6113f9a0 Fixed spelling of training (#4416) 2020-05-18 11:23:29 -04:00
757baee846 Fix un-prefixed f-string
see https://github.com/huggingface/transformers/pull/4367#discussion_r426356693

Hat/tip @girishponkiya
2020-05-18 11:20:46 -04:00
a27c795908 fix (#4419) 2020-05-18 15:51:40 +02:00
31c799a0c9 Tag onnx export tests as slow (#4432) 2020-05-18 09:24:41 -04:00
8581a670e3 [MbartTokenizer] save to sentencepiece.bpe.model (#4335) 2020-05-18 08:54:04 -04:00
18d233d525 Allow the creation of "entity groups" for NerPipeline #3548 (#3957)
* Add index to be returned by NerPipeline to allow for the creation of

* Add entity groups

* Convert entity list to dict

* Add entity to entity_group_disagg after updating entity groups

* Change 'group' parameter to 'grouped_entities'

* Add unit tests for grouped NER pipeline case

* Correct variable name typo for NER_FINETUNED_MODELS

* Sync grouped tests to recent test updates
2020-05-17 09:25:17 +02:00
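A minimal sketch of the feature above, assuming the `grouped_entities` flag and `entity_group` output key this PR introduces:

```python
from transformers import pipeline

# grouped_entities merges consecutive B-/I- tokens into a single span.
ner = pipeline("ner", grouped_entities=True)
print(ner("Hugging Face is based in New York City."))
# e.g. [{'entity_group': 'ORG', 'word': 'Hugging Face', ...},
#       {'entity_group': 'LOC', 'word': 'New York City', ...}]
```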
3e0f062106 Fix addcmul_ 2020-05-15 17:44:17 -04:00
fc2a4c88ce Fix: one more try 2020-05-15 17:38:48 -04:00
55bda52555 Same fix for addcmul_ 2020-05-15 17:23:48 -04:00
ad02c961c6 Fix UserWarning: This overload of add_ is deprecated in pytorch==1.5.0 2020-05-15 17:09:11 -04:00
15550ce0d1 [skip ci] remove local rank 2020-05-15 17:08:38 -04:00
62427d0815 rerun notebook 02-transformers (#4341) 2020-05-15 10:33:08 -04:00
34706ba050 Allow for None gradients in GradientAccumulator. (#4372) 2020-05-15 09:52:00 -04:00
edf9ac11d4 Should return overflowing information for the log (#4385) 2020-05-15 09:49:11 -04:00
b908f2e9dd Attempt to unpin torch version for Github Action. (#4384) 2020-05-15 15:47:15 +02:00
af2e6bf87c [examples] Streamline doc 2020-05-14 20:34:31 -04:00
7defc6670f p_mask in SQuAD pre-processing (#4049)
* Better p_mask building

* Addressing @mfuntowicz's comments
2020-05-14 17:07:52 -04:00
84894974bd Updated ONNX notebook link in README. 2020-05-14 22:40:59 +02:00
db0076a9df Conversion script to export transformers models to ONNX IR. (#4253)
* Added generic ONNX conversion script for PyTorch model.

* WIP initial TF support.

* TensorFlow/Keras ONNX export working.

* Print framework version info

* Add possibility to check the model is correctly loading on ONNX runtime.

* Remove quantization option.

* Specify ONNX opset version when exporting.

* Formatting.

* Remove unused imports.

* Make functions more generally reusable from other parts of the code.

* isort happy.

* flake happy

* Export only feature-extraction for now

* Correctly check inputs order / filter before export.

* Removed task variable

* Fix invalid args call in load_graph_from_args.

* Fix invalid args call in convert.

* Fix invalid args call in infer_shapes.

* Raise exception and catch in caller function instead of exit.

* Add 04-onnx-export.ipynb notebook

* More WIP on the notebook

* Remove unused imports

* Simplify & remove unused constants.

* Export with constant_folding in PyTorch

* Let's try to put function args in the right order this time ...

* Disable external_data_format temporary

* ONNX notebook draft ready.

* Updated notebooks charts + wording

* Correct error while exporting last chart in notebook.

* Addressing @LysandreJik's comment.

* Set ONNX opset to 11 as default value.

* Set opset param mandatory

* Added ONNX export unittests

* Quality.

* flake8 happy

* Add keras2onnx dependency on extras["tf"]

* Pin keras2onnx on github master to v1.6.5

* Second attempt.

* Third attempt.

* Use the right repo URL this time ...

* Do the same for onnxconverter-common

* Added keras2onnx and onnxconverter-common 1.7.0 to support TF 2.2

* Correct commit hash.

* Addressing PR review: Optimization are enabled by default.

* Addressing PR review: small changes in the notebook

* setup.py comment about keras2onnx versioning.
2020-05-14 16:35:52 -04:00
2d05480174 Fix trainer evaluation (#4363)
* fix loss calculation in evaluation

* fix evaluation on TPU when prediction_loss_only is True
2020-05-14 14:39:44 -04:00
035678efdb Create README.md (#4359)
* Create README.md

* Update model_cards/savasy/bert-base-turkish-squad/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-14 14:07:32 -04:00
b9c9e05381 Create README.md (#4357) 2020-05-14 14:06:10 -04:00
9535bf1977 Tokenizer.batch_decode convenience method (#4159) 2020-05-14 13:50:47 -04:00
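The convenience method above replaces a manual loop over `decode()` — a minimal sketch:

```python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
batch_ids = [tokenizer.encode("Hello world"), tokenizer.encode("How are you?")]

# One call instead of looping over decode():
print(tokenizer.batch_decode(batch_ids, skip_special_tokens=True))
# ['Hello world', 'How are you?']
```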
7822cd38a0 [tests] make pipelines tests faster with smaller models (#4238)
covers torch and tf. Also fixes a failing @slow test
2020-05-14 13:36:02 -04:00
448c467256 Fix: unpin flake8 and fix cs errors (#4367)
* Fix: unpin flake8 and fix cs errors

* Ok we still need to quote those
2020-05-14 13:14:26 -04:00
c547f15a17 Use Filelock to ensure distributed barriers
see context in https://github.com/huggingface/transformers/pull/4223
2020-05-14 11:58:32 -04:00
015f7812ed [ci skip] Pin isort 2020-05-14 10:12:18 -04:00
ef46ccb05c TPU needs a rendezvous (#4339) 2020-05-14 08:59:52 -04:00
94cb73c2d2 Add image and metadata (#4345)
Unfortunately I accidentally orphaned my other PR
2020-05-13 20:05:15 -04:00
a0eebdc404 Add link to W&B to see whole training logs (#4348) 2020-05-13 20:04:57 -04:00
7cb203fae4 Release: v2.9.1 2020-05-13 17:38:50 -04:00
9a687ebb77 [Marian Fixes] prevent predicting pad_token_id before softmax, support language codes, name multilingual models (#4290) 2020-05-13 17:29:41 -04:00
839bfaedb2 [Docs, Notebook] Include generation pipeline (#4295)
* add first text for generation

* add generation pipeline to usage

* Created using Colaboratory

* correct docstring

* finish
2020-05-13 14:24:08 -04:00
2d184cb553 wrong variable name used (#4328) 2020-05-13 10:22:03 -04:00
ca13618681 Question Answering for TF trainer (#4320)
* Add QA trainer example for TF

* Make data_dir optional

* Fix parameter logic

* Fix feature convert

* Update the READMEs to add the question-answering task

* Apply style

* Change 'sequence-classification' to 'text-classification' and prefix all the metric names with 'eval'

* Apply style

* Apply style
2020-05-13 09:22:31 -04:00
1e51bb717c Fix for #3865. PretrainedTokenizer mapped " do not" into " don't" when .decode(...) is called. Removed the " do not" --> " don't" mapping from clean_up_tokenization(...). (#4024) 2020-05-13 14:32:57 +02:00
241759101e (v2) Improvements to the wandb integration (#4324)
* Improvements to the wandb integration

* small reorg + no global necessary

* feat(trainer): log epoch and final metrics

* Simplify logging a bit

* Fixup

* Fix crash when just running eval

Co-authored-by: Chris Van Pelt <vanpelt@gmail.com>
Co-authored-by: Boris Dayma <boris.dayma@gmail.com>
2020-05-12 21:52:01 -04:00
7d7fe4997f Allow BatchEncoding to be initialized empty. (#4316)
* Allow BatchEncoding to be initialized empty.

This is required by recent changes introduced in TF 2.2.

* Attempt to unpin Tensorflow to 2.2 with the previous commit.
2020-05-12 15:02:46 -04:00
0a97f6312a Update README.md (#4313) 2020-05-12 15:01:45 -04:00
15a121fec5 Update README.md (#4315) 2020-05-12 15:01:34 -04:00
15d45211f7 [model_cards]: 🇹🇷 Add new ELECTRA small and base models for Turkish (#4318) 2020-05-12 15:01:17 -04:00
8a017cbb5a Add modelcard with acknowledgements (#4321) 2020-05-12 15:00:56 -04:00
4bf5042240 Fix BART tests on GPU (#4298) 2020-05-12 09:11:50 -04:00
e4512aab3b Add MultipleChoice to TFTrainer [WIP] (#4270)
* catch gpu len 1 set to gpu0

* Add mpc to trainer

* Add MPC for TF

* fix TF automodel for MPC and add Albert

* Apply style

* Fix import

* Note to self: double check

* Make shape None, None for datasetgenerator output shapes

* Add from_pt bool, which doesn't seem to work

* Original checkpoint dir

* Fix docstrings for automodel

* Update readme and apply style

* Colab should probably not be from users

* Colabs should probably not be from users

* Add colab

* Update README.md

* Update README.md

* Cleanup __init__

* Cleanup flake8 trailing comma

* Update src/transformers/training_args_tf.py

* Update src/transformers/modeling_tf_auto.py

Co-authored-by: Viktor Alm <viktoralm@pop-os.localdomain>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-12 08:48:48 -04:00
65be574aec fixed missing torch module import (#4305)
fixed missing torch module import in example usage code
2020-05-12 08:34:17 -04:00
31e67dd19f Remove hard-coded pad token id in distilbert and albert (#3965) 2020-05-12 08:32:44 -04:00
30e343862f pin TF to 2.1 (#4297)
* pin TF to 2.1

* Pin flake8 as well
2020-05-11 21:03:30 -04:00
56e8ef632f [ci] Restrict GPU tests to actual code commits 2020-05-11 20:40:41 -04:00
ba6f6e44a8 [ci] Re-enable torch GPU tests 2020-05-12 00:05:36 +00:00
9524956819 Documentation specification (#4294) 2020-05-11 16:43:57 -04:00
61d22f9cc7 Simplify cache vars and allow for TRANSFORMERS_CACHE env (#4226)
* simplify cache vars and allow for TRANSFORMERS_CACHE env

As it currently stands, "TRANSFORMERS_CACHE" is not an accepted variable. It seems that these variables were not updated when moving from pytorch_transformers to transformers. In addition, the fallback procedure could be improved and simplified; Pathlib seems redundant here.

* Update file_utils.py
2020-05-11 15:24:02 -04:00
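A minimal sketch of the env var above; the path is illustrative:

```python
import os

# Must be set before transformers is imported, since the cache location
# is resolved at import time.
os.environ["TRANSFORMERS_CACHE"] = "/mnt/shared/hf-cache"

from transformers import BertModel  # noqa: E402

model = BertModel.from_pretrained("bert-base-uncased")  # lands in the custom cache
```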
cd40cb8879 Fix special token doc (#4292) 2020-05-11 15:05:36 -04:00
82601f4c1a Allow gpt2 to be exported to valid ONNX (#4244)
* allow gpt2 to be exported to valid ONNX model

* cast size from int to float explictly
2020-05-11 14:55:55 -04:00
39994051e4 Add migrating from pytorch-transformers (#4273)
"Migrating from pytorch-transformers to transformers" is missing in the main document. It is available in the main `readme` thought. Just move it to the document.
2020-05-11 13:35:13 -04:00
051dcb2a07 CamemBERT does not make use of Token Type IDs (#4289) 2020-05-11 13:31:03 -04:00
41e8291217 Add ALBERT to the Tensorflow to Pytorch model conversion cli (#3933)
* Add ALBERT to convert command of transformers-cli

* Document ALBERT tf to pytorch model conversion
2020-05-11 13:10:00 -04:00
3f42eb979f Documentation: fix links to NER examples (#4279)
* docs: fix link to token classification (NER) example

* examples: fix links to NER scripts
2020-05-11 12:48:21 -04:00
8fdb7997c6 Align sentiment-analysis' tokenizer (currently uncased) to the model (uncased). (#4264) 2020-05-11 12:45:53 -04:00
4658896ee1 [Marian] Fix typo in docstring (#4284) 2020-05-11 11:47:51 -04:00
bf64b8cf09 Model card for bert-turkish-question-answering question-answering model (#4281)
* Create README.md

* Update model_cards/lserinol/bert-turkish-question-answering/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-11 11:32:25 -04:00
94b57bf796 [TF 2.2 compat] use tf.VariableAggregation.ONLY_FIRST_REPLICA (#4283)
* Fix the issue to properly run the accumulator with TF 2.2

* Apply style

* Fix training_args_tf for TF 2.2

* Fix the TF training args when only one GPU is available

* Remove the fixed version of TF in setup.py
2020-05-11 11:28:37 -04:00
cffbb3d8ed Update README.md (#4276) 2020-05-11 11:24:41 -04:00
5f50d619dd Fix XTREME link + add number of eval documents + fix usage code (#4280) 2020-05-11 11:24:10 -04:00
7751be7cee fix reformer apex scaling issue (#4242) 2020-05-11 16:53:42 +02:00
ac7d5f67a2 [Reformer] Add Enwiki8 Reformer Model - Adapt convert script (#4282)
* adapt convert script

* update convert script

* finish

* fix marian pretrained docs
2020-05-11 16:38:07 +02:00
336116d960 Reformer enwik8 - Model card (#4286) 2020-05-11 16:22:08 +02:00
b290c32e16 [docs] fix typo (#4249) 2020-05-10 14:07:08 -04:00
3487be75ef [Marian] documentation and AutoModel support (#4152)
- MarianSentencepieceTokenizer - > MarianTokenizer
- Start using unk token.
- add docs page
- add better generation params to MarianConfig
- more conversion utilities
2020-05-10 13:54:57 -04:00
9d2f467bfb [README] Corrected some grammatical mistakes (#4199) 2020-05-10 09:02:36 -04:00
7b75aa9fa5 [TPU] Doc, fix xla_spawn.py, only preprocess dataset once (#4223)
* [TPU] Doc, fix xla_spawn.py, only preprocess dataset once

* Update examples/README.md

* [xla_spawn] Add `_mp_fn` to other Trainer scripts

* [TPU] Fix: eval dataloader was None
2020-05-08 14:10:05 -04:00
274d850d34 Fix #4098 2020-05-08 12:39:46 -04:00
26dad0a9fa example updated to use generation pipeline (#4230)
* example updated to use generation pipeline

* Update model_cards/LorenzoDeMattei/GePpeTto/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-08 09:45:10 -04:00
9ebb5b2a54 Model card for allegro/herbert-klej-cased-tokenizer-v1 (#4184) 2020-05-08 09:42:43 -04:00
9e54efd004 Model card for allegro/herbert-klej-cased-v1 (#4183) 2020-05-08 09:42:28 -04:00
a8b798e6c4 Model card for spanish electra small (#4196) 2020-05-08 09:30:15 -04:00
242005d762 Create README.md (#4132)
* Create README.md

* Adding code fence around code block
2020-05-08 09:27:29 -04:00
5940c73bbb Create README.md (#4179)
model card for my De Novo Drug discovery model using MLM
2020-05-08 09:25:36 -04:00
cf08830c28 [Pipeline, Generation] tf generation pipeline bug (#4217)
* fix PR

* move tests to correct place
2020-05-08 08:30:05 -04:00
8bf7312654 Add AlbertForPreTraining and TFAlbertForPreTraining models. (#4057)
* Add AlbertForPreTraining and TFAlbertForPreTraining models.

* PyTorch conversion

* TensorFlow conversion

* style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2020-05-07 19:44:51 -04:00
c99fe0386b [doc] Fix broken links + remove crazy big notebook 2020-05-07 18:44:18 -04:00
66113bd626 Create README.md (#4202) 2020-05-07 18:31:22 -04:00
6669915b65 [examples] Add column for pytorch-lightning support 2020-05-07 15:26:58 -04:00
612fa1b10b Examples readme.md (#4215)
* README

* Update README.md
2020-05-07 15:00:06 -04:00
2e57824374 Pin isort and tf <= 2.1.0 2020-05-07 14:42:00 -04:00
e7cfc1a313 Release: v2.9.0 2020-05-07 14:15:20 -04:00
0ae96ff8a7 BIG Reorganize examples (#4213)
* Created using Colaboratory

* [examples] reorganize files

* remove run_tpu_glue.py as superseded by TPU support in Trainer

* Bugfix: int, not tuple

* move files around
2020-05-07 13:48:44 -04:00
cafa6a9e29 [Trainer] Ability to specify optimizer/scheduler at init
cc @patrickvonplaten @thomwolf
2020-05-07 11:25:26 -04:00
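A sketch of the new `optimizers` hook above, assuming `model` and `train_dataset` already exist (they are placeholders here):

```python
import torch
from transformers import Trainer, TrainingArguments

# `optimizers` takes an (optimizer, lr_scheduler) pair.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda _: 1.0)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out"),
    train_dataset=train_dataset,
    optimizers=(optimizer, scheduler),
)
trainer.train()
```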
e4fd5e3999 Use with_extension to change the extension (#4203)
As per https://github.com/huggingface/transformers/pull/3934#discussion_r421307659
2020-05-07 11:14:56 -04:00
ebf80e2e70 Tpu trainer (#4146)
* wip

* wip

* a last wip

* Better logging when using TPUs

* Correct argument name

* Tests

* fix

* Metrics in evaluation

* Update src/transformers/training_args.py

* [tpu] Use launcher script instead

* [tpu] lots of tweaks

* Fix formatting

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-07 10:34:04 -04:00
026097b9ee Ensure fast tokenizer can construct tensor without pad token if only one sample is provided. (#4201) 2020-05-07 10:02:53 -04:00
0a6cbea0a5 Rewritten batch support in pipelines. (#4154)
* Rewritten batch support in pipelines.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix imports sorting 🔧

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Set pad_to_max_length=True by default on Pipeline.

* Set pad_to_max_length=False for generation pipelines.

Most generation models don't have a padding token.

* Address @joeddav review comment: Uniformized *args.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Address @joeddav review comment: Uniformized *args (second).

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-05-07 09:52:40 -04:00
99d1a69444 fix examples (#4192) 2020-05-07 10:54:48 +02:00
74ffc9ea6b [Reformer] Fix example and error message (#4191)
* fix example reformer

* fix error message and example docstring

* improved error message
2020-05-07 10:50:11 +02:00
96c78396ce fix docstring reformer (#4190) 2020-05-07 10:28:31 +02:00
dca34695d0 Reformer (#3351)
* first copy & past commit from Bert and morgans LSH code

* add easy way to compare to trax original code

* translate most of function

* make trax lsh self attention deterministic with numpy seed + copy paste code

* add same config

* add same config

* make layer init work

* implemented hash_vectors function for lsh attention

* continue reformer translation

* hf LSHSelfAttentionLayer gives same output as trax layer

* refactor code

* refactor code

* refactor code

* refactor

* refactor + add reformer config

* delete bogus file

* split reformer attention layer into two layers

* save intermediate step

* save intermediate step

* make test work

* add complete reformer block layer

* finish reformer layer

* implement causal and self mask

* clean reformer test and refactor code

* fix merge conflicts

* fix merge conflicts

* update init

* fix device for GPU

* fix chunk length init for tests

* include morgans optimization

* improve memory a bit

* improve comment

* factorize num_buckets

* better testing parameters

* make whole model work

* make lm model work

* add t5 copy paste tokenizer

* add chunking feed forward

* clean config

* add improved assert statements

* make tokenizer work

* improve test

* correct typo

* extend config

* add complexer test

* add new axial position embeddings

* add local block attention layer

* clean tests

* refactor

* better testing

* save intermediate progress

* clean test file

* make shorter input length work for model

* allow variable input length

* refactor

* make forward pass for pretrained model work

* add generation possibility

* finish dropout and init

* make style

* refactor

* add first version of RevNet Layers

* make forward pass work and add convert file

* make uploaded model forward pass work

* make uploaded model forward pass work

* refactor code

* add namedtuples and cache buckets

* correct head masks

* refactor

* made reformer more flexible

* make style

* remove set max length

* add attention masks

* fix up tests

* fix lsh attention mask

* make random seed optional for the moment

* improve memory in reformer

* add tests

* make style

* make sure masks work correctly

* detach gradients

* save intermediate

* correct backprop through gather

* make style

* change back num hashes

* rename to labels

* fix rotation shape

* fix detach

* update

* fix trainer

* fix backward dropout

* make reformer more flexible

* fix conflict

* fix

* fix

* add tests for fixed seed in reformer layer

* fix trainer typo

* fix typo in activations

* add fp16 tests

* add fp16 training

* support fp16

* correct gradient bug in reformer

* add fast gelu

* re-add dropout for embedding dropout

* better naming

* better naming

* renaming

* finalize test branch

* finalize tests

* add more tests

* finish tests

* fix

* fix type trainer

* fix fp16 tests

* fix tests

* fix tests

* fix tests

* fix issue with dropout

* fix dropout seeds

* correct random seed on gpu

* finalize random seed for dropout

* finalize random seed for dropout

* remove duplicate line

* correct half precision bug

* make style

* refactor

* refactor

* docstring

* remove sinusoidal position encodings for reformer

* move chunking to modeling_utils

* make style

* clean config

* make style

* fix tests

* fix auto tests

* pretrained models

* fix docstring

* update conversion file

* Update pretrained_models.rst

* fix rst

* fix rst

* update copyright

* fix test path

* fix test path

* fix small issue in test

* include reformer in generation tests

* add docs for axial position encoding

* finish docs

* Update convert_reformer_trax_checkpoint_to_pytorch.py

* remove isort

* include sams comments

* remove wrong comment in utils

* correct typos

* fix typo

* Update reformer.rst

* applied morgans optimization

* make style

* make gpu compatible

* remove bogus file

* big test refactor

* add example for chunking

* fix typo

* add to README
2020-05-07 10:17:01 +02:00
877fc56410 change order pytorch/tf in readme (#4167) 2020-05-06 16:31:07 -04:00
aad50151f3 TF version of the trainer (#4017)
* First commit to add a TF version of the trainer.

* Make the TF trainer closer to what the PT trainer looks like

* Refactoring common code between the PT and TF trainer into an util file.

* Some bugfix + better similarity with the PT trainer

* Add missing class in transformers init

* Bugfix over prediction + use classification report instead of simple metrics

* Fix name error

* Fix optimization tests + style

* Apply style

* Several bugfix for multi-gpu training

* Apply style

* Apply style

* Add glue example for the TF trainer

* Several bugfixes + address the reviews

* Fix on the TF training args file

* Add a debug mode

* Bugfix in utils_ner.py when segment_ids is None

* Apply style

* Apply style

* Add TPU strategy

* Fix selection strategy
2020-05-06 12:56:52 -04:00
25296b12aa Fix overwrite_cache behaviour for pytorch lightning examples (#4093) 2020-05-06 12:24:49 -04:00
9972562d33 Include ElectraPreTrainedModel into __init__ (#4173) 2020-05-06 12:00:23 -04:00
ff8ed52dd8 Camembert-large-fquad model card (#4143)
Description for the model card describing the camembert-large-fquad model.
2020-05-06 10:41:07 -04:00
4c3be2e718 Add model card for the NER model (#4162) 2020-05-06 10:40:55 -04:00
17ae0363db Fix markdown to show the results table properly (#4119) 2020-05-06 10:38:29 -04:00
a638e986f4 fix hard wired pad token id (#4138) 2020-05-06 00:42:34 +02:00
fd2174664c [Trainer] W&B: Enable model watch
See https://github.com/huggingface/transformers/pull/3916
2020-05-05 10:59:23 -04:00
79b1c6966b Pytorch 1.5.0 (#3973)
* Standard deviation can no longer be set to 0

* Remove torch pinned version

* 9th instead of 10th, silly me
2020-05-05 10:23:01 -04:00
818463ee8e Trainer: add logging through Weights & Biases (#3916)
* feat: add logging through Weights & Biases

* feat(wandb): make logging compatible with all scripts

* style(trainer.py): fix formatting

* [Trainer] Tweak wandb integration

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-04 22:42:27 -04:00
858b1d1e5a allow an already created tensorboard SummaryWriter be passed to Trainer 2020-05-04 19:58:24 -04:00
8e67573a64 [EncoderDecoder Tests] Improve tests (#4046)
* Hoist bert model tester for Patrick

* indent

* make tests work

* Update tests/test_modeling_bert.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>

Co-authored-by: sshleifer <sshleifer@gmail.com>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-04 02:18:36 +02:00
6af3306a1d Add decoder specific error message for T5Stack.forward (#4128) 2020-05-03 12:40:08 +02:00
1cdd2ad2af Fix #2941 (#4109)
* Fix of issue #2941

Reshaped score array to avoid `numpy` ValueError.

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-02 11:20:30 -04:00
5f4f6b65b3 distilroberta-base-finetuned-sentiment (#4115)
* Create model card

Create Model card for distilroberta-base-finetuned-sentiment

* Update model_cards/mrm8488/distilroberta-base-finetuned-sentiment/README.md

* Update model_cards/mrm8488/distilroberta-base-finetuned-sentiment/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-02 11:19:31 -04:00
7da051f135 model card for surajp/albert-base-sanskrit (#4114)
* Create README.md

* Update model_cards/surajp/albert-base-sanskrit/README.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-02 11:15:39 -04:00
14911e2e12 Create README.md (#4112) 2020-05-02 10:52:12 -04:00
9e97c87539 Added huseinzol05/gpt2-345M-bahasa-cased (#4102) 2020-05-02 10:51:15 -04:00
4c5bd92183 Update run_pl_glue.py (#4117) 2020-05-02 10:38:30 -04:00
5282b31df4 Update run_pl_ner.py (#4118) 2020-05-02 10:38:21 -04:00
1e616c0af3 NER: parse args from .args file or JSON (#4110)
* ner: parse args from .args file or JSON

* examples: mention json-based configuration file support for run_ner script
2020-05-02 10:29:17 -04:00
abb1fa3f37 Update README.md 2020-05-02 10:32:00 +02:00
0ccbfd2868 Update Reformer ReadME 2020-05-02 10:31:00 +02:00
2d8340a91f [Reformer] Move model card to google model (#4113)
* correct model card

* remove model card from patrick von platen
2020-05-02 10:25:22 +02:00
d713cfc5eb GePpeTto 🇮🇹: Fixpath to model card 2020-05-01 11:48:58 -04:00
f3d44301cc GePpeTto model 🇮🇹 (#4099)
* Create GePpeTto.md

* Update model_cards/LorenzoDeMattei/GePpeTto.md

* Update model_cards/LorenzoDeMattei/GePpeTto.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-05-01 11:46:42 -04:00
27d55125e6 Configs: saner num_labels in configs. (#3967) 2020-05-01 11:28:55 -04:00
e80be7f1d0 docs: add xlm-roberta section to multi-lingual section (#4101) 2020-05-01 11:06:58 -04:00
18db92dd9a [testing] add timeout_decorator (#3543) 2020-05-01 09:05:47 -04:00
b8686174be Merge pull request #3934 from huggingface/examples_args_from_files
[qol] example scripts: parse args from .args file or JSON
2020-04-30 22:40:13 -04:00
f39217a5ec [tests] Light cleanup of tempfile in tests/ 2020-04-30 22:30:15 -04:00
f54dc3f4d5 [ci] Load pretrained models into the default (long-lived) cache
There's an inconsistency right now where:
- we load some models into CACHE_DIR
- and some models in the default cache
- and often, in both for the same models

When running the RUN_SLOW tests, this takes a lot of disk space, time, and bandwidth.

I'd rather always use the default cache
2020-04-30 22:30:15 -04:00
6b410bedfc Model Card: gaochangkuan README.md (#4033)
* Create README.md

* Update README.md

* tweak

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-30 22:26:58 -04:00
8829ace4aa added gpt2 117m bahasa readme
(cherry picked from commit a4a673a1d0bec0bf4085eef021acb788ca1f5eb5)
2020-04-30 22:20:00 -04:00
1851a64b6f create model_card camembert-base-wikipedia-4gb 2020-04-30 22:16:12 -04:00
443e5e34af Create README.md 2020-04-30 22:16:00 -04:00
60e1556a44 Create model_card camembert-base-ccnet-4gb 2020-04-30 22:15:47 -04:00
fa9365eca5 Create README.md 2020-04-30 22:15:38 -04:00
afe002b04c Create README.md 2020-04-30 22:15:23 -04:00
8b5e5ebcf9 Continue training args and tqdm in notebooks (#3939)
* Continue training args

* Continue training args

* added explaination

* added explaination

* added explaination

* Fixed tqdm auto

* Update src/transformers/training_args.py

Co-Authored-By: Julien Chaumond <chaumond@gmail.com>

* Update src/transformers/training_args.py

* Update src/transformers/training_args.py

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-30 22:14:08 -04:00
ab90353f1a [cli] {login, upload, s3} display more helpful error messages 2020-04-30 12:51:06 -04:00
452dd0e4d9 [ci] Align test_hf_api.py with API change 2020-04-30 12:06:01 -04:00
7f9193ef09 Fixed Style Inconsistency (#3976) 2020-04-30 14:33:09 +02:00
64070cbb88 Fix TF input docstrings to refer to tf.Tensor rather than torch.FloatTensor. (#4051) 2020-04-30 14:28:56 +02:00
e73595bd64 Remove jitted method so that our models are pickable. (#4050) 2020-04-29 09:53:19 -04:00
2c77842887 [Fix common tests on GPU] send model, ids to torch_device (#4014) 2020-04-29 09:47:20 -04:00
6faca88ee0 Align MarianMT with #4030
cc @sshleifer
2020-04-28 20:35:20 -04:00
211e130811 [github] Issue templates: populate some labels
cc @bramvanroy @stefan-it
2020-04-28 20:34:34 -04:00
455c639093 CDN urls (#4030)
* [file_utils] use_cdn + documentation

* Move to cdn. urls for weights

* [urls] Hotfix for bert-base-japanese
2020-04-28 20:27:14 -04:00
8ba4c5885f Allow a more backward compatible behavior of max_len_single_sentence and max_len_sentences_pair (#3994)
* Allow a more backward-compatible behavior of max_len_single_sentence and max_len_sentences_pair

* The style and quality are now top-notch
2020-04-29 01:13:59 +02:00
847e7f3379 MarianMTModel.from_pretrained('Helsinki-NLP/opus-marian-en-de') (#3908)
Co-Authored-By: Stefan Schweter <stefan@schweter.it>
2020-04-28 18:22:37 -04:00
d714dfeaa8 [isort] add known 3rd party to setup.cfg (#4053)
* add known 3rd party to setup.cfg

* comment

* Update CONTRIBUTING.md

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-28 17:12:00 -04:00
d52b0e294a Minor Readme Fixes (#4056)
Added contact info and fixed typos.
2020-04-28 16:42:15 -04:00
55adefe428 Add license information to model cards (#3864)
Close #3357
2020-04-28 16:40:21 -04:00
0ac6d0bf33 Create README.md
I created a Japanese binary classification model.
2020-04-28 15:35:30 -04:00
c73c83b0e6 Small cosmetic changes to CamemBERT model card 2020-04-28 15:32:55 -04:00
4a94c062a4 Provide model card for roberta-base-squad2-covid 2020-04-28 15:29:30 -04:00
c7d06b79ae Fix #3954 - GPT2 is not traceable (#3955)
* Update sqrt computation so it can survive a torch.jit.trace

* Update modeling_gpt2.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2020-04-28 21:18:56 +02:00
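A sketch of what "traceable" means for the fix above, using the `torchscript` config flag so the model returns tuples, which jit tracing requires:

```python
import torch
from transformers import GPT2Model

model = GPT2Model.from_pretrained("gpt2", torchscript=True)
model.eval()

dummy_input = torch.randint(0, model.config.vocab_size, (1, 8))
traced = torch.jit.trace(model, (dummy_input,))
torch.jit.save(traced, "gpt2_traced.pt")
```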
9a0a8c1c6f add examples to doc (#4045) 2020-04-28 16:33:23 +02:00
fa49b9afea Clean Encoder-Decoder models with Bart/T5-like API and add generate possibility (#3383)
* change encoder decoder style to bart & t5 style

* make encoder decoder generation dummy work for bert

* make style

* clean init config in encoder decoder

* add tests for encoder decoder models

* refactor and add last tests

* refactor and add last tests

* fix attn masks for bert encoder decoder

* make style

* refactor prepare inputs for Bert

* refactor

* finish encoder decoder

* correct typo

* add docstring to config

* finish

* add tests

* better naming

* make style

* fix flake8

* clean docstring

* make style

* rename
2020-04-28 15:11:09 +02:00
180585741c [Generation] Generation should allow to start with empty prompt (#3993)
* fix empty prompt

* fix length in generation pipeline
2020-04-28 14:33:15 +02:00
52679fbc2e add dialogpt training tips (#3996) 2020-04-28 14:32:31 +02:00
b5c6d3d4c7 notebooks: minor fix for community provided models example (#4025) 2020-04-28 09:12:25 +02:00
2fade302ac camembert-base-fquad
Model card for illuin release of camembert-base-fquad
2020-04-27 18:29:55 -04:00
20c3b8cab4 Create model card 2020-04-27 18:27:46 -04:00
b3f272ffcb Create model card 2020-04-27 18:27:04 -04:00
518f291eef add model card for Hindi-BERT 2020-04-27 18:25:16 -04:00
d7b3bf547c Model cards for KoELECTRA 2020-04-27 18:21:01 -04:00
db9d56c08a Add modelcard for Hate-speech-CNERG/dehatebert-mono-arabic model (#3979)
* Add dehatebert-mono-arabic readme card

* Update dehatebert-mono-arabic model card
2020-04-27 18:18:54 -04:00
41750a6cff Fix typos 2020-04-27 13:25:53 -04:00
12bb7fe770 Fix t5 doc typos (#3978)
* Fix typo in intro and add line under

* Add missing blank line under

* Correct types under
2020-04-27 18:27:15 +02:00
97a375484c rm boto3 dependency 2020-04-27 11:17:14 -04:00
4e817ff418 Create README.md (#3966) 2020-04-25 09:16:40 -04:00
73d6a2f901 [model_cards] xlnet_chinese_large & roberta_chinese_large 2020-04-24 16:12:42 -04:00
623ba0236d Create README.md (#3882) 2020-04-24 15:57:01 -04:00
f4078e0db6 Feat/add model card (#3923)
* add model card for gpt2-imdb-ctrl

* fix title

* add sentiment control description
2020-04-24 10:24:28 -04:00
03322b4261 Create README.md (#3917) 2020-04-24 10:24:00 -04:00
c811526004 [examples] For convenience, also save the tokenizer
Close #3921
2020-04-24 09:52:42 -04:00
b0167632ce Shuffle train subset for summarization example (#3909)
* Shuffle train subset

* Cleaner shuffle
2020-04-24 07:55:34 -04:00
c53cc018de [Trainer] Fix _rotate_checkpoints
Close #3920
2020-04-23 23:59:43 +00:00
cbbb3c43c5 [hubconf] Modify pythonpath to get canonical imports to work
See https://github.com/huggingface/transformers/pull/3881/files#r412292660

Should we remove SRC_DIR from sys.path right after the imports, @aaugustin?
2020-04-23 16:27:43 -04:00
77b75d2c78 Fix for #3873 to change type of exponent parameter for torch.pow() call from int to float (#3924) 2020-04-23 14:25:31 -04:00
6ba254ee54 quick fix wording readme for community models (#3900) 2020-04-23 14:19:45 -04:00
a79a9e1241 Fix TFAlbertForSequenceClassification classifier dropout probability. It was set to config.hidden_dropout_prob, but should be config.classifier_dropout_prob. (#3928) 2020-04-23 13:18:16 -04:00
8e093e5981 Remove 50k limits bug 2020-04-23 11:15:09 -04:00
6af5a54c28 [Trainer] reuse constant 2020-04-23 11:02:05 -04:00
7c2a32ff88 [housekeeping] super() 2020-04-23 10:43:22 -04:00
a946b6b51b [housekeeping] Upgrade # type Python 2 syntax
cc @sshleifer
2020-04-23 10:39:24 -04:00
cb3c2212c7 Create model card (#3890)
Model: TinyBERT-spanish-uncased-finetuned-ner
2020-04-22 14:56:43 -04:00
d698b87f20 Update comparison table (#3889) 2020-04-22 14:54:17 -04:00
13dd2acca4 Bump tokenizers version to final 0.7.0 (#3898) 2020-04-22 11:02:29 -04:00
f16540fcba Pipeline for Text Generation: GenerationPipeline (#3758)
* Add GenerationPipeline

* Fix parameter names

* Correct parameter __call__ parameters

* Add model type attribute and correct function calls for prepare_input

* Take out trailing commas from init attributes

* Remove unnecessary tokenization line

* Implement support for multiple text inputs

* Apply generation support for multiple input text prompts

* Take out tensor coercion

* Take out batch index

* Add text prompt to return sequence

* Squeeze token tensor before decoding

* Return only a single list of sequences if only one prompt was used

* Correct results variable name

* Add GenerationPipeline to SUPPORTED_TASKS with the alias, initialized w/ GPT2

* Registered AutoModelWithLMHead for both pt and tf

* Update docstring for GenerationPipeline

* Add kwargs parameter to mode.generate

* Take out kwargs parameter after all

* Add generation pipeline example in pipeline docstring

* Fix max length by squeezing tokens tensor

* Apply ensure_tensor_on_device to pytorch tensor

* Include generation step in torch.no_grad

* Take out input from prepare_xlm_input and set 'en' as default xlm_language

* Apply framework specific encoding during prepare_input

* Format w make style

* Move GenerationPipeline import to follow proper import sorting

* Take out trailing comma from generation dict

* Apply requested changes

* Change name to TextGenerationPipeline

* Apply TextGenerationPipeline rename to __init__

* Changing alias to text-generation

* Set input mapping as input to ensure_tensor_on_device

* Fix assertion placement

* Add test_text_generation

* Add TextGenerationPipeline to PipelineCommonTests

* Take out whitespace

* Format __init__ w black

* Fix __init__ style

* Format __init__

* Add line to end of __init__

* Correct model tokenizer set for test_text_generation

* Ensure to return list of list, not list of string (to pass test)

* Limit test models to only 3 to limit runtime to address circleCI timeout error

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Remove argument docstring, __init__, add additional __call__ arguments, and reformat results to list of dict

* Fix blank result list

* Add TextGenerationPipeline to pipelines.rst

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Fix typos from adding PADDING_TEXT_TOKEN_LENGTH

* Fix incorrectly moved result list

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

* Update src/transformers/pipelines.py

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>

* Add back generation line and make style

* Take out blank whitespace

* Apply new alias, text-generation, to test_pipelines

* Fix text generation alias in test

* Update src/transformers/pipelines.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-22 09:37:03 -04:00
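Using the new pipeline is a one-liner; a minimal sketch (the "text-generation" alias defaults to GPT-2; prompt and max_length are illustrative):

```python
from transformers import pipeline

# "text-generation" is the alias registered for TextGenerationPipeline.
generator = pipeline("text-generation")
print(generator("The Hugging Face library is", max_length=30))
```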
1dc9b3c784 Fixes #3877 2020-04-22 01:15:10 +00:00
dd9d483d03 Trainer (#3800)
* doc

* [tests] Add sample files for a regression task

* [HUGE] Trainer

* Feedback from @sshleifer

* Feedback from @thomwolf + logging tweak

* [file_utils] when downloading concurrently, get_from_cache will use the cached file for subsequent processes

* [glue] Use default max_seq_length of 128 like before

* [glue] move DataTrainingArguments around

* [ner] Change interface of InputExample, and align run_{tf,pl}

* Re-align the pl scripts a little bit

* ner

* [ner] Add integration test

* Fix language_modeling with API tweak

* [ci] Tweak loss target

* Don't break console output

* amp.initialize: model must be on right device before

* [multiple-choice] update for Trainer

* Re-align to 827d6d6ef071029cfe82838a18dab046b5813976
2020-04-21 20:11:56 -04:00
eb5601b0a5 [ci] Pin torch version while we update 2020-04-21 15:46:18 -04:00
53f5ef6df5 create readme for spentaur/yelp model (#3874)
* create readme for spentaur/yelp model

* update spentaur/yelp/README.md

* remove typo
2020-04-21 15:31:36 -04:00
d32585a304 Fix Torch.hub + Integration test 2020-04-21 14:13:30 -04:00
7d40901ce3 Fix Documentation issue in BertForMaskedLM forward (#3855) 2020-04-21 09:08:20 +02:00
b1ff0b2ae7 Fix bug in examples: double wrap into DataParallel during eval 2020-04-20 19:37:44 -04:00
7f23af1684 added electra model
(cherry picked from commit b5f2dc5d627d44b8cbb0ccf8ad2b46bea211a236)
2020-04-20 17:17:58 -04:00
03121deba3 New model added
The first model added to the repo
2020-04-20 17:10:01 -04:00
15b9868f8b Create model card 2020-04-20 17:07:34 -04:00
2c05b8a56c Remove tqdm logging when using pipelines. (#3833)
Introduce a tqdm_enabled parameter on squad_convert_examples_to_features(), defaulting to True and set to False in QA pipelines.
2020-04-20 22:58:52 +02:00
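A sketch of the new flag from user code (the data path and feature parameters are illustrative, and the rest of the signature is assumed to match the SQuAD processor of this era):

```python
from transformers import AutoTokenizer, SquadV2Processor, squad_convert_examples_to_features

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
examples = SquadV2Processor().get_dev_examples("path/to/squad")  # illustrative path

features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
    tqdm_enabled=False,  # new flag: silence the progress bar, as QA pipelines now do
)
```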
c79b550dd0 Add qas_id to SquadResult and SquadExample (#3745)
* Add qas_id

* Fix incorrect name in squad.py

* Make output files optional for squad eval
2020-04-20 16:08:57 -04:00
c4158a6314 [Pipelines] Encode to max length of input not max length of tokenizer for batch input (#3857)
* remove max_length = tokenizer.max_length when encoding

* make style
2020-04-20 14:39:16 -04:00
857ccdb259 exbert links for my albert model cards (#3729)
* exbert links for my albert model cards

* Added exbert tag to the metadata block

* Adding "how to cite"
2020-04-20 10:54:39 -04:00
a504cb49ec [examples] fix summarization do_predict (#3866) 2020-04-20 10:49:56 -04:00
52c85f847a Update README.md 2020-04-20 10:10:56 -04:00
a21d4fa410 add "by" to ReadMe 2020-04-18 18:07:17 +02:00
827d6d6ef0 Cleanup fast tokenizers integration (#3706)
* First pass on utility classes and python tokenizers

* finishing cleanup pass

* style and quality

* Fix tests

* Updating following @mfuntowicz comment

* style and quality

* Fix Roberta

* fix batch_size/seq_length in BatchEncoding

* add alignment methods + tests

* Fix OpenAI and Transfo-XL tokenizers

* adding trim_offsets=True default for GPT2 and RoBERTa

* style and quality

* fix tests

* add_prefix_space in roberta

* bump up tokenizers to rc7

* style

* unfortunately tensorflow does not like these - removing shape/seq_len for now

* Update src/transformers/tokenization_utils.py

Co-Authored-By: Stefan Schweter <stefan@schweter.it>

* Adding doc and docstrings

* making flake8 happy

Co-authored-by: Stefan Schweter <stefan@schweter.it>
2020-04-18 13:43:57 +02:00
60a42ef1c0 [model_cards] Fix CamemBERT table markdown
see https://github.com/huggingface/transformers/pull/3836
2020-04-17 20:21:15 -04:00
88aecee6a2 [ci] GitHub-hosted runner has no space left on device 2020-04-17 20:16:00 -04:00
73efa694e6 Update camembert-base-README.md (#3836) 2020-04-17 20:08:13 -04:00
e9d0bc027a [Config, Serialization] more readable config serialization (#3797)
* better config serialization

* finish configuration utils
2020-04-17 20:07:18 -04:00
8b63a01d95 XLM tokenizer should encode with bos token (#3791)
* XLM tokenizer should encode with bos token

* Update tests
2020-04-17 11:28:55 -04:00
1d4a35b396 Higher tolerance for past testing in TF T5 (#3844) 2020-04-17 11:26:16 -04:00
d13eca11e2 Higher tolerance for past testing in T5 (#3843) 2020-04-17 11:25:14 -04:00
b0c9fbb293 Add workflow to build docs (#3763) 2020-04-17 11:23:18 -04:00
c19727fd38 Add support for the null answer in QuestionAnsweringPipeline (#3441)
* Add support for the null answer in `QuestionAnsweringPipeline`

* black

* Fix min null score computation

* Fix a PR comment
2020-04-17 11:17:21 -04:00
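With null-answer support in place, a usage sketch (question and context are illustrative; handle_impossible_answer is the knob that enables the empty answer):

```python
from transformers import pipeline

qa = pipeline("question-answering")
result = qa(
    question="Who wrote the symphony?",
    context="The transformers library is developed by Hugging Face.",
    handle_impossible_answer=True,  # allow returning an empty answer when none exists
)
print(result)
```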
edf0582c0b Fix token_type_id in BERT question-answering example (#3790)
token_type_id is converted into the segment embedding. For question answering,
this needs to highlight whether a token belongs to sequence 0 or 1.
encode_plus takes care of correctly setting this parameter automatically.
2020-04-17 11:14:12 -04:00
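A minimal sketch of the point behind the fix: let encode_plus build token_type_ids instead of hand-rolling them (model and texts illustrative):

```python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
question, context = "Where is the Eiffel Tower?", "The Eiffel Tower is in Paris."

# encode_plus assigns segment 0 to the question (and its special tokens)
# and segment 1 to the context, which the QA head relies on.
inputs = tokenizer.encode_plus(question, context, return_tensors="pt")
print(inputs["token_type_ids"])
```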
6d00033e97 Question Answering support for Albert and Roberta in TF (#3812)
* Add TFAlbertForQuestionAnswering

* Add TFRobertaForQuestionAnswering

* Update TFAutoModel with Roberta/Albert for QA

* Clean `super` TF Albert calls
2020-04-17 10:45:30 -04:00
f399c00610 Update README 2020-04-17 09:42:22 +02:00
f0c96fafd1 [examples] summarization/bart/finetune.py supports t5 (#3824)
renames `run_bart_sum.py` to `finetune.py`
2020-04-16 15:15:19 -04:00
0cec4fab7d typo: fine-grained token-leven
Changing from "fine-grained token-leven" to "fine-grained token-level"
2020-04-16 15:11:23 -04:00
14cdeee75a Tanh torch warnings 2020-04-16 15:10:35 -04:00
16469fedbd [PretrainedTokenizer] Factor out tensor conversion method (#3777) 2020-04-16 15:02:43 -04:00
80a1694514 [Examples, T5] Change newstest2013 to newstest2014 and clean up (#3817)
* Refactored use of newstest2013 to newstest2014. Fixed a bug where argparse consumed the first command-line argument as model_size instead of using the default, by forcing explicit --model_size flag inclusion

* More pythonic file handling through 'with' context

* COSMETIC - ran Black and isort

* Fixed reference to number of lines in newstest2014

* Fixed failing test. More pythonic file handling

* finish PR from tholiao

* remove outcommented lines

* make style

* make isort happy

Co-authored-by: Thomas Liao <tholiao@gmail.com>
2020-04-16 20:00:41 +02:00
d486795158 JIT not compatible with PyTorch/XLA (#3743) 2020-04-16 11:19:24 -04:00
b1e2368b32 Typo fix (#3821) 2020-04-16 11:04:32 -04:00
baca8fa8e6 clean pipelines (#3795) 2020-04-16 10:21:34 -04:00
38f7461df3 [TFT5, Cache] Add cache to TFT5 (#3772)
* correct gpt2 test inputs

* make style

* delete modeling_gpt2 change in test file

* translate from pytorch

* correct tests

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* make tensorflow t5 caching work

* make style

* clean reorder cache

* remove unnecessary spaces

* fix test
2020-04-16 16:14:52 +02:00
a5b249472e change pad token id to config pad token id (#3793) 2020-04-16 15:58:57 +02:00
dbd041243d [cleanup] factor out get_head_mask, invert_attn_mask, get_exten… (#3806)
* Delete some copy pasted code
2020-04-16 09:55:25 -04:00
d22894dfd4 [Docs] Add DialoGPT (#3755)
* add dialoGPT

* update README.md

* fix conflict

* update readme

* add code links to docs

* Update README.md

* Update dialo_gpt2.rst

* Update pretrained_models.rst

* Update docs/source/model_doc/dialo_gpt2.rst

Co-Authored-By: Julien Chaumond <chaumond@gmail.com>

* change filename of dialogpt

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-16 09:04:32 +02:00
c59b1e682d [examples] unit test for run_bart_sum (#3544)
- adds pytorch-lightning dependency
2020-04-15 18:35:01 -04:00
301bf8d1b4 Create Modelcard for Reformer Model 2020-04-15 16:26:24 +02:00
01c37dcdb5 [Config, Caching] Remove output_past everywhere and replace by use_cache argument (#3734)
* remove output_past from pt

* make style

* add optional input length for gpt2

* add use cache to prepare input

* save memory in gpt2

* correct gpt2 test inputs

* make past input optional for gpt2

* finish use_cache for all models

* make style

* delete modeling_gpt2 change in test file

* correct docstring

* correct is true statements for gpt2
2020-04-14 14:40:28 -04:00
092cf881a5 [Generation, EncoderDecoder] Apply Encoder Decoder 1.5GB memory… (#3778) 2020-04-13 22:29:28 -04:00
352d5472b0 Shift labels internally within TransfoXLLMHeadModel when called with labels (#3716)
* Shifting labels inside TransfoXLLMHead

* Changed doc to reflect change

* Updated pytorch test

* removed IDE whitespace changes

* black reformat

Co-authored-by: TevenLeScao <teven.lescao@gmail.com>
2020-04-13 18:11:23 +02:00
5ebd898953 fix dataset shuffling for Distributed training (#huggingface#3721) (#3766) 2020-04-13 10:11:18 -04:00
7972a4019f updated dutch squad model card (#3736)
* added model_cards for polish squad models

* corrected mistake in polish design cards

* updated model_cards for squad2_dutch model

* added links to benchmark models

Co-authored-by: Henryk Borzymowski <henryk.borzymowski@pwc.com>
2020-04-11 06:44:59 -04:00
f8c1071c51 Added README huseinzol05/albert-tiny-bahasa-cased (#3746)
* add bert bahasa readme

* update readme

* update readme

* added xlnet

* added tiny-bert and fix xlnet readme

* added albert base

* added albert tiny
2020-04-11 06:42:06 -04:00
700ccf6e35 Fix glue_convert_examples_to_features API breakage (#3742) 2020-04-10 16:03:27 -04:00
b7cf9f43d2 Update tokenizers to 0.7.0-rc5 (#3705) 2020-04-10 14:23:49 -04:00
551b450527 Add run_glue_tpu.py that trains models on TPUs (#3702)
* Initial commit to get BERT + run_glue.py on TPU

* Add README section for TPU and address comments.

* Cleanup TPU bits from run_glue.py (#3)

TPU runner is currently implemented in:
https://github.com/pytorch-tpu/transformers/blob/tpu/examples/run_glue_tpu.py.

We plan to upstream this directly into `huggingface/transformers`
(either `master` or `tpu`) branch once it's been more thoroughly tested.

* Cleanup TPU bits from run_glue.py

TPU runner is currently implemented in:
https://github.com/pytorch-tpu/transformers/blob/tpu/examples/run_glue_tpu.py.

We plan to upstream this directly into `huggingface/transformers`
(either `master` or `tpu`) branch once it's been more thoroughly tested.

* No need to call `xm.mark_step()` explicitly (#4)

Since for gradient accumulation we're accumulating on batches from
`ParallelLoader` instance which on next() marks the step itself.

* Resolve R/W conflicts from multiprocessing (#5)

* Add XLNet in list of models for `run_glue_tpu.py` (#6)

* Add RoBERTa to list of models in TPU GLUE (#7)

* Add RoBERTa and DistilBert to list of models in TPU GLUE (#8)

* Use barriers to reduce duplicate work/resources (#9)

* Shard eval dataset and aggregate eval metrics (#10)

* Shard eval dataset and aggregate eval metrics

Also, instead of calling `eval_loss.item()` every time do summation with
tensors on device.

* Change defaultdict to float

* Reduce the pred, label tensors instead of metrics

As brought up during review, some metrics like f1 cannot be aggregated
via averaging. GLUE task metrics depend largely on the dataset, so
instead we sync the prediction and label tensors so that the metrics can
be computed accurately on those instead.

* Only use tb_writer from master (#11)

* Apply huggingface black code formatting

* Style

* Remove `--do_lower_case` as example uses cased

* Add option to specify tensorboard logdir

This is needed for our testing framework which checks regressions
against key metrics written by the summary writer.

* Using configuration for `xla_device`

* Prefix TPU specific comments.

* num_cores clarification and namespace eval metrics

* Cache features file under `args.cache_dir`

Instead of under `args.data_dir`. This is needed as our test infra uses
data_dir with a read-only filesystem.

* Rename `run_glue_tpu` to `run_tpu_glue`

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-04-10 12:53:54 -04:00
cbad305ce6 [docs] The use of do_lower_case in scripts is on its way to deprecation (#3738) 2020-04-10 12:34:04 -04:00
b169ac9c2b [examples] Generate argparsers from type hints on dataclasses (#3669)
* [examples] Generate argparsers from type hints on dataclasses

* [HfArgumentParser] way simpler API

* Restore run_language_modeling.py for easier diff

* [HfArgumentParser] final tweaks from code review
2020-04-10 12:21:58 -04:00
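A minimal sketch of the dataclass-driven parser added here (field names illustrative):

```python
from dataclasses import dataclass, field
from typing import Optional

from transformers import HfArgumentParser


@dataclass
class ModelArguments:
    model_name_or_path: str = field(default="bert-base-uncased")
    cache_dir: Optional[str] = field(default=None)


# The type hints above become argparse arguments: --model_name_or_path, --cache_dir.
parser = HfArgumentParser(ModelArguments)
(model_args,) = parser.parse_args_into_dataclasses()
print(model_args.model_name_or_path)
```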
7a7fdf71f8 Multilingual BART - (#3602)
- support mbart-en-ro weights
- add MBartTokenizer
2020-04-10 11:25:39 -04:00
f98d0ef2a2 Big cleanup of glue_convert_examples_to_features (#3688)
* Big cleanup of `glue_convert_examples_to_features`

* Use batch_encode_plus

* Cleaner wrapping of glue_convert_examples_to_features for TF

@lysandrejik

* Cleanup syntax, thanks to @mfuntowicz

* Raise explicit error in case of user error
2020-04-10 10:20:18 -04:00
ce2298fb5f [T5, generation] Add decoder caching for T5 (#3682)
* initial commit to add decoder caching for T5

* better naming for caching

* finish T5 decoder caching

* correct test

* added extensive past testing for T5

* clean files

* make tests cleaner

* improve docstring

* improve docstring

* better reorder cache

* make style

* Update src/transformers/modeling_t5.py

Co-Authored-By: Yacine Jernite <yjernite@users.noreply.github.com>

* make set output past work for all layers

* improve docstring

* improve docstring

Co-authored-by: Yacine Jernite <yjernite@users.noreply.github.com>
2020-04-10 01:02:50 +02:00
9384e5f6de Fix force_download of files on Windows (#3697) 2020-04-09 14:44:57 -04:00
bc65afc4df [Exbert] Change style of button 2020-04-09 10:44:42 -04:00
31baeed614 Update quotes
cc @julien-c
2020-04-09 09:09:00 -04:00
f8208fa456 Correct transformers-cli env call 2020-04-09 09:03:19 +02:00
6435b9f908 Updating the TensorFlow models to work as expected with tokenizers v3.0.0 (#3684)
* Updating modeling tf files; adding tests

* Merge `encode_plus` and `batch_encode_plus`
2020-04-08 16:22:44 -04:00
500aa12318 close #3699 2020-04-08 14:32:47 -04:00
a594ee9c84 More doc for model cards (#3698)
see https://github.com/huggingface/transformers/pull/3679#pullrequestreview-389368270
2020-04-08 12:12:52 -04:00
83703cd077 Update doc for {Summarization,Translation}Pipeline and other tweaks 2020-04-08 09:45:00 -04:00
a1b3b4167e Created README.md for model card ChemBERTa (#3666)
* created readme.md

* update readme with fixes

Fixes from PR comments
2020-04-08 09:10:20 -04:00
747907dc5e Fix typo in FeatureExtractionPipeline docstring 2020-04-08 09:08:56 -04:00
715aa5b135 [Bart] Replace config.output_past with use_cache kwarg (#3632) 2020-04-07 19:08:26 -04:00
e344e3d402 [examples] SummarizationDataset cleanup (#3451) 2020-04-07 19:05:58 -04:00
b0ad069517 [Tokenization] fix edge case for bert tokenization (#3517)
* fix edge case for bert tokenization

* add Lysandres comments for improvement

* use new is_pretokenized_flag
2020-04-07 16:26:31 -04:00
80fa0f7812 [Examples, Benchmark] Improve benchmark utils (#3674)
* improve and add features to benchmark utils

* update benchmark style

* remove output files
2020-04-07 16:25:57 -04:00
05deb52dc1 Optimize causal mask using torch.where (#2715)
* Optimize causal mask using torch.where

Instead of multiplying by 1.0 float mask, use torch.where with a bool mask for increased performance.

* Maintain compatibility with torch 1.0.0 - thanks for PR feedback

* Fix typo

* reformat line for CI
2020-04-07 22:19:18 +02:00
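A self-contained sketch of the pattern (shapes illustrative): select through a boolean mask in one torch.where call instead of multiplying by a float mask and adding a large negative offset.

```python
import torch

scores = torch.randn(1, 4, 4)                 # raw attention scores
causal = torch.tril(torch.ones(4, 4)).bool()  # lower-triangular causal mask

# One select instead of scores * mask - 1e4 * (1 - mask) with a float mask.
masked = torch.where(causal, scores, torch.full_like(scores, -1e4))
probs = masked.softmax(dim=-1)
```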
0a4b1068e1 Speedup torch summarization tests (#3663) 2020-04-07 14:01:30 -04:00
5aa8a278a3 Fix roberta checkpoint conversion script (#3642) 2020-04-07 12:03:23 -04:00
11cc1e168b [model_cards] Turn down spurious warnings
Close #3639 + spurious warning mentioned in #3227

cc @lysandrejik @thomwolf
2020-04-07 10:20:19 -04:00
0a9d09b42a fixed TransfoXLLMHeadModel documentation (#3661)
Co-authored-by: TevenLeScao <teven.lescao@gmail.com>
2020-04-07 00:47:51 +02:00
96ab75b8dd Tokenizers v3.0.0 (#3185)
* Renamed num_added_tokens to num_special_tokens_to_add

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Cherry-Pick: Partially fix space only input without special tokens added to the output #3091

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added property is_fast on PretrainedTokenizer and PretrainedTokenizerFast

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Make fast tokenizers unittests work on Windows.

* Entirely refactored unittest for tokenizers fast.

* Remove ABC class for CommonFastTokenizerTest

* Added embeded_special_tokens tests from allenai @dirkgr

* Make embeded_special_tokens tests from allenai more generic

* Uniformize vocab_size as a property for both Fast and normal tokenizers

* Move special tokens handling out of PretrainedTokenizer (SpecialTokensMixin)

* Ensure providing None input raises the same ValueError as the Python tokenizer + tests.

* Fix invalid input for assert_padding when testing batch_encode_plus

* Move add_special_tokens from constructor to tokenize/encode/[batch_]encode_plus methods parameter.

* Ensure tokenize() correctly forwards add_special_tokens to rust.

* Adding None checking on top of encode / encode_batch for TransfoXLTokenizerFast.
Avoid stripping on None values.

* unittests ensure tokenize() also throws a ValueError if provided None

* Added add_special_tokens unittest for all supported models.

* Style

* Make sure TransfoXL test run only if PyTorch is provided.

* Split up tokenizers tests for each model type.

* Fix invalid unittest with new tokenizers API.

* Filter out Roberta openai detector models from unittests.

* Introduce BatchEncoding on fast tokenizers path.

This new structure exposes all the mappings retrieved from Rust.
It also keeps the current behavior with model forward.

* Introduce BatchEncoding on slow tokenizers path.

Backward compatibility.

* Improve error message on BatchEncoding for slow path

* Make add_prefix_space True by default on Roberta fast to match Python in majority of cases.

* Style and format.

* Added typing on all methods for PretrainedTokenizerFast

* Style and format

* Added path for feeding pretokenized (List[str]) input to PretrainedTokenizerFast.

* Style and format

* encode_plus now supports pretokenized inputs.

* Remove user warning about add_special_tokens when working on pretokenized inputs.

* Always go through the post processor.

* Added support for pretokenized input pairs on encode_plus

* Added is_pretokenized flag on encode_plus for clarity and improved error message on input TypeError.

* Added pretokenized inputs support on batch_encode_plus

* Update BatchEncoding methods name to match Encoding.

* Bump setup.py tokenizers dependency to 0.7.0rc1

* Remove unused parameters in BertTokenizerFast

* Make sure Roberta returns token_type_ids for unittests.

* Added missing typings

* Update add_tokens prototype to match tokenizers side and allow AddedToken

* Bumping tokenizers to 0.7.0rc2

* Added documentation for BatchEncoding

* Added (unused) is_pretokenized parameter on PreTrainedTokenizer encode_plus/batch_encode_plus methods.

* Added higher-level typing for tokenize / encode_plus / batch_encode_plus.

* Fix unittests failing because add_special_tokens was defined as a constructor parameter on Rust Tokenizers.

* Fix text-classification pipeline using the wrong tokenizer

* Make pipelines works with BatchEncoding

* Turn off add_special_tokens on tokenize by default.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove add_prefix_space from tokenize call in unittest.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Style and quality

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Correct message for batch_encode_plus none input exception.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix invalid list comprehension for offset_mapping overriding content every iteration.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* TransfoXL uses Strip normalizer.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bump tokenizers dependency to 0.7.0rc3

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Support AddedTokens for special_tokens and use left stripping on mask for Roberta.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* SpecialTokensMixin can use slots for faster access to underlying attributes.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove update_special_tokens from fast tokenizers.

* Ensure TransfoXL unittests are run only when torch is available.

* Style.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Style

* Style 🙏🙏

* Remove slots on SpecialTokensMixin, need deep dive into pickle protocol.

* Remove Roberta warning on __init__.

* Move documentation to Google style.

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-04-07 00:29:15 +02:00
e52d1258e0 Fix RoBERTa/XLNet Pad Token in run_multiple_choice.py (#3631)
* Fix RoBERTa/XLNet Pad Token in run_multiple_choice.py

`convert_examples_to_features` sets `pad_token=0` by default, which is correct for BERT but incorrect for RoBERTa (`pad_token=1`) and XLNet (`pad_token=5`). I think the other arguments to `convert_examples_to_features` are correct, but it might be helpful if someone more familiar with this part of the codebase checked.

* Simplifying change to match recent commits
2020-04-06 16:52:22 -04:00
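The takeaway generalizes; a sketch of why the pad id must come from the tokenizer rather than being hardcoded:

```python
from transformers import AutoTokenizer

for name in ("bert-base-uncased", "roberta-base", "xlnet-base-cased"):
    tokenizer = AutoTokenizer.from_pretrained(name)
    # BERT -> 0, RoBERTa -> 1, XLNet -> 5
    print(name, tokenizer.pad_token_id)
```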
0ac33ddd8d Create README.md 2020-04-06 16:35:29 -04:00
326e6ebae7 Add model card 2020-04-06 16:30:01 -04:00
43eca3f878 Add model card 2020-04-06 16:29:51 -04:00
6bec88ca42 Create README.md 2020-04-06 16:29:44 -04:00
769b60f935 Add model card (#3655)
* Add model card

* Fix model name in fine-tuning script
2020-04-06 16:29:36 -04:00
c4bcb01906 Create model card (#3654)
* Create model card

* Fix model name in fine-tuning script
2020-04-06 16:29:25 -04:00
6903a987b8 Create README.md 2020-04-06 16:29:02 -04:00
760872dbde Create README.md (#3662) 2020-04-06 16:27:50 -04:00
47e1334c0b Add model card for BERTeus (#3649)
* Add model card for BERTeus

* Update README
2020-04-06 16:21:25 -04:00
529534dc2f BioMed Roberta-Base (AllenAI) (#3643)
* added model card

* updated README

* updated README

* updated README

* added evals

* removed pico eval

* Tweaks

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-04-06 16:12:09 -04:00
261c4ff4e2 Update notebooks (#3620)
* Update notebooks

* From local to global link

* from local links to *actual* global links
2020-04-06 14:32:39 -04:00
39a34cc375 [model_cards] ELECTRA (w/ examples of usage)
Co-Authored-By: Kevin Clark <clarkkev@users.noreply.github.com>
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2020-04-06 11:43:33 -04:00
ea6dba2787 Re-pin isort 2020-04-06 10:09:54 -04:00
11c3257a18 unpin isort for pypi 2020-04-06 10:06:41 -04:00
36bffc81b3 Release: v2.8.0 2020-04-06 10:03:53 -04:00
2ee410560e [Generate, Test] Split generate test function into beam search, no beam search (#3601)
* split beam search and no beam search test

* fix test

* clean generate tests
2020-04-06 10:37:05 +02:00
1789c7daf1 fix argument order (#3637) 2020-04-05 12:33:41 +02:00
b809d2f073 Fix TF T5 docstring (#3636) 2020-04-05 12:23:09 +02:00
4ab8ab4f50 Adjust model card to reflect changes to vocabulary
(cherry picked from commit 8e25c4bf2838211378db4d93e7f9722386cc1a04)
2020-04-04 15:27:41 -04:00
ac40eed1a5 Create README.md
adding readme for ktrapeznikov/albert-xlarge-v2-squad-v2
2020-04-04 15:18:54 -04:00
fd9995ebc5 Create README.md 2020-04-04 15:18:31 -04:00
5d912e7ed4 Tweak typing for #3566 2020-04-04 15:04:03 -04:00
94eb68d742 Fix typo: weigths → weights 2020-04-04 15:03:26 -04:00
243e687be6 Create model card 2020-04-04 08:20:34 -04:00
3e4b4dd190 [model_cards] Link to ExBERT visualisation
Hat/tip @bhoov @HendrikStrobelt @sebastianGehrmann

Also cc @srush and @thomwolf
2020-04-03 20:03:29 -04:00
c6acd246ec Speed up GELU computation with torch.jit (#2988)
* Compile gelu_new with torchscript

* Compile _gelu_python with torchscript

* Wrap gelu_new with torch.jit for torch>=1.4
2020-04-03 15:20:21 -04:00
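For reference, a sketch of the scripted activation (this is the standard gelu_new tanh approximation):

```python
import math

import torch


@torch.jit.script
def gelu_new(x):
    # GPT-2-style tanh approximation of GELU, compiled with TorchScript.
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


print(gelu_new(torch.linspace(-2, 2, 5)))
```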
d5d7d88612 ELECTRA (#3257)
* Electra wip

* helpers

* Electra wip

* Electra v1

* ELECTRA may be saved/loaded

* Generator & Discriminator

* Embedding size instead of halving the hidden size

* ELECTRA Tokenizer

* Revert BERT helpers

* ELECTRA Conversion script

* Archive maps

* PyTorch tests

* Start fixing tests

* Tests pass

* Same configuration for both models

* Compatible with base + large

* Simplification + weight tying

* Archives

* Auto + Renaming to standard names

* ELECTRA is uncased

* Tests

* Slight API changes

* Update tests

* wip

* ElectraForTokenClassification

* temp

* Simpler arch + tests

Removed ElectraForPreTraining which will be in a script

* Conversion script

* Auto model

* Update links to S3

* Split ElectraForPreTraining and ElectraForTokenClassification

* Actually test PreTraining model

* Remove num_labels from configuration

* wip

* wip

* From discriminator and generator to electra

* Slight API changes

* Better naming

* TensorFlow ELECTRA tests

* Accurate conversion script

* Added to conversion script

* Fast ELECTRA tokenizer

* Style

* Add ELECTRA to README

* Modeling Pytorch Doc + Real style

* TF Docs

* Docs

* Correct links

* Correct model initialization

* random fixes

* style

* Addressing Patrick's and Sam's comments

* Correct links in docs
2020-04-03 14:10:54 -04:00
8594dd80dd BertJapaneseTokenizer accept options for mecab (#3566)
* BertJapaneseTokenizer accept options for mecab

* black

* fix mecab_option to Option[str]
2020-04-03 11:12:19 -04:00
216e167ce6 Added albert-base-bahasa-cased README and fixed tiny-bert-bahasa-cased README (#3613)
* add bert bahasa readme

* update readme

* update readme

* added xlnet

* added tiny-bert and fix xlnet readme

* added albert base
2020-04-03 09:28:43 -04:00
1ac6a246d8 Update README.md (#3604)
Update AutoModel & AutoTokenizer loading.
2020-04-03 09:28:25 -04:00
e91692f4a3 Update README.md (#3603) 2020-04-03 09:27:57 -04:00
8e287d507d corrected mistake in polish model cards (#3611)
* added model_cards for polish squad models

* corrected mistake in polish design cards

Co-authored-by: Henryk Borzymowski <henryk.borzymowski@pwc.com>
2020-04-03 09:07:15 -04:00
81484b447b Create README.md (#3568)
* Create README.md

* added meta block (language: german)

* Added additional information about test data
2020-04-02 21:48:31 -04:00
9f6349aba9 Create README.md 2020-04-02 21:43:12 -04:00
ddb1ce7418 added model_cards for polish squad models 2020-04-02 21:40:16 -04:00
f68d22850c delete bogus print statement (#3595) 2020-04-02 21:49:34 +02:00
c50aa67bff Resizing embedding matrix before sending it to the optimizer. (#3532)
* Resizing the embedding matrix after sending it to the optimizer prevents the newly resized matrix from being updated.

* Remove space for style matter
2020-04-02 15:00:05 -04:00
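A sketch of the correct ordering (model and token illustrative): resize first, then build the optimizer, so the optimizer holds references to the new embedding matrix.

```python
import torch
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")

tokenizer.add_tokens(["<new_token>"])
model.resize_token_embeddings(len(tokenizer))               # resize FIRST...
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)  # ...then create the optimizer
```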
1b10159950 Adding should_continue check for retraining (#3509) 2020-04-02 14:07:08 -04:00
390c128592 [Encoder-Decoder] Force models outputs to always have batch_size as their first dim (#3536)
* solve conflicts

* improve comments
2020-04-02 15:18:33 +02:00
ab5d06a094 [T5, examples] replace heavy t5 models with tiny random models (#3556)
* replace heavy t5 models with tiny random models as was done by sshleifer

* fix isort
2020-04-02 12:34:05 +02:00
a4ee4da18a [T5, TF 2.2] change tf t5 argument naming (#3547)
* change tf t5 argument naming for TF 2.2

* correct bug in testing
2020-04-01 22:04:20 +02:00
06dd597552 fix bug in warnings T5 pipelines (#3545) 2020-04-01 21:59:12 +02:00
9de9ceb6c5 Correct output shape for Bert NSP models in docs (#3482) 2020-04-01 15:04:38 -04:00
b815edf69f [T5, Testst] Add extensive hard-coded integration tests and make sure PT and TF give equal results (#3550)
* add some t5 integration tests

* finish summarization and translation integration tests for T5 - results loook good

* add tf test

* fix == vs is bug

* fix tf beam search error and make tf t5 tests pass
2020-04-01 18:01:33 +02:00
8538ce9044 Add tiny-bert-bahasa-cased model card (#3567)
* add bert bahasa readme

* update readme

* update readme

* added xlnet

* added tiny-bert and fix xlnet readme
2020-04-01 07:15:00 -04:00
c1a6252be1 Create model card (#3557)
Create model card for: distilbert-multi-finetuned-for-xqua-on-tydiqa
2020-04-01 07:14:23 -04:00
50e15c825c Tokenizers: Start cleaning examples a little (#3455)
* Start cleaning examples

* Fixup
2020-04-01 07:13:40 -04:00
b38d552a92 [Generate] Add bad words list argument to the generate function (#3367)
* add bad words list

* make style

* add bad_words_tokens

* make style

* better naming

* make style

* fix typo
2020-03-31 18:42:31 +02:00
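Usage sketch for the new argument (word list illustrative; the kwarg is bad_words_ids, each entry a token-id sequence):

```python
from transformers import AutoModelWithLMHead, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelWithLMHead.from_pretrained("gpt2")

bad_words_ids = [tokenizer.encode(w, add_prefix_space=True) for w in ["stupid", "idiot"]]
input_ids = tokenizer.encode("The meeting was", return_tensors="pt")

# generate() now refuses to produce any of the banned token sequences.
output = model.generate(input_ids, max_length=20, bad_words_ids=bad_words_ids)
print(tokenizer.decode(output[0]))
```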
ae6834e028 [Examples] Clean summarization and translation example testing files for T5 and Bart (#3514)
* fix conflicts

* add model size argument to summarization

* correct wrong import

* fix isort

* correct imports

* other isort make style

* make style
2020-03-31 17:54:13 +02:00
0373b60c4c Update README.md (#3552)
- Show that the last uploaded version was trained on more data (custom_license files)
2020-03-31 10:40:34 -04:00
83d1fbcff6 [Docs] Add usage examples for translation and summarization (#3538) 2020-03-31 09:36:03 -04:00
55bcae7f25 remove useless and confusing lm_labels line (#3531) 2020-03-31 09:32:25 -04:00
42e1e3c67f Update usage doc regarding generate fn (#3504) 2020-03-31 09:31:46 -04:00
57b0fab692 Add better explanation to check docs locally. (#3459) 2020-03-31 09:30:17 -04:00
a8d4dff0a1 Update README.md (#3470)
Fix typo
2020-03-31 08:01:09 -04:00
4a5663568f Create card for the model: GPT-2-finetuned-covid-bio-medrxiv (#3453) 2020-03-31 08:01:03 -04:00
bbedb59675 Create README.md (#3393)
* Create README.md

* Update README.md
2020-03-31 08:00:35 -04:00
c2cf192943 Add link to 16 POS tags model (#3465) 2020-03-31 08:00:00 -04:00
c82ef72158 Added CovidBERT-NLI model card (#3477) 2020-03-31 07:59:49 -04:00
b48a1f08c1 Add text shown in example of usage (#3464) 2020-03-31 07:59:36 -04:00
99833a9cbf Create model card (#3487) 2020-03-31 07:59:22 -04:00
ebceeeacda Add electra and alectra model cards (#3524) 2020-03-31 07:58:48 -04:00
a6c4ee27fd Add model cards (#3537)
* feat: add model card bert-imdb

* feat: add model card gpt2-imdb-pos

* feat: add model card gpt2-imdb
2020-03-31 07:54:45 -04:00
e5c393dceb [Bug fix] Using loaded checkpoint with --do_predict (instead of… (#3437)
* Using loaded checkpoint with --do_predict

Without this fix, I'm getting near-random validation performance for a trained model, and the validation performance differs per validation run. I think this happens since the `model` variable isn't set with the loaded checkpoint, so I'm using a randomly initialized model. Looking at the model activations, they differ each time I run evaluation (but they don't with this fix).

* Update checkpoint loading

* Fixing model loading
2020-03-30 17:06:08 -04:00
8deff3acf2 [bart-tiny-random] Put a 5MB model on S3 to allow faster exampl… (#3488) 2020-03-30 12:28:27 -04:00
1f72865726 [BART] Update encoder and decoder on set_input_embedding (#3501)
Co-authored-by: Ioannis Douratsos <ioannisd@amazon.com>
2020-03-30 12:20:37 -04:00
cc598b312b [InputExample] Unfreeze for now, cf. #3423 2020-03-30 10:41:49 -04:00
d38bbb225f Update the NER TF script (#3511)
* Update the NER TF script to remove the softmax and make the pad token label id to -1

* Reformat the quality and style

Co-authored-by: Julien Plu <julien.plu@adevinta.com>
2020-03-30 09:50:12 -04:00
eff757f2e3 Re-pin isort version 2020-03-30 09:00:47 -04:00
a009d751c2 Un-pin isort for v2.7.0 pypi 2020-03-30 08:55:10 -04:00
6f5a12a583 Release: v2.7.0 2020-03-30 08:49:24 -04:00
296252c49e fix lm lables in docstring (#3529) 2020-03-30 14:26:24 +02:00
75ec6c9e3a [T5] make decoder input ids optional for t5 training (#3521)
* make decoder input ids optional for t5 training

* lm_lables should not be shifted in t5

* add tests

* finish shift right functionality for PT T5

* move shift right to correct class

* cleaner code

* replace -100 values with pad token id

* add assert statement

* remove unnecessary for loop

* make style
2020-03-30 13:45:26 +02:00
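A sketch of the shift-right logic this PR moves inside the model (names illustrative; with it, users pass only the labels and the decoder inputs are derived automatically):

```python
import torch


def shift_right(labels: torch.Tensor, decoder_start_token_id: int, pad_token_id: int):
    # Prepend the decoder start token and drop the last label, so the decoder
    # predicts token t from tokens < t; masked positions (-100) become pad ids.
    shifted = labels.new_zeros(labels.shape)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted


labels = torch.tensor([[42, 43, -100]])
print(shift_right(labels, decoder_start_token_id=0, pad_token_id=0))  # tensor([[ 0, 42, 43]])
```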
5b44e0a31b [T5] Add training documenation (#3507)
* Add clear description of how to train T5

* correct docstring in T5

* correct typo

* correct docstring format

* update t5 model docs

* implement collins feedback

* fix typo and add more explanation for sentinal tokens

* delete unnecessary todos
2020-03-30 13:35:53 +02:00
33ef7002e1 [Docs] examples/summarization/bart: Simplify CNN/DM preprocessi… (#3516) 2020-03-29 13:25:42 -04:00
f6a23d1911 [BART] add bart-large-xsum weights (#3422) 2020-03-29 10:51:13 -04:00
601ac5b1dc [model_cards]: use MIT license for all dbmdz models 2020-03-27 18:06:25 -04:00
17dceae7a1 Fix circle ci flaky fail of wmt example (#3485)
* force bleu

* fix wrong file name

* rename file

* different filenames for each example test

* test files should clean up after themselves

* test files should clean up after themselves

* do not force bleu

* correct typo

* fix isort
2020-03-27 13:01:28 -04:00
00ea100e96 add summarization and translation to notebook (#3478) 2020-03-27 11:05:37 -04:00
b08259a120 run_ner.py / bert-base-multilingual-cased can output empty tokens (#2991)
* Use tokenizer.num_added_tokens to count number of added special_tokens instead of hardcoded numbers.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* run_ner.py - Do not add a label to the labels_ids if word_tokens is empty.

This can happen when using bert-base-multilingual-cased with an input containing a lone space.
In this case, the tokenizer will output just an empty word_tokens, leading to inconsistent behavior
where labels_ids ends up with one more token than the tokens vector.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-27 10:59:55 -04:00
f4f4946836 Rename t5-large to t5-base in README.md 2020-03-27 15:57:58 +01:00
fa9af2468a Add T5 to docs (#3461)
* add t5 docs basis

* improve docs

* add t5 docs

* improve t5 docstring

* add t5 tokenizer docstring

* finish docstring

* make style

* add pretrained models

* correct typo

* make examples work

* finalize docs
2020-03-27 10:57:16 -04:00
ff80b73157 Add option to choose T5 model size. (#3480)
T5-small in test; isort.
2020-03-27 15:56:59 +01:00
e2c05f06ef Correct indentation in docstring
For some reason Sphinx extremely dislikes this and crashes.
2020-03-27 09:28:52 -04:00
3ee431dd4c [Bart/Memory] Two separate, smaller decoder attention masks (#3371) 2020-03-26 21:34:15 -04:00
53fe733805 Model Cards: Fix grammar error (#3467) 2020-03-26 21:33:33 -04:00
c10decf7a0 [Bart: example] drop columns that are exclusively pad_token_id… (#3400)
* trim seq_len below 1024 if there are columns full of pad_token_id
* Centralize trim_batch so SummarizationDataset can use it too
2020-03-26 19:33:54 -04:00
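A sketch of the centralized helper (close to the behavior described above): drop every column that is padding across the whole batch.

```python
import torch


def trim_batch(input_ids, pad_token_id, attention_mask=None):
    # Keep only the columns where at least one row has a non-pad token.
    keep = input_ids.ne(pad_token_id).any(dim=0)
    if attention_mask is None:
        return input_ids[:, keep]
    return input_ids[:, keep], attention_mask[:, keep]


batch = torch.tensor([[5, 6, 0, 0], [7, 0, 0, 0]])
print(trim_batch(batch, pad_token_id=0))  # tensor([[5, 6], [7, 0]])
```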
63f4d8cad0 [Bart/Memory] SelfAttention only returns weights if config.outp… (#3369) 2020-03-26 18:42:39 -04:00
2b2a2f8df2 [Bart] Fix: put dummy_inputs on correct device (#3398)
* Dummy inputs to model.device

* Move self.device to ModuleUtilsMixin
2020-03-26 18:42:09 -04:00
1a5aefc95c [Seq2Seq Generation] Call encoder before expanding input_ids (#3370) 2020-03-26 18:41:19 -04:00
39371ee454 [Bart/Memory] don't create lm_head (#3323)
* delete lm_head, skips weight tying
* Fixed s3
2020-03-26 18:40:39 -04:00
5ad2ea06af Add wmt translation example (#3428)
* add translation example

* make style

* adapt docstring

* add gpu device as input for example

* small renaming

* better README
2020-03-26 19:07:59 +01:00
b4fb94fe6d revert unpin isort commit 2020-03-26 13:19:18 -04:00
e703e923ca Add t5 summarization example (#3411)
* rebase to master

* change tf to pytorch

* change to pytorch

* small fix

* renaming

* add gpu training possibility

* renaming

* improve README

* incorporate collins feedback

* better Readme

* better README.md
2020-03-26 18:17:55 +01:00
1a6c546c6f Add missing token classification for XLM (#3277)
* Add the missing token classification for XLM

* fix styling

* Add XLMForTokenClassification to AutoModelForTokenClassification class

* Fix docstring typo for non-existing class

* Add the missing token classification for XLM

* fix styling

* fix styling

* Add XLMForTokenClassification to AutoModelForTokenClassification class

* Fix docstring typo for non-existing class

* Add missing description for AlbertForTokenClassification

* fix styling

* Add missing docstring for AlBert

* Slow tests should be slow

Co-authored-by: Sakares Saengkaew <s.sakares@gmail.com>
Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-03-26 10:22:13 -04:00
311970546f rename string in pipeline 2020-03-26 14:59:49 +01:00
7420a6a9cc Create card for model GPT-2-finetuned-CORD19 2020-03-26 09:10:09 -04:00
022e8fab97 Adds translation pipeline (#3419)
* fix merge conflicts

* add t5 summarization example

* change parameters for t5 summarization

* make style

* add first code snippet for translation

* only add prefixes

* add prefix patterns

* make style

* renaming

* fix conflicts

* remove unused patterns

* solve conflicts

* fix merge conflicts

* remove translation example

* remove summarization example

* make sure tensors are in numpy for float comparison

* re-add t5 config

* fix t5 import config typo

* make style

* remove unused numpy statements

* update docstring

* import translation pipeline
2020-03-26 13:50:58 +01:00
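Usage sketch (input sentence illustrative; the task name encodes a fixed language pair, matching the T5 task prefixes):

```python
from transformers import pipeline

translator = pipeline("translation_en_to_de")
print(translator("The house is wonderful.", max_length=40))
```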
3c5c567507 Update model card huseinzol05/bert-base-bahasa-cased (#3425)
* add bert bahasa readme

* update readme

* update readme

* added xlnet
2020-03-26 07:50:27 -04:00
9c683ef01e Add t5 to pipeline(task='summarization') (#3413)
* solve conflicts

* move warnings below

* incorporate changes

* add pad_to_max_length to pipelines

* add bug fix for T5 beam search

* add prefix patterns

* make style

* fix conflicts

* adapt pipelines for task specific parameters

* improve docstring

* remove unused patterns
2020-03-26 11:03:13 +01:00
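And the summarization side, selecting T5 explicitly (model size and text illustrative):

```python
from transformers import pipeline

summarizer = pipeline("summarization", model="t5-small", tokenizer="t5-small")
text = (
    "The tower is 324 metres tall, about the same height as an 81-storey "
    "building, and was the tallest man-made structure in the world for 41 years."
)
print(summarizer(text, min_length=5, max_length=30))
```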
ffcffebe85 Force the return of token type IDs (#3439) 2020-03-26 09:41:36 +01:00
010e0460b2 Updated/added model cards (#3435) 2020-03-25 16:40:03 -04:00
ffa17fe322 Extend config with task specific configs. (#3433)
* add new default configs

* change prefix default to None
2020-03-25 21:32:04 +01:00
83272a3853 Experiment w/ dataclasses (including Py36) (#3423)
* [ci] Also run test_examples in py37

(will revert at the end of the experiment)

* InputExample: use immutable dataclass

* [deps] Install dataclasses for Py<3.7

* [skip ci] Revert "[ci] Also run test_examples in py37"

This reverts commit d29afd9959786b77759b0b8fa4e6b4335b952015.
2020-03-25 11:10:20 -04:00
ccbe839ee0 Added BioBERT-NLI model card (#3421) 2020-03-24 21:15:55 -04:00
3d76df3a12 BART for summarization training with CNN/DM using pytorch-lightning 2020-03-24 21:00:24 -04:00
eaabaaf750 [run_language_modeling] Fix: initialize a new model from a config object 2020-03-24 17:56:40 -04:00
f8823bad9a Expose missing mappings (see #3415) 2020-03-24 17:46:25 -04:00
d0c36a7b72 [ci] Partial revert of 18eec3a9847 due to fbc5bf10cfe 2020-03-24 12:10:43 -04:00
fbc5bf10cf v2.6.0 release: isort un-pinned 2020-03-24 11:52:02 -04:00
b88bda6af3 Add right model and tokenizer path in example 2020-03-24 11:30:12 -04:00
b31ef225cf [model_cards] 🇹🇷 Add new (uncased, 128k) BERTurk model 2020-03-24 11:29:06 -04:00
b4009cb001 [model_cards] 🇹🇷 Add new (cased, 128k) BERTurk model 2020-03-24 11:29:06 -04:00
d3283490ef [model_cards] 🇹🇷 Add new (uncased) BERTurk model 2020-03-24 11:29:06 -04:00
e279a312d6 Model cards for CS224n SQuAD2.0 models (#3406)
* Model cards for CS224n SQuAD2.0 models

* consistent spacing
2020-03-24 11:28:33 -04:00
7372e62b2c Added precisions in SciBERT-NLI model card (#3410) 2020-03-24 11:01:56 -04:00
471cce24b3 Release: v2.6.0 2020-03-24 10:37:32 -04:00
e392ba6938 Add camembert integration tests (#3375)
* add integration tests for camembert

* use jplu/tf-camembert for the moment

* make style
2020-03-24 10:18:37 +01:00
a8e3336a85 [examples] Use AutoModels in more examples 2020-03-23 20:11:14 -04:00
ec6766a363 [deps] scikit-learn's transient issue was fixed 2020-03-23 18:38:09 -04:00
f7dcf8fcea [BertAbs] Move files around for more consistent naming 2020-03-23 13:58:49 -04:00
e25c4f4027 [ALBERT] move things around for more consistent naming
see #3359

cc @lysandrejik
2020-03-23 13:58:21 -04:00
85b324bee5 Add comparison table with older brother in family 2020-03-23 12:11:20 -04:00
b7aa077a63 Create card for the model 2020-03-23 12:10:41 -04:00
f740177c87 Add comparison table with new models 2020-03-23 12:10:23 -04:00
e52482909b Correct order for dev/quality dependencies
cc @julien-c
2020-03-23 12:01:23 -04:00
28424906c2 Added scibert-nli model card 2020-03-23 11:55:41 -04:00
18eec3a984 [ci] simpler way to load correct version of isort
hat/tip @bramvanroy
2020-03-23 10:03:22 -04:00
cf72479bf1 One last reorder of {scheduler,optimizer}.step() 2020-03-20 18:05:50 -04:00
634bf6cf7e fixes lr_scheduler warning
For more details, see https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate
2020-03-20 18:03:50 -04:00
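A sketch of the order the examples settle on (dummy model and hyper-parameters illustrative): since PyTorch 1.1, optimizer.step() must precede scheduler.step(), otherwise this warning is raised.

```python
import torch
from transformers import get_linear_schedule_with_warmup

model = torch.nn.Linear(4, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=100)

for step in range(100):
    loss = model(torch.randn(8, 4)).sum()
    loss.backward()
    optimizer.step()       # update weights first...
    scheduler.step()       # ...then advance the learning-rate schedule
    optimizer.zero_grad()
```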
265709f5cd New model, new model cards 2020-03-20 18:01:01 -04:00
115abd2166 Handle pinned version of isort
The CONTRIBUTING file pins to a specific version of isort, so we might as well install that in `dev`. This makes it easier for contributors so they don't have to manually install the specific commit.
2020-03-20 18:00:04 -04:00
95e00d0808 Clean special token init in modeling_....py (#3264)
* make style

* fix conflicts
2020-03-20 21:41:04 +01:00
8becb73293 removing torch.cuda.empty_cache() from TF function (#3267)
torch.cuda.empty_cache() was being called from a TF function (even when torch is unavailable)
not sure any replacement is needed if TF OOMs
2020-03-19 23:25:30 +01:00
ecfd336318 Simpler Error message when loading config/model with .from_pretrained() (#3341) 2020-03-19 23:23:03 +01:00
8eeefcb576 Update 01-training-tokenizers.ipynb (typo issue) (#3343)
I found there are two grammar errors or typo issues in the explanation of the encoding properties.

The original sentences:
If your was made of multiple \"parts\" such as (question, context), then this would be a vector with for each token the segment it belongs to
If your has been truncated into multiple subparts because of a length limit (for BERT for example the sequence length is limited to 512), this will contain all the remaining overflowing parts.

I think "input" should be inserted after the phrase "If your".
2020-03-19 23:21:49 +01:00
bbf26c4e61 Support T5 Generation (#3228)
* fix conflicts

* update bart max length test

* correct spelling mistakes

* implemented model specific encode function

* fix merge conflicts

* better naming

* save intermediate state -> need to rethink structure a bit

* leave tf problem as it is for now

* current version

* add layers.pop

* remove ipdb

* make style

* clean return cut decoding

* remove ipdbs

* Fix restoring layers in the decoder that don't exist.

* push good intermediate solution for now

* fix conflicts

* always good to refuse to merge conflicts when rebasing

* fix small bug

* improve function calls

* remove unused file

* add correct scope behavior for t5_generate

Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
2020-03-19 23:18:23 +01:00
656e1386a2 Fix #3305: run_ner only possible on ModelForTokenClassification models 2020-03-19 16:41:28 -04:00
0c44b11917 add bert bahasa readme 2020-03-19 15:08:19 -04:00
e99af3b17b Create model card for bert-small-finetuned-squadv2 2020-03-19 15:07:55 -04:00
39db055268 Merge pull request #3348 from mrm8488/patch-28
Create card for BERT-Mini finetuned on SQuAD v2
2020-03-19 15:07:39 -04:00
dedc7a8fdb Create card for BERT-Tiny fine-tuned on SQuAD v2
- Only 17MB of Model weights!!
2020-03-19 15:07:22 -04:00
676adf8625 Created card for spanbert-finetuned-squadv1 2020-03-19 15:06:35 -04:00
11d8bcc9d7 Add model cards for FinBERT. (#3331)
* Add a model card for FinBERT

This is a copy of https://github.com/TurkuNLP/FinBERT/blob/master/README.md.

* Added a file for uncased.

* Add metadata for cased.

* Added metadata for uncased.
2020-03-19 15:06:01 -04:00
f049be7ad4 Export ALBERT main layer in TensorFlow (#3354) 2020-03-19 13:53:05 -04:00
3bedfd3347 Fix wrong link for the notebook file (#3344)
For the tutorial of "How to generate text", the URL link was wrong (it was linked to the tutorial of "How to train a language model").

I fixed the URL.
2020-03-19 17:22:47 +01:00
b2c2c31c60 Minor Bug Fix for Running Roberta on Glue (#3240)
* added return_token_type_ids argument for tokenizers which do not generate return_type_ids by default

* fixed styling

* Style

Co-authored-by: LysandreJik <lysandre.debut@reseau.eseo.fr>
2020-03-19 12:08:31 -04:00
4e4403c9b4 [BART] torch 1.0 compatibility (#3322)
* config.activation_function
2020-03-19 11:56:54 -04:00
c44a17db1b [FIX] not training when epoch is small (#3006)
* solving bug where for small epochs and large gradient_accumulation_steps we never train

* black formatting

* no need to change these files
2020-03-19 11:21:21 -04:00
ad7233fc01 [BART] cleanup: remove redundant kwargs, improve docstrings (#3319) 2020-03-19 11:16:51 -04:00
cd21d8bc00 Typo in warning message (#3219)
`T5Tokenizer` instead of `XLNetTokenizer`
2020-03-19 09:49:25 -04:00
8d3e218ea6 fix typo in docstring demonstrating usage (#3213) 2020-03-19 09:47:54 -04:00
cec3cdda15 Fix input ids can be none attn mask (#3345)
* fix issue 3289

* fix attention mask if input_ids None behavior
2020-03-19 09:55:17 +01:00
f6d813aaaa Create README.md 2020-03-18 23:45:02 -04:00
939328111b Create README.md
roberta_chinese_base card
2020-03-18 23:44:12 -04:00
29442d2edf Create README.md
albert_chinese_tiny card
2020-03-18 23:43:49 -04:00
20139b7c8d Added model cards for SciBERT models uploaded under AllenAI org (#3330)
* Create README.md

* model card

* add model card for cased
2020-03-18 15:45:11 -04:00
cae334c43c Improve fill-mask pipeline example in 03-pipelines notebook.
Remove hardcoded mask_token and use the value provided by the tokenizer.
2020-03-18 17:11:42 +01:00
4b1970bb4c Create README.md 2020-03-18 11:37:17 -04:00
d6afbd323d XLM-R Tokenizer now passes common tests + Integration tests (#3198)
* XLM-R now passes common tests + Integration tests

* Correct mask index

* Model input names

* Style

* Remove text preprocessing

* Unneccessary import
2020-03-18 09:52:49 -04:00
292186a3e7 Adding LM Head to Transfo-XL and first step to fixing problem with Adaptive Embeddings in TransfoXL (#3286)
* first commit

* work in progress

* make language generation task pass

* update to working version for LM

* delete print

* remove dead code

* make style
2020-03-18 09:24:27 -04:00
efdb46b6e2 add link to blog post (#3326) 2020-03-18 13:24:28 +01:00
ddb10c6447 improve docstring (#3327) 2020-03-18 13:24:09 +01:00
d7f98cd3ef Init card for model 2020-03-18 07:55:27 -04:00
38a555a83c Add Summarization to Pipelines (#3128)
* passing

* Undo stupid chg

* docs

* undo rename

* delete-cruft

* only import if you have torch

* Dont rely on dict ordering

* Fix dict ordering upstream

* docstring link

* docstring link

* remove trailing comma for 3.5 compat

* new name

* delegate kwarging

* Update kwargs
2020-03-17 18:04:21 -04:00
2b60a26b46 Update examples/ner/run_ner.py to use AutoModel (#3305)
* Update examples/ner/run_ner.py to use AutoModel

* Fix missing code and apply `make style` command
2020-03-17 12:30:10 -04:00
e41212c715 Create model card for CodeBERTaPy (#3309) 2020-03-17 12:29:11 -04:00
0f1bc0d68e [model_cards] Add google thumbnail 2020-03-17 12:02:51 -04:00
930c9412b4 [WIP] Lightning glue example (#3290)
*  Alter base pl transformer to use automodels

* 🐛 Add batch size env variable to function call

* 💄 Apply black code style from Makefile

* 🚚 Move lightning base out of ner directory

*  Add lightning glue example

* 💄 self

* move _feature_file to base class

*  Move eval logging to custom callback

* 💄 Apply black code style

* 🐛 Add parent to pythonpath, remove copy command

* 🐛 Add missing max_length kwarg
2020-03-17 11:46:42 -04:00
e8f44af5bf [generate] do_sample default back to False (#3298)
* change do_samples back

* None better default as boolean

* adapt do_sample to True in test example

* make style
2020-03-17 10:52:37 -04:00
2187c49f5c CPU/GPU memory benchmarking utilities - Remove support for python 3.5 (now only 3.6+) (#3186)
* memory benchmark rss

* have both forward pass and line-by-line mem tracing

* cleaned up tracing

* refactored and cleaning up API

* no f-strings yet...

* add GPU mem logging

* fix GPU memory monitoring

* style and quality

* clean up and doc

* update with comments

* Switching to python 3.6+

* fix quality
2020-03-17 10:17:11 -04:00
bd3feddf67 Create README.md (#3306)
* Create README.md

* Updated README.md
2020-03-17 09:05:11 -04:00
68ef0a111f [model_cards] Symlink all Google AI's BERT Miniatures to source model card 2020-03-16 23:37:42 -04:00
b2c1a447fe [BART] Delete redundant unit test (#3302) 2020-03-16 23:09:10 -04:00
b2028cc26b Add model card for Google AI's BERT Miniatures (#3301)
This model card is intended to be shared among all models under google/bert_uncased_*
(We'll need some support from HuggingFace to get this card cross-linked from all models)
2020-03-16 21:51:46 -04:00
4759176313 add camembert for Question answering for examples 2020-03-16 14:42:11 -04:00
11573231c6 [BART] generation_mode as a kwarg not a class attribute (#3278) 2020-03-16 12:47:53 -04:00
de697935a2 Create model card for spanbert-finetuned-squadv2 2020-03-16 12:32:46 -04:00
3ddd2029bc Create CodeBERTaJS model card 2020-03-16 12:23:01 -04:00
879e1d3234 Add TF2 version of FlauBERT (#2700)
* Add TF2 version of FlauBERT

* Add TF2 version of FlauBERT

* Add documentation

* Apply style and quality

* Apply style once again

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2020-03-16 09:29:21 -04:00
af471ce5e8 Improved Error message when loading config/model with .from_pretrained() (#3247)
* better error message

* better error message

* update to model identifier instead of url

* update to model identifier instead of ur
2020-03-16 09:48:30 +01:00
5ea8ba67b4 [BART] Remove unused kwargs (#3279)
* Remove unused kwargs
* dont call forward in tests
2020-03-15 23:00:44 -04:00
3814e167d9 Merge pull request #3225 from patrickvonplaten/finalize_merge_bart_generate_into_default_generate
Complete merge Seq-2-Seq generation into default generation
2020-03-14 15:08:59 +01:00
2bd79e23de [BART] FP16 testing fixes (#3266) 2020-03-13 19:48:26 -04:00
8320feec09 [model_cards] CodeBERTa 2020-03-13 18:28:09 -04:00
ab756f713c add gpt2-xl for tf 2020-03-13 16:40:35 -04:00
4f75d380a4 make style 2020-03-13 16:35:52 +01:00
c2ee3840ae update file to new starting token logic 2020-03-13 16:34:44 +01:00
cc4c37952a Create camembert-base-README.md 2020-03-13 09:35:53 -04:00
afea70c01c Bump psutil from 5.6.3 to 5.6.6 in /examples/distillation
Bumps [psutil](https://github.com/giampaolo/psutil) from 5.6.3 to 5.6.6.
- [Release notes](https://github.com/giampaolo/psutil/releases)
- [Changelog](https://github.com/giampaolo/psutil/blob/master/HISTORY.rst)
- [Commits](https://github.com/giampaolo/psutil/compare/release-5.6.3...release-5.6.6)

Signed-off-by: dependabot[bot] <support@github.com>
2020-03-12 21:14:56 -04:00
087465b943 add BART to README (#3255) 2020-03-12 19:38:05 -04:00
6a82f774f2 fix typo 2020-03-12 21:10:51 +01:00
f1c71da115 fix eos_token_ids in test 2020-03-12 21:00:54 +01:00
6047f46b19 re-add eos token to get good bart results 2020-03-12 20:17:50 +01:00
c11160114a small clean-up 2020-03-12 20:02:35 +01:00
2e81b9d8d7 Bart: update example for #3140 compatibility (#3233)
* Update bart example docs
2020-03-12 10:36:37 -04:00
72768b6b9c [model_cards] polbert: simplify usage example with pipelines
Co-Authored-By: Darek Kłeczek <darek.kleczek@gmail.com>
2020-03-12 10:05:40 -04:00
a4c75f1492 [ci] last resort 2020-03-11 19:11:19 -04:00
824e320d96 [ci] Fixup c6cf925 2020-03-11 18:52:10 -04:00
c6cf925ff8 [ci] last resort
while looking for fix to https://twitter.com/julien_c/status/1237864185821708291
2020-03-11 18:49:19 -04:00
14e455b716 [model_cards] 🇹🇷 Add new (cased) DistilBERTurk model 2020-03-11 18:40:38 -04:00
f65f74bbce Create README.md (#3230) 2020-03-11 12:37:00 -04:00
324292cfc7 Add Bio+ Clinical BERT model card (#3229)
* Create README.md

* Update README.md
2020-03-11 12:36:33 -04:00
e43afb1bb8 [model_cards] DialoGPT: How to use + thumbnail + conversational tag
cc @dreasysnail

Co-Authored-By: Patrick von Platen <patrick.v.platen@gmail.com>
2020-03-11 11:36:47 -04:00
5085df995f [model_cards] PolBERT tweaks 2020-03-11 09:29:22 -04:00
19a63d8245 Create Readme.md model card (#3221) 2020-03-11 09:12:48 -04:00
dc848c2994 Create README.md 2020-03-11 09:10:57 -04:00
6ad221daf3 Create README.md 2020-03-11 09:10:23 -04:00
735180aa14 Create README.md 2020-03-11 09:10:13 -04:00
6c61c0801e Create README.md 2020-03-11 09:03:30 -04:00
235616686a Update README.md
- Update title
- Remove metrics
2020-03-11 09:03:20 -04:00
5bb00c817f Update README.md
Change title to clarify the model description
2020-03-11 09:03:07 -04:00
601e424750 Update README.md 2020-03-11 09:02:56 -04:00
1b9e765b21 Update README.md
- Remove metrics until tested on other xquad benchmarks
2020-03-11 09:02:18 -04:00
db29ffc978 Merge pull request #3140 from patrickvonplaten/merge_bart_generate_into_default_generate
Merge bart generate into default generate
2020-03-11 13:21:53 +01:00
ac303eae46 fix problem with half 2020-03-11 12:24:30 +01:00
bc9d5d917c make all tensors half precision 2020-03-11 12:15:38 +01:00
a332cc9f7f finalize generation merge 2020-03-11 11:53:36 +01:00
1ba21f96ca fix bug in tf no_repeat_ngram_size 2020-03-11 11:06:56 +01:00
d997ac7810 fix typo 2020-03-11 11:06:56 +01:00
7351a8dbaf re-add scoring filtering 2020-03-11 11:06:56 +01:00
9b8ee8cea0 delete print and make style 2020-03-11 11:06:56 +01:00
ca1330f0b2 do not mess with the negative sign 2020-03-11 11:06:56 +01:00
10989715d0 rename variable 2020-03-11 11:06:56 +01:00
cf06290565 remove ipdb 2020-03-11 11:06:56 +01:00
374deef48d fixed typo 2020-03-11 11:06:56 +01:00
a2c8e516c2 fix torch to tf translation 2020-03-11 11:06:56 +01:00
ca2047bc35 refactor variable naming and improve tf generate in line with torch generate 2020-03-11 11:06:56 +01:00
41b437ea3a add draft version of proposed changes for ROUGE score 2020-03-11 11:06:56 +01:00
a5751f7578 fix bug with attention_mask as optional input argument 2020-03-11 11:06:56 +01:00
629aac92ec do not allow do_sample and weird force bos token things 2020-03-11 11:06:56 +01:00
d880a5fbde finalized PR 2020-03-11 11:06:56 +01:00
2acfe63964 best current version and make style 2020-03-11 11:06:56 +01:00
c62444da39 fix conflicts 2020-03-11 11:06:56 +01:00
77e6775065 add current changes 2020-03-11 11:06:56 +01:00
333affcb81 add current changes 2020-03-11 11:06:56 +01:00
421216997b comment out stuff 2020-03-11 11:06:56 +01:00
7a11e925cf work in progress 2020-03-11 11:06:56 +01:00
5b3000d933 renamed min_len to min_length 2020-03-11 11:06:56 +01:00
aceb3fbaf4 only do output_past=True for language generation in bart 2020-03-11 11:06:56 +01:00
7cba11fb9b better naming 2020-03-11 11:06:56 +01:00
ff648221bd fix conflicts 2020-03-11 11:06:56 +01:00
c0d9dd3ba9 refactored code a bit and made more generic 2020-03-11 11:06:56 +01:00
d8e2b3c547 fix conflicts 2020-03-11 11:06:56 +01:00
d6de6423ba [doc] --organization tweak
Co-Authored-By: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-03-10 16:52:44 -04:00
0e56dc3078 [doc] Document the new --organization flag of CLI 2020-03-10 16:42:01 -04:00
270dfa1c8e [dialogpt] conversion script
Reference: https://github.com/huggingface/transformers/pull/1778#issuecomment-567675530

cc @patrickvonplaten and @dreasysnail
2020-03-10 15:09:29 -04:00
2661d80687 Update README.md
- Clarify that the model is not trained on the evaluation dataset
2020-03-10 10:59:34 -04:00
6a13448ad2 Update README.md
- Fix path of tokenizer
- Clarify that the model is not trained on the evaluation set
2020-03-10 10:59:16 -04:00
e57533cca5 Create README.md 2020-03-10 10:58:13 -04:00
31f2437f07 Merge pull request #3191 from patrickvonplaten/add_integration_tests_lm_generate_torch_tf
Add integration tests lm generate torch tf
2020-03-10 11:29:17 +01:00
5ca356a464 NER - pl example (#3180)
* 1. seqeval required by ner pl example. install from examples/requirements. 2. unrecognized arguments: save_steps

* pl checkpoint callback filenotfound error: make directory and pass

* #3159 pl checkpoint path difference

* 1. Updated Readme for pl 2. pl script now also correctly displays logs 3. pass gpu ids compared to number of gpus

* Updated results in readme

* 1. updated readme 2. removing deprecated pl methods 3. finalizing scripts

* comment length check

* using deprecated validation_end for stable results

* style related changes
2020-03-09 20:43:38 -04:00
f51ba059b9 Model card for albert-base-v2-squad2 2020-03-09 19:37:15 -04:00
cbf8f5d32b [model upload] Support for organizations 2020-03-09 17:33:57 -04:00
525b6b1c54 TFQA pipeline marked as slow test 2020-03-09 16:52:30 -04:00
3aca02efb3 Bart example: model.to(device) (#3194) 2020-03-09 15:09:35 -04:00
5164ea91a7 Skipping outputs (#3116)
* Minimal example

* Proposal 2

* Proposal 2 for fast tokenizers

* Typings

* Docs

* Revert "Docs" for easier review

This reverts commit eaf0f97062e809887704a542144c537f769d5223.

* Remove unnecessary assignments

* Tests

* Fix faulty type

* Remove prints

* return_outputs -> model_input_names

* Revert "Revert "Docs" for easier review"

This reverts commit 6fdc69408102bf695797f2dfddbb6350c6b9e722.

* code quality
2020-03-09 13:48:58 -04:00
49debe62fd Merge pull request #3190 from patrickvonplaten/fix_repetition_penalty_in_tf_generate
fix repetition penalty mask in tf
2020-03-09 16:29:57 +01:00
847d370301 fix typo 2020-03-09 16:18:29 +01:00
eb3e6cb04f cased -> uncased in BERT SQuAD example
closes #3183
2020-03-09 10:54:18 -04:00
9050ffe035 delete w! -> need to be more careful with vim 2020-03-09 15:43:12 +01:00
efb619235c add print statement to avoid code quality problem 2020-03-09 15:31:21 +01:00
b12541c4dc test ctrl 2020-03-09 13:58:01 +00:00
3e624c64ca fix repetition penalty mask in tf 2020-03-09 14:55:11 +01:00
b73dd1a0e4 fix typo in test xlm tf 2020-03-09 11:34:31 +01:00
4620caa864 fix if use lang embeddings in tf xlm 2020-03-09 11:18:54 +01:00
fbd02d4693 fixed all tests, still need to check ctrl tf and pt and xlm tf 2020-03-08 21:45:55 +01:00
b4a3a64744 fix xlnet & transfotests 2020-03-08 16:25:03 +01:00
b29fed790b Updated Tokenw ise in print statement to Token wise 2020-03-08 10:55:30 -04:00
66c827656f fix typo in test gpt2 2020-03-08 15:35:08 +01:00
314bdc7c14 fix typo in test 2020-03-08 15:34:20 +01:00
575976144a updated all tests 2020-03-08 15:29:10 +01:00
e03129ad44 [model_cards] Small formatting fix
cc @mrm8488
2020-03-06 18:07:44 -05:00
08a70fb392 [model_cards] Fixup d6df9a8f 2020-03-06 17:50:38 -05:00
0ae91c80aa Change back pipeline signatures (#3105)
* Change back pipeline signatures

* String types for non-imported objects
2020-03-06 17:26:18 -05:00
d6df9a8ffe [model_cards]Add albert chinese model(tiny,small,base,large,xlarge,xxlarge) 2020-03-06 17:23:31 -05:00
c52716d46c Create README.md 2020-03-06 17:20:19 -05:00
73a0c25376 remove excess line breaks in DeepPavlov model cards 2020-03-06 17:19:35 -05:00
ed37f9fa4f [Bart] _prepare_decoder_inputs should use large negative (#3158) 2020-03-06 16:06:36 -05:00
0416d437fb Merge pull request #3148 from patrickvonplaten/refactoring_beam_search_for_tf_2
refactored beam search according to torch implementation
2020-03-06 22:01:46 +01:00
db9279dedb Fix QA models binding for Flaubert, XLNet and XLM. (#3100)
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

Format & quality

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

Again.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-06 13:04:29 -05:00
e58b3ec5df add imports to examples (#3160) 2020-03-06 11:15:33 -05:00
6ffe03a0a1 Merge pull request #3137 from tomhosking/bart-refactor
Refactor BartModel so that input checks are handled within enc/dec
2020-03-06 13:06:34 +01:00
3e5da38dae Merge pull request #3132 from huggingface/hf_api_model_list
[hf_api] Get the public list of all the models on huggingface
2020-03-06 13:05:52 +01:00
9499a3778e Merge pull request #3103 from gthb/keras-serialization
Support keras JSON/HDF5 serialization of main layers
2020-03-06 12:59:13 +01:00
9362eb4a07 refactored beam search according to torch implementation 2020-03-06 00:46:29 +01:00
c8035e11e8 Merge pull request #3149 from patrickvonplaten/fix_renaming_error
fix missed BartForMaskedLM renaming
2020-03-06 00:45:08 +01:00
58fc8f97a3 fix renaming problem 2020-03-06 00:35:47 +01:00
857e0a0d3b Rename BartForMaskedLM -> BartForConditionalGeneration (#3114)
* improved documentation
2020-03-05 17:41:18 -05:00
fa2aa699da Merge pull request #3011 from patrickvonplaten/add_models_special_tokens_to_specific_configs
Add models special tokens to its pretrained configs
2020-03-05 17:26:48 -05:00
146c521235 Merge branch 'master' into add_models_special_tokens_to_specific_configs 2020-03-05 17:24:42 -05:00
b623ddc000 Pass kwargs to configuration (#3147)
* Pass kwargs to configuration

* Setter

* test
2020-03-05 17:16:57 -05:00
0001d05686 Correct missing keys + test (#3143) 2020-03-05 17:01:54 -05:00
1741d740f2 Merge pull request #3145 from sshleifer/bartfp16
[Bart] FP16 Support
2020-03-05 22:14:35 +01:00
bbabbc1613 Merge pull request #3135 from patrickvonplaten/refactor_beam_search_generate
Refactoring and bug fixing beam search generate
2020-03-05 22:12:56 +01:00
14d40584b2 remove newline 2020-03-05 13:06:35 -05:00
1360dacaa3 cleanup deltas 2020-03-05 12:57:42 -05:00
810079de1f no ipdb 2020-03-05 12:48:14 -05:00
c203509d5b undo chg 2020-03-05 12:34:08 -05:00
c36fdc88d4 tests pass 2020-03-05 12:33:08 -05:00
7ac47bfe69 Updated notebook dependencies for Colab.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-05 16:07:51 +01:00
be02176a4b Fixing sentiment pipeline in 03-pipelines notebook.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-05 16:07:51 +01:00
8a2d9bc9ef Add model cards for DeepPavlov models (#3138)
* add empty model cards for every current DeepPavlov model

* fix: replace cyrillic `с` with `c`

* docs: add model cards for current DeepPavlov BERT models

* docs: add links for arXiv preprints
2020-03-05 09:34:43 -05:00
012cbdb0f5 Updating colab links in notebooks README.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-05 15:34:15 +01:00
31acb8dc52 Remove rogue .DS_Store 2020-03-05 13:51:30 +00:00
06a6cb6f36 Refactor BartModel so that input checks are handled within BartEncoder and BartDecoder 2020-03-05 13:45:41 +00:00
e33ed12c3b uncomment expression 2020-03-05 13:41:04 +01:00
4220fd52b9 remove ipdb 2020-03-05 13:36:21 +01:00
c47394b0c9 refactoring and bug fixing beam search generate 2020-03-05 13:12:50 +01:00
4c91a3af94 Document keras_serializable decorator 2020-03-05 11:48:10 +00:00
4be01e5cbf Use name transformers_config in Keras serialization
Be explicit that this is config for the transformers package (as these
layers may coexist with other custom stuff in a Keras model, plus the
Keras container itself is called config, and config["config"] is not
great)

Add explicit error handling for initializer calls that have neither
the `config` nor the `transformers_config` argument, or have both.
2020-03-05 11:47:35 +00:00
a355f4f0fc Add functools.wraps for wrapper initializer
Preserve the original initializer function's metadata. See
https://docs.python.org/3/library/functools.html#functools.update_wrapper
2020-03-05 11:18:50 +00:00
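The commit above leans on `functools.wraps` to keep the wrapped initializer introspectable. A minimal sketch of that pattern (helper names here are illustrative, not the actual repo code):

```python
import functools

def wrap_init(original_init):
    # Without functools.wraps, the wrapper would shadow the original
    # initializer's __name__, __doc__, and signature metadata.
    @functools.wraps(original_init)
    def wrapper(self, *args, **kwargs):
        # ... intercept or normalize arguments here ...
        original_init(self, *args, **kwargs)
    return wrapper
```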
d262a5d48e fix: remove unused import 2020-03-05 11:05:29 +00:00
30624f7056 Fix Colab links + install dependencies first.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-05 11:40:15 +01:00
3f067f4409 [hf_api] slightly more doc 2020-03-04 23:55:46 -05:00
f564f93c84 [hf_api] Get the public list of all the models on huggingface 2020-03-04 23:33:09 -05:00
ff9e79ba3a make style 2020-03-04 20:18:07 -05:00
07a79db505 Fix failing doc samples 2020-03-04 19:11:31 -05:00
4f338ed407 Explicit config_class instead of module inspection 2020-03-04 23:45:29 +00:00
6fe1cc0874 fix: clean up inadvertent change in tf_t5
This was the beginnings of an attempt to address the test failure on
this layer, and instead I backed out of making this layer
keras-serializable at all ... so it was a mistake to commit this.
2020-03-04 23:24:15 +00:00
bdd3d0c76d Merge pull request #3118 from patrickvonplaten/add_beam_search_to_generation_tf_2_0
Add beam search to generation tf 2 0
2020-03-04 23:28:00 +01:00
c440030e99 [model_cards] Tag AR model languages 2020-03-04 16:33:10 -05:00
3b7f95a506 Merge pull request #3115 from gthb/fix-bogus-param-to-layer-init
fix: passing config as Layer trainable param
2020-03-04 21:59:09 +01:00
1bca97ec7f Update notebook link and fix few working issues.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-03-04 21:19:33 +01:00
189113d891 Create README.md 2020-03-04 13:57:23 -05:00
76111a3d3a [model_cards] Add card by @lvwerra
(the current way to submit a model card to have it displayed on the website is to open a PR on the `transformers` repo itself)

Thanks for sharing!
2020-03-04 12:55:20 -05:00
a43c388abb [model_cards] Add card by @djstrong
(the current way to submit a model card to have it displayed on the website is to open a PR on the `transformers` repo itself)

Thanks for sharing!
2020-03-04 12:53:02 -05:00
ec60e0ae7a Create README.md 2020-03-04 12:06:05 -05:00
6a143bf282 model cards for both aubmindlab/bert-base-arabert models (#3113)
* Added readme for AraBERTv0.1

* Added readme to AraBERT
2020-03-04 12:04:39 -05:00
932eab943d include tf gpt2 tests for attn mask and past variable (#3122) 2020-03-04 12:03:46 -05:00
256cbbc4a2 [doc] Fix link to how-to-train Colab 2020-03-04 12:01:45 -05:00
006097f8ad rename variables named 'word' to 'token' in generate fn (#3119)
* fix conflicts

* fixed naming bug

* make style
2020-03-04 12:01:17 -05:00
18f4b9274f fix: work with Tensorflow < 2.1.0
tf.keras.utils.register_keras_serializable was added in TF 2.1.0, so
don't rely on it being there; just decorate the class with it if it
exists.
2020-03-04 16:57:29 +00:00
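A minimal sketch of the compatibility guard described above, assuming only that `tf.keras.utils.register_keras_serializable` may be absent before TF 2.1.0 (the helper name is hypothetical):

```python
import tensorflow as tf

def maybe_keras_serializable(cls):
    # register_keras_serializable appeared in TF 2.1.0; on older
    # versions, simply return the class undecorated.
    if hasattr(tf.keras.utils, "register_keras_serializable"):
        cls = tf.keras.utils.register_keras_serializable()(cls)
    return cls
```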
71c8711970 Adding Docker images for transformers + notebooks (#3051)
* Added transformers-pytorch-cpu and gpu Docker images

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added automatic jupyter launch for Docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move image from alpine to Ubuntu to align with NVidia container images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added TRANSFORMERS_VERSION argument to Dockerfile.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Pytorch-GPU based Docker image

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Tensorflow images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use python 3.7 as Tensorflow doesn't provide a 3.8-compatible wheel.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove double FROM instructions on transformers-pytorch-cpu image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added transformers-tensorflow-gpu Docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* use the correct ubuntu version for tensorflow-gpu

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added pipelines example notebook

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added transformers-cpu and transformers-gpu (including both PyTorch and TensorFlow) images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Docker images don't start jupyter notebook by default.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Tokenizers notebook

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update images links

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update Docker images to python 3.7.6 and transformers 2.5.1

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added 02-transformers notebook.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Trying to realign 02-transformers notebook?

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added Transformer image schema

* Some tweaks on tokenizers notebook

* Removed old notebooks.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Attempt to provide table of content for each notebooks

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Second attempt.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reintroduce transformer image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Keep trying

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* It's going to fly!

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remaining of the Table of Content

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix inlined elements for the table of content

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed anaconda dependencies for Docker images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removing notebooks ToC

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added LABEL to each docker image.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed old Dockerfile

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Directly use the context and include transformers from here.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reduce overall size of compiled Docker images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Install jupyter by default and use CMD for easier launching of the images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Reduce number of layers in the images.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added README.md for notebooks.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix notebooks link in README

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix some wording issues.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added blog notebooks too.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing spelling errors in review comments.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

Co-authored-by: MOI Anthony <xn1t0x@gmail.com>
2020-03-04 11:45:57 -05:00
7a89a3e493 correct beam search sampling 2020-03-04 17:27:47 +01:00
c4c4c9998a make GPT2 and CTRL shape consistent between torch and TF 2020-03-04 17:27:47 +01:00
2529b2d37e set redorder past sort dimension to its default 2020-03-04 17:27:47 +01:00
61fef6e957 added beam_search generation for tf 2.0 2020-03-04 17:27:47 +01:00
34de670dbe fix sklearn release circle ci [temporary] (#3123) 2020-03-04 11:25:23 -05:00
6701fb7859 fix beam_search behavior when sampling (#3106)
* fix beam_search behavior when sampling

* delete print

* make correct style
2020-03-04 09:30:51 -05:00
b1116fd673 fix: passing config as Layer trainable param
Lurking bugs discovered while working on other stuff.
2020-03-03 23:05:40 +00:00
96c4990165 fix unused imports and style 2020-03-03 22:57:05 +00:00
470753bcf5 Put @keras_serializable only on layers it works on
And only run the test on TF*MainLayer classes so marked.
2020-03-03 22:44:45 +00:00
0c716ede8c Use class decorator instead of superclass
When supplied by Keras deserialization, the config parameter to initializers
will be a dict. So intercept it and convert to PretrainedConfig object (and
store in instance attribute for get_config to get at it) before passing to the
actual initializer. To accomplish this, and repeat as little code as possible,
use a class decorator on TF*MainLayer classes.
2020-03-03 22:31:42 +00:00
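A condensed sketch of the decorator idea this commit describes, assuming a `config_class` attribute on each TF*MainLayer class (per the "Explicit config_class" commit above); attribute names are illustrative, not the exact repo implementation:

```python
import functools

def keras_serializable(cls):
    original_init = cls.__init__

    @functools.wraps(original_init)
    def wrapped_init(self, config, *args, **kwargs):
        # Keras deserialization hands us a plain dict: rebuild the
        # config object and keep it around for get_config().
        if isinstance(config, dict):
            config = cls.config_class.from_dict(config)
        self._transformers_config = config
        original_init(self, config, *args, **kwargs)

    cls.__init__ = wrapped_init
    return cls
```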
e9e6efdc45 BartForSequenceClassification: fix num_labels, add test (#3110) 2020-03-03 15:54:29 -05:00
f631e01d2c [ci] Re-run integration ground truth from fairseq
Adopted best practice set by @patrickvonplaten of commenting lines run on fairseq, for easy comparison

also see #3020
2020-03-03 15:31:40 -05:00
5b396457e5 Summarization Examples: add Bart CNN Evaluation (#3082)
* Rename and improve example

* Add test

* slightly faster test

* style

* This breaks remy prolly

* shorter test string

* no slow

* newdir structure

* New tree

* Style

* shorter

* docs

* clean

* Attempt future import

* more import hax
2020-03-03 15:29:59 -05:00
5c5af879b6 [Bart] dont call .forward (#3094) 2020-03-03 15:14:12 -05:00
b8da16f390 Add (failing) tests for Keras save/load 2020-03-03 15:22:34 +00:00
ba28170717 Support keras JSON/HDF5 serialization of main layers
Fixes #3101
2020-03-03 15:21:41 +00:00
a088d75e51 [model_cards] Fix incorrect path 2020-03-03 09:52:32 -05:00
4134100363 Add generate() functionality to TF 2.0 (#3063)
* add first copy past test to tf 2 generate

* add tf top_k_top_p_filter fn

* add generate function for TF

* add generate function for TF

* implemented generate for all models except transfoXL

* implemented generate for all models except transfoXL

* implemented generate for all models except transfoXL

* make style

* change permission of test file to correct ones

* delete ipdb

* delete ipdb

* fix bug and finish simple gpt2 integration test

* clean test file

* clean test file

* make style

* make style

* make style

* make style

* change import style

* change import style

* make style

* make style

* add decorators

* add decorators

* fix tf ctrl bug dim => axis in TF

* make style

* make style

* refactored test file

* refactored test file

* take out test_torch_tf_conversion if nothing is defined

* take out test_torch_tf_conversion if nothing is defined

* remove useless files

* remove useless files

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* fix conflicts

* solve conflicts

* solve conflicts

* fix conflicts

* fix conflicts

* merge conflicts

* delete ipdb

* exposed top_k_top_p_filtering fns

* delete weirdly created w! file

* add comment to test tf common modeling

* fix conflicts

* fix conflicts

* make style

* merge conflicts

* make style

* change tf.tensor.shape to shape_list(tensor)
2020-03-03 09:42:15 -05:00
b31f715019 bert-base-arabic model card 2020-03-03 09:29:28 -05:00
c0c7ec3458 Don't crash if fine-tuned model doesn't end with a number (#3099)
That's the same fix applied in https://github.com/huggingface/transformers/issues/2258, but for the GLUE example
2020-03-03 08:59:47 -05:00
eec5ec8071 [BART] to each its own config + make BART compatible w/ Pipelines
cc @sshleifer
2020-03-02 18:56:17 -05:00
6b1558bad8 add models cards for camembert-base-fquad camembert-base-squad (#3089)
* add models cards for camembert-base-fquad camembert-base-squad

* typo fix
2020-03-02 17:07:13 -05:00
f169957d0c TF GPU CI (#3085)
* debug env

* Restrict TF GPU memory

* Fixup

* One more test

* rm debug logs

* Fixup
2020-03-02 15:45:25 -05:00
d3eb7d23a4 Pipeline doc (#3055)
* Pipeline doc initial commit

* pipeline abstraction

* Remove modelcard argument from pipeline

* Task-specific pipelines can be instantiated with no model or tokenizer

* All pipelines doc
2020-03-02 14:07:10 -05:00
2c7749784c Update README.md
- Add example of usage
- Update metrics
2020-03-02 13:35:34 -05:00
0e56b37e80 rm bogus file
cc @patrickvonplaten
2020-03-02 12:27:12 -05:00
2fdc7f6ce8 correct greedy generation when doing beam search (#3078)
* correct greedy generation when doing beam search

* improve comment
2020-03-02 12:00:09 -05:00
13afb71208 [ci] Ensure that TF does not preempt all GPU memory for itself
see https://www.tensorflow.org/guide/gpu#limiting_gpu_memory_growth

Co-Authored-By: Funtowicz Morgan <mfuntowicz@users.noreply.github.com>
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2020-03-02 11:56:45 -05:00
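What the linked TF guide boils down to, as a sketch using the standard TF 2.x API (not the repo's exact CI code):

```python
import tensorflow as tf

# Allocate GPU memory on demand instead of letting TF grab it all up
# front, so concurrent test processes can share the same GPU.
for gpu in tf.config.experimental.list_physical_devices("GPU"):
    tf.config.experimental.set_memory_growth(gpu, True)
```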
c0135194eb Force pad_token_id to be set before padding for standard tokenizer (#3035)
* force pad_token_id to be set before padding

* fix tests and forbid padding without having a padding_token_id set
2020-03-02 10:53:55 -05:00
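A sketch of the guard this commit describes; `pad_sequences` is a hypothetical helper, and only `pad_token_id` is the real tokenizer attribute:

```python
def pad_sequences(tokenizer, batch_ids, max_length):
    # Hypothetical helper: refuse to pad when no padding token is set,
    # instead of silently padding with an arbitrary id.
    if tokenizer.pad_token_id is None:
        raise ValueError(
            "Cannot pad: this tokenizer has no pad_token_id. Assign one "
            "first, e.g. tokenizer.pad_token = tokenizer.eos_token."
        )
    return [ids + [tokenizer.pad_token_id] * (max_length - len(ids))
            for ids in batch_ids]
```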
b54ef78d0c Bart-CNN (#3059)
`generate` code that produces 99% identical summarizations to fairseq on CNN test data, with caching.
2020-03-02 10:35:53 -05:00
6b1ff25084 fix n_gpu count when no_cuda flag is activated (#3077)
* fix n_gpu count when no_cuda flag is activated

* someone was left behind
2020-03-02 10:20:21 -05:00
298bed16a8 make style 2020-03-01 14:08:01 -05:00
852e032ca6 include roberta in run_squad_w_distillation - cc @graviraja 2020-03-01 01:56:50 +00:00
b5509abb36 --do_lower_case will always trick me... 2020-03-01 01:39:24 +00:00
d6ef587a10 [ci] Fixup e36bd94345af6045108a391f9ac7f4dc557548de 2020-02-28 23:19:17 -05:00
e36bd94345 [ci] Run all tests on (self-hosted) GPU (#3020)
* Create self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* Update self-hosted.yml

* do not run slow tests, for now

* [ci] For comparison with circleci, let's also run CPU-tests

* [ci] reorganize

* clearer filenames

* [ci] Final tweaks before merging

* rm slow tests on circle ci

* Trigger CI

* On GPU this concurrency was way too high
2020-02-28 21:11:08 -05:00
908fa43b54 Changes to NER examples for PLT and TPU (#3053)
* changes to allow for tpu training

* black

* tpu

* tpu
2020-02-27 16:45:32 -05:00
8bcb37bfb8 NER support for Albert in run_ner.py and NerPipeline (#2983)
* Added support for Albert when fine-tuning for NER

* Added support for Albert in NER pipeline

* Added command-line options to examples/ner/run_ner.py to better control tokenization

* Added class AlbertForTokenClassification

* Changed output for NerPipeline to use .convert_ids_to_tokens(...) instead of .decode(...) to better reflect tokens

* Added ,

* Now passes style guide enforcement

* Changes from reviews.

* Code now passes style enforcement

* Added test for AlbertForTokenClassification

* Added test for AlbertForTokenClassification
2020-02-27 10:22:55 -05:00
6a37588041 spelling: strictly (#3042) 2020-02-27 10:22:35 -05:00
f4ff44a6d9 Fix batch_encode_plus (#3041) 2020-02-27 09:56:47 -05:00
f71157529e Added test for AlbertForTokenClassification 2020-02-27 12:24:20 +01:00
aceb6a0907 Added test for AlbertForTokenClassification 2020-02-27 11:52:46 +01:00
d762d4289c Code now passes style enforcement 2020-02-26 23:50:40 +01:00
9495d38b0d Changes from reviews. 2020-02-26 23:36:39 +01:00
b370cc7e99 [gpu] Fixup fdd61b19928e87a5354c36923182e801bfedb31b 2020-02-26 21:48:49 +00:00
f5516805c2 Fix bart slow test 2020-02-26 20:47:49 +00:00
5bc99e7f33 fix several typos in Distil* readme (#3034) 2020-02-26 12:39:54 -05:00
fdd61b1992 Fix attn mask gpt2 when using past (#3033)
* fix issue and add some tests

* fix issue and add some tests

* updated doc string gpt2
2020-02-26 12:04:37 -05:00
9cda3620b6 Fix (non-slow) tests on GPU (torch) (#3024)
* Fix tests on GPU (torch)

* Fix bart slow tests

Co-authored-by: Sam Shleifer <sshleifer@gmail.com>
2020-02-26 11:59:25 -05:00
9df74b8bc4 Delete all mentions of Model2Model (#3019) 2020-02-26 11:36:27 -05:00
bb7c468520 Documentation (#2989)
* All Tokenizers

BertTokenizer + few fixes
RobertaTokenizer
OpenAIGPTTokenizer + Fixes
GPT2Tokenizer + fixes
TransfoXLTokenizer
Correct rst for TransformerXL
XLMTokenizer + fixes
XLNet Tokenizer + Style
DistilBERT + Fix XLNet RST
CTRLTokenizer
CamemBERT Tokenizer
FlaubertTokenizer
XLMRobertaTokenizer
cleanup

* cleanup
2020-02-25 18:43:36 -05:00
c913eb9c38 Add integration tests for xlm roberta modelling and xlm roberta tokenzier (#3014)
* add first files

* add xlm roberta integration tests

* make style

* flake 8 issues solved
2020-02-25 16:51:25 -05:00
e8ce63ff21 Change masking to direct labeling for TPU support. (#2982)
* change masking to direct labelings

* fix black

* switch to ignore index

* .

* fix black
2020-02-25 14:47:43 -05:00
7a7ee28cb9 missing ner link (#2967) 2020-02-25 14:06:57 -05:00
65e7c90a77 Adding usage examples for common tasks (#2850)
* Usage: Sequence Classification & Question Answering

* Pipeline example

* Language modeling

* TensorFlow code for Sequence classification

* Custom TF/PT toggler in docs

* QA + LM for TensorFlow

* Finish Usage for both PyTorch and TensorFlow

* Addressing Julien's comments

* More assertive

* cleanup

* Favicon
- added favicon option in conf.py along with the favicon image
- updated 🤗 logo. slightly smaller and should appear more consistent across editing programs (no more tongue on the outside of the mouth)

Co-authored-by: joshchagani <joshua@joshuachagani.com>
2020-02-25 13:48:24 -05:00
f5b50c6b8e make style 2020-02-25 16:41:54 +01:00
ec16142ee5 add special tokens to pretrain configs of respective lm head models 2020-02-25 16:37:59 +01:00
e645dcbb70 add special tokens to pretrain configs of respective lm head models 2020-02-25 16:37:56 +01:00
e693cd1e87 [ci] Run slow tests every day 2020-02-24 19:54:47 -05:00
4fc63151af [ci] Attempt to fix #2844 2020-02-24 19:51:34 -05:00
b90745c590 Test correct tokenizers after default switch (#3003) 2020-02-24 18:45:53 -05:00
3716c3d8af False by default (#3002) 2020-02-24 18:30:57 -05:00
f9ec5ca90b Release: v2.5.1 2020-02-24 18:22:54 -05:00
4cd9c0971c Fix for fast tokenizers save_pretrained compatibility with Python. (#2933)
* Renamed file generated by tokenizers when calling save_pretrained to match python.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added save_vocabulary tests.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove python quick and dirty fix for clean Rust impl.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bump tokenizers dependency to 0.5.1

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* TransfoXLTokenizerFast uses a json vocabulary file + warning about incompatibility between Python and Rust

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added some save_pretrained / from_pretrained unittests.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update tokenizers to 0.5.2

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Quality and format.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* flake8

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Making sure there is really a bug in unittest

* Fix TransfoXL constructor vocab_file / pretrained_vocab_file mixin.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-24 18:20:42 -05:00
ee60840ee6 fix _update_memory fn call in transformer-xl (#2971) 2020-02-24 17:50:24 -05:00
6a50d501ec add explaining example to XLNet LM modeling (#2997)
* add explaining example to XLNet LM modeling

* improve docstring for xlnet
2020-02-24 15:42:38 -05:00
65d74c4965 Add preprocessing step for transfo-xl tokenization to avoid tokenizing words followed by punction to <unk> (#2987)
* add preprocessing to add space before punctuation for transfo_xl

* improve warning messages

* make style

* compile regex at instantiation of tokenizer object
2020-02-24 15:11:10 -05:00
a143d9479e Add local_files_only parameter to pretrained items (#2930)
* Add disable_outgoing to pretrained items

Setting disable_outgoing=True disables outgoing traffic:
- etags are not looked up
- models are not downloaded

* parameter name change

* Remove forgotten print
2020-02-24 14:58:15 -05:00
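Usage of the renamed parameter, assuming the model is already in the local cache:

```python
from transformers import AutoModel, AutoTokenizer

# local_files_only=True skips etag lookups and downloads entirely;
# from_pretrained raises if the files are not cached locally yet.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", local_files_only=True)
model = AutoModel.from_pretrained("bert-base-uncased", local_files_only=True)
```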
286d1ec746 Create README.md 2020-02-24 14:33:49 -05:00
7984a70ee4 kwargs are passed to both model and configuration in AutoModels (#2998) 2020-02-24 14:19:39 -05:00
21d8b6a33e Testing that batch_encode_plus is the same as encode_plus (#2973)
* Testing that encode_plus and batch_encode_plus behave the same way

Spoiler alert: they don't

* Testing rest of arguments in batch_encode_plus

* Test tensor return in batch_encode_plus

* Addressing Sam's comments

* flake8

* Simplified with `num_added_tokens`
2020-02-24 12:09:46 -05:00
17c45c39ed Add slow generate tests for pretrained lm models (#2909)
* add slow generate lm_model tests

* fix conflicts

* merge conflicts

* fix conflicts

* add slow generate lm_model tests

* make style

* delete unused variable

* fix conflicts

* fix conflicts

* fix conflicts

* delete unused variable

* fix conflicts

* finished hard coded tests
2020-02-24 11:51:57 -05:00
8194df8e0c Warning on add_special_tokens (#2966)
Warning on `add_special_tokens` when passed to `encode`, `encode_plus` and `batch_encode_plus`
2020-02-24 08:42:54 -05:00
38f5fe9e02 add_ctags_to_git_ignore (#2984) 2020-02-23 16:55:32 -05:00
105dcb4162 Now passes style guide enforcement 2020-02-23 21:47:59 +01:00
33eb8a165d Added , 2020-02-23 21:43:31 +01:00
869b66f6b3 * Added support for Albert when fine-tuning for NER
* Added support for Albert in NER pipeline

* Added command-line options to examples/ner/run_ner.py to better control tokenization

* Added class AlbertForTokenClassification

* Changed output for NerPipeline to use .convert_ids_to_tokens(...) instead of .decode(...) to better reflect tokens
2020-02-23 21:13:03 +01:00
129f0604ac Delete untested, broken Model2LSTM (#2968) 2020-02-23 11:28:48 -05:00
0e84559d64 Correct special_tokens_mask when add_special_tokens=False (#2965)
Don't know of a use case where that would be useful, but this is more consistent
2020-02-23 09:50:39 -05:00
92487a1dc0 Bart: fix layerdrop and cached decoder_input_ids for generation (#2969) 2020-02-22 16:25:04 -05:00
c36416e53c Add standardized get_vocab method to tokenizers 2020-02-22 12:09:01 -05:00
cafc4dfc7c fix hardcoded path in examples readme 2020-02-22 11:12:38 -05:00
34b4b5a9ed Update modelcard of bert-base-german-cased
Add image
2020-02-22 11:08:42 -05:00
7df12d7bf8 Update README.md
- I added an example using the model with pipelines to show that we have set ```{"use_fast": False}``` in the tokenizer.
- I added a Colab to play with the model and pipelines
- I added a Colab to discover Huggingface pipelines at the end of the document
2020-02-22 11:06:41 -05:00
cc6775cdf5 Fix max_length not taken into account when using pad_to_max_length on fast tokenizers (#2961)
* enable_padding should pad up to max_length if set.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added more testing on padding.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-22 09:27:47 -05:00
94ff2d6ee8 Remove double bias (#2958) 2020-02-21 17:10:18 -05:00
b5b3445c4f Only use F.gelu for torch >=1.4.0 (#2955)
* Only use F.gelu for torch >=1.4.0

* Use F.gelu for newer torch
2020-02-21 16:10:21 -05:00
fc38d4c86f Improve special_token_id logic in run_generation.py and add tests (#2885)
* improving generation

* finalized special token behaviour for no_beam_search generation

* solved modeling_utils merge conflict

* solve merge conflicts in modeling_utils.py

* add run_generation improvements from PR #2749

* adapted language generation to not use hardcoded -1 if no padding token is available

* remove the -1 removal as hard-coded -1s are not necessary anymore

* add lightweight language generation testing for randomly initialized models - just checking whether no errors are thrown

* add slow language generation tests for pretrained models using hardcoded output with pytorch seed

* delete ipdb

* check that all generated tokens are valid

* renaming

* renaming Generation -> Generate

* make style

* updated so that generate_beam_search has the same token behavior as generate_no_beam_search

* consistent return format for run_generation.py

* deleted pretrain lm generate tests -> will be added in another PR

* cleaning of unused if statements and renaming

* run_generate will always return an iterable

* make style

* consistent renaming

* improve naming, make sure generate function always returns the same tensor, add docstring

* add slow tests for all lmhead models

* make style and improve example comments modeling_utils

* better naming and refactoring in modeling_utils

* changed fast random lm generation testing design to more general one

* delete old testing design in gpt2

* correct old variable name

* temporary fix for encoder_decoder lm generation tests - has to be updated when t5 is fixed

* adapted all fast random generate tests to new design

* better warning description in modeling_utils

* better comment

* better comment and error message

Co-authored-by: Thomas Wolf <thomwolf@users.noreply.github.com>
2020-02-21 12:09:59 -05:00
c749a543fa Added CamembertForQuestionAnswering (#2746)
* Added CamembertForQuestionAnswering

* fixed camembert tokenizer case
2020-02-21 12:01:02 -05:00
5211d333bb Update modeling_tf_utils.py (#2924)
Tensorflow does not use .eval() vs .train().

closes https://github.com/huggingface/transformers/issues/2906
2020-02-21 11:28:32 -05:00
3e98f27e4a Create README.md for xlnet_large_squad (#2942) 2020-02-21 08:54:41 -05:00
4452b44b90 Labels are now added to model config under id2label and label2id (#2945) 2020-02-21 08:53:05 -05:00
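A short example of the mapping this commit adds, with made-up label names:

```python
from transformers import AutoConfig

# id2label / label2id live on the config so downstream code (e.g.
# pipelines) can report readable labels instead of LABEL_0, LABEL_1.
config = AutoConfig.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    id2label={0: "NEGATIVE", 1: "POSITIVE"},
    label2id={"NEGATIVE": 0, "POSITIVE": 1},
)
```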
53ce3854a1 New BartModel (#2745)
* Results same as fairseq
* Wrote a ton of tests
* Struggled with api signatures
* added some docs
2020-02-20 18:11:13 -05:00
564fd75d65 Removed unused fields in DistilBert TransformerBlock (#2710)
* Removed unused fields in DistilBert TransformerBlock
2020-02-20 16:08:21 -05:00
889d3bfdbb default arg fix (#2937) 2020-02-20 15:31:17 -05:00
197d74f988 Add get_vocab method to PretrainedTokenizer 2020-02-20 15:26:49 -05:00
ea8eba35e2 Fix InputExample docstring (#2891) 2020-02-20 15:25:15 -05:00
e2a6445ebb Tokenizer fast warnings (#2922)
* Remove warning when pad_to_max_length is not set.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move RoBERTa warning to RoBERTa and not GPT2 base tokenizer.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-20 11:55:03 -05:00
9b3093311f Expose all constructor parameter for BertTokenizerFast (#2921)
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-20 11:53:32 -05:00
b662f0e625 Support for torch-lightning in NER examples (#2890)
* initial pytorch lightning commit

* tested multigpu

* Fix learning rate schedule

* black formatting

* fix flake8

* isort

* isort

* .

Co-authored-by: Check your git settings! <chris@chris-laptop>
2020-02-20 11:50:05 -05:00
ab1238393c Update to include example of LM
The model files have been updated in order to include the classification layers, based on https://github.com/huggingface/transformers/issues/2901, and can now also be used as a LM.
2020-02-20 10:57:59 -05:00
976e9afece Add syntax highlighting to the BibTeX in README 2020-02-20 10:06:15 -05:00
cbc5705541 Fix spell: EsperBERTo, not EspertBERTo 2020-02-20 10:02:07 -05:00
d490b5d500 Fast Tokenizers save pretrained should return the list of generated file paths. (#2918)
* Correctly return the tuple of generated file(s) when calling save_pretrained

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Quality and format.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-20 00:58:04 +01:00
2708b44ee9 Patch ALBERT with heads in TensorFlow 2020-02-19 18:46:25 -05:00
1abd53b1aa Patch ALBERT with heads in TensorFlow 2020-02-19 18:24:40 -05:00
e676764241 Override build_inputs_with_special_tokens for fast tokenizers (#2912)
* Override build_inputs_with_special_tokens for fast impl + unittest.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Quality + format.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-19 16:09:51 -05:00
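What the overridden method computes, shown for a BERT-style tokenizer (a sketch of the public API, not the fast-tokenizer internals):

```python
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("hello world"))

# For BERT this wraps the ids as [CLS] ids [SEP]
# (and appends "ids_b [SEP]" when a second sequence is passed).
with_special = tokenizer.build_inputs_with_special_tokens(ids)
```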
59c23ad9c9 README link + better instructions for release 2020-02-19 11:57:17 -05:00
22b2b5790e Documentation v2.5.0 2020-02-19 11:53:30 -05:00
fb560dcb07 Release: v2.5.0
Welcome Rust Tokenizers
2020-02-19 11:46:19 -05:00
3f3fa7f7da Integrate fast tokenizers library inside transformers (#2674)
* Implemented fast version of tokenizers

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bumped tokenizers version requirements to latest 0.2.1

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added matching tests

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Matching OpenAI GPT tokenization !

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Matching GPT2 on tokenizers

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Expose add_prefix_space as constructor parameter for GPT2

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Matching Roberta tokenization !

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Removed fast implementation of CTRL.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Binding TransformerXL tokenizers to Rust.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Updating tests accordingly.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added tokenizers as top-level modules.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Black & isort.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Rename LookupTable to WordLevel to match Rust side.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Black.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Use "fast" suffix instead of "ru" for rust tokenizers implementations.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Introduce tokenize() method on fast tokenizers.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* encode_plus dispatches to batch_encode_plus

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* batch_encode_plus now dispatches to encode if there is only one input element.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bind all the encode_plus parameters to the forwarded batch_encode_plus call.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bump tokenizers dependency to 0.3.0

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Formatting.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix tokenization_auto with support for new (python, fast) mapping schema.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Give correct fixtures path in test_tokenization_fast.py for the CLI.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Expose max_len_ properties on BertTokenizerFast

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Move max_len_ properties to PreTrainedTokenizerFast and override in specific subclasses.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* _convert_encoding should keep the batch axis tensor if only one sample in the batch.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add warning message for RobertaTokenizerFast if used for MLM.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added use_fast (bool) parameter on AutoTokenizer.from_pretrained().

This makes it easy to enable/disable Rust-based tokenizer instantiation.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Let's tokenizers handle all the truncation and padding stuff.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Allow providing tokenizer arguments during pipeline creation.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update test_fill_mask pipeline to not use fast tokenizers.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix too much parameters for convert_encoding.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* When enabling padding, max_length should be set to None.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Avoid returning nested tensors of length 1 when calling encode_plus

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Ensure output is padded when return_tensor is not None.

Tensor creation requires the initial list input to be of the exact same size.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Disable transfoxl unittest if pytorch is not available (required to load the model)

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* encode_plus should not remove the leading batch axis if return_tensor is set

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Temporarily disable fast tokenizers on QA pipelines.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix formatting issues.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Update tokenizers to 0.4.0

* Update style

* Enable truncation + stride unit test on fast tokenizers.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Add unittest ensuring special_tokens set match between Python and Rust.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Ensure special_tokens are correctly set during construction.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Give more warning feedback to the user in case of padding without pad_token.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* quality & format.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added possibility to add a single token as str

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Added unittest for add_tokens and add_special_tokens on fast tokenizers.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix rebase mismatch on pipelines qa default model.

QA requires cased input while the tokenizers would be uncased.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Using offset mapping relative to the original string + unittest.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: save_vocabulary requires folder and file name

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Simplify import for Bert.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: truncate_and_pad disables padding according to the same heuristic as the one enabling padding.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Remove private member access in tokenize()

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Bump tokenizers dependency to 0.4.2

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* format & quality.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Use named arguments when applicable.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Add Github link to Roberta/GPT2 space issue on masked input.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Move max_len_single_sentence / max_len_sentences_pair to PreTrainedTokenizerFast + tests.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Relax type checking to include tuple and list object.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Addressing review comment: Document the truncate_and_pad manager behavior.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Raise an exception if return_offsets_mapping is not available with the current tokenizer.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Ensure padding is set on the tokenizers before setting any padding strategy + unittest.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* On pytorch we need to stack tensors to get a proper new axis.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Generalize tests to different frameworks, removing hard-coded return_tensors="..."

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bump tokenizer dependency for num_special_tokens_to_add

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Overflowing tokens in batch_encode_plus are now stacked over the batch axis.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Improved error message for padding strategy without pad token.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Bumping tokenizers dependency to 0.5.0 for release.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Optimizing convert_encoding around 4x improvement. 🚀

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* expose pad_to_max_length in encode_plus to avoid duplicating the parameters in kwargs

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Generate a proper overflow_to_sampling_mapping when return_overflowing_tokens is True.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Fix unittests for overflow_to_sampling_mapping not being returned as tensor.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Format & quality.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Remove perfect alignment constraint for Roberta (allowing 1% difference max)

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>

* Triggering final CI

Co-authored-by: MOI Anthony <xn1t0x@gmail.com>
2020-02-19 11:35:40 -05:00
ffb93ec0cc Create README.md 2020-02-19 10:51:16 -05:00
20fc18fbda Skip flaky test_tf_question_answering (#2845)
* Skip flaky test

* Style
2020-02-18 16:14:50 -05:00
2ae98336d1 fix vocab size in binarized_data (distil): int16 vs int32 2020-02-18 16:17:35 +00:00
0dbddba6d2 fix typo in hans example call 2020-02-17 20:19:57 +00:00
29ab4b7f40 Create README.md 2020-02-17 10:58:43 -05:00
c88ed74ccf [model_cards] 🇹🇷 Add new (cased) BERTurk model 2020-02-17 09:54:46 -05:00
5b2d4f2657 Merge pull request #2881 from patrickvonplaten/add_vim_swp_to_gitignore
update .gitignore to ignore .swp files created when using vim
2020-02-17 14:36:49 +01:00
fb4d8d0832 update .gitignore to ignore .swp files created when using vim 2020-02-17 14:26:32 +01:00
6083c1566e Update README.md
I trained the model for more epochs, which improved the results. This commit updates the results of the model and adds a gif showing it used with **transformers/pipelines**
2020-02-16 10:09:34 -05:00
73028c5df0 [model_cards] EsperBERTo 2020-02-14 15:16:33 -05:00
81fb8d3251 Update model card: new performance chart (#2864)
* Update model performance for correct German conll03 dataset

* Adjust text

* Adjust line spacing
2020-02-14 13:39:23 -05:00
4e69104a1f [model_cards] Also use the thumbnail as meta
Co-Authored-By: Ilias Chalkidis <ihalk@di.uoa.gr>
2020-02-14 10:27:11 -05:00
73d79d42b4 [model_cards] nlptown/bert-base-multilingual-uncased-sentiment
cc @yvespeirsman

Co-Authored-By: Yves Peirsman <yvespeirsman@users.noreply.github.com>
2020-02-14 09:51:11 -05:00
47b735f994 Added model card for bert-base-multilingual-uncased-sentiment (#2859)
* Created model card for nlptown/bert-base-multilingual-sentiment

* Delete model card

* Created model card for bert-base-multilingual-uncased-sentiment as README
2020-02-14 09:31:15 -05:00
7d22fefd37 [pipeline] Alias NerPipeline as TokenClassificationPipeline 2020-02-14 09:18:10 -05:00
61a2b7dc9d Fix typo 2020-02-14 09:13:07 -05:00
6e261d3a22 Fix typos 2020-02-14 09:11:07 -05:00
4e597c8e4d Fix typo 2020-02-14 09:07:42 -05:00
925a13ced1 [model_cards] mv README.md 2020-02-13 23:07:29 -05:00
575a3b7aa1 Create distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es.md 2020-02-13 23:04:52 -05:00
4d36472b96 [run_ner] Don't crash if fine-tuning local model that doesn't end with digit 2020-02-14 03:25:29 +00:00
8514018300 Update with additional information
Added a "Pre-training details" section
2020-02-13 21:54:42 -05:00
1eec69a900 Create README.md 2020-02-13 19:27:22 -05:00
8744402f1e add model_card flaubert-base-uncased-squad (#2833)
* add model_card

* Add tag

cc @fmikaelian

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-02-13 17:19:13 -05:00
7f98edd7e3 Model card: Literary German BERT (#2843)
* feat: create model card

* chore: add description

* feat: stats plot

* Delete prosa-jahre.svg

* feat: years plot (again)

* chore: add more details

* fix: typos

* feat: kfold plot

* feat: kfold plot

* Rename model_cards/severinsimmler/literary-german-bert.md to model_cards/severinsimmler/literary-german-bert/README.md

* Support for linked images + add tags

cc @severinsimmler

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-02-13 15:43:44 -05:00
f1e8a51f08 Preserve spaces in GPT-2 tokenizers (#2778)
* Preserve spaces in GPT-2 tokenizers

Preserves spaces after special tokens in GPT-2 and inherited (RoBERTa)
tokenizers, enabling correct BPE encoding. Automatically inserts a space
in front of the first token in the encode function when adding special tokens.

* Add tokenization preprocessing method

* Add framework argument to pipeline factory

Also fixes pipeline test issue. Each test input now treated as a
distinct sequence.
2020-02-13 13:29:43 -05:00
0ed630f139 Attempt to increase timeout for circleci slow tests (#2844) 2020-02-13 09:11:03 -05:00
ef74b0f07a get_activation('relu') provides a simple mapping from strings i… (#2807)
* activations.py contains a mapping from string to activation function
* resolves some `gelu` vs `gelu_new` ambiguity
2020-02-13 08:28:33 -05:00
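A short usage example of the mapping this commit introduces (activation names as exposed by `transformers.activations`):

```python
import torch
from transformers.activations import get_activation

gelu = get_activation("gelu")          # exact, erf-based GELU
gelu_new = get_activation("gelu_new")  # tanh approximation (GPT-2 style)

x = torch.linspace(-3, 3, steps=7)
print((gelu(x) - gelu_new(x)).abs().max())  # small but nonzero difference
```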
f54a5bd37f Raise error when using an mlm flag for a clm model + correct TextDataset 2020-02-12 13:23:14 -05:00
569897ce2c Fix a few issues regarding the language modeling script 2020-02-12 13:23:14 -05:00
21da895013 [model_cards] Better image for social sharing 2020-02-11 20:30:08 -05:00
9a70910d47 [model_cards] Tweak @mrm8488's model card 2020-02-11 20:20:39 -05:00
9274734a0d [model_cards] mv to correct location + tweak tag 2020-02-11 20:13:57 -05:00
69f948461f Create bert-base-spanish-wwm-cased-finetuned-spa-squad2-es.md 2020-02-11 20:07:15 -05:00
e0b6247cf7 [model_cards] Change formatting slightly as we updated our markdown engine
cc @tholor @loretoparisi @simonefrancia
2020-02-11 18:25:21 -05:00
5f2dd71d1b Smaller diff 2020-02-11 17:20:09 -05:00
31158af57c formatting 2020-02-11 17:20:09 -05:00
5dd61fb9a9 Add more specific testing advice to Contributing.md 2020-02-11 17:20:09 -05:00
ee5de0ba44 BERT decoder: Fix causal mask dtype.
PyTorch < 1.3 requires multiplication operands to be of the same type.
This was violated when using the default attention mask (i.e.,
attention_mask=None in the arguments) with BERT in decoder mode.

In particular, this was breaking Model2Model and made the quickstart
tutorial fail.
2020-02-11 15:19:22 -05:00
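A sketch of the dtype pitfall being fixed, assuming nothing beyond plain PyTorch:

```python
import torch

seq_len = 4
scores = torch.randn(1, seq_len, seq_len)  # attention scores

# Build the lower-triangular causal mask in the same floating dtype as
# the scores; a uint8/bool mask times a float tensor fails on PyTorch < 1.3.
causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=scores.dtype))
masked_scores = scores * causal_mask
```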
bed38d3afe Fix typo in src/transformers/data/processors/squad.py 2020-02-11 11:22:24 -05:00
498d06e914 [model_cards] Add new German Europeana BERT models (#2805)
* [model_cards] New German Europeana BERT models from dbmdz

* [model_cards] Update German Europeana BERT models from dbmdz
2020-02-11 10:49:39 -05:00
3e3a9e2c01 Merge pull request #2793 from huggingface/tensorflow-210-circleci-fix
Fix circleci cuInit error on Tensorflow >= 2.1.0.
2020-02-11 10:48:42 +00:00
1f5db9a13c [model_cards] Rm extraneous tag 2020-02-10 17:45:13 -05:00
95bac8dabb [model_cards] Add language metadata to existing model cards
This will enable filtering on language (amongst other tags) on the website

cc @loretoparisi, @stefan-it, @HenrykBorzymowski, @marma
2020-02-10 17:42:42 -05:00
ba498eac38 Create README.md (#2785)
* Create README.md

* Update README.md

* Update README.md

* Update README.md

* [model_cards] Use code fences for consistency

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-02-10 17:27:59 -05:00
68ccc04ee6 Add model readme for deepset/roberta-base-squad2 (#2797)
* Add readme for deepset/roberta-base-squad2

* update model readme
2020-02-10 15:21:48 -05:00
539f601be7 intermediate_size > hidden_dim in distilbert config docstrings 2020-02-10 13:45:57 -05:00
cfb7d108bd FlauBERT lang embeddings only when n_langs > 1 2020-02-10 13:24:04 -05:00
b4691a438d [model_cards] BERT-of-Theseus: use the visual as thumbnail
cc @jetrunner

Co-Authored-By: Kevin Canwen Xu <canwenxu@outlook.com>
2020-02-10 11:27:08 -05:00
fc325e97cd [model_cards] Showcase model tag syntax 2020-02-10 11:27:08 -05:00
fd639e5be3 Correct quickstart example when using the past 2020-02-10 11:25:56 -05:00
63a5399bc4 [model_cards] Specify language meta + thumbnail
cc @tholor

see #2799
2020-02-10 11:20:05 -05:00
125a75a121 Correctly compute tokens when padding on the left 2020-02-10 10:47:42 -05:00
9c64d1da35 Add model readme for bert-base-german-cased (#2799)
* add readme for bert-base-german-cased

* update readme
2020-02-10 10:27:29 -05:00
bf99014c46 Create BERT-of-Theseus model card 2020-02-10 09:58:40 -05:00
92e974196f Merge pull request #2765 from huggingface/extract-cached-archives
Add option to `cached_path` to automatically extract archives
2020-02-10 14:05:16 +01:00
6aa7973aec Fix circleci cuInit error on Tensorflow >= 2.1.0.
Tensorflow 2.1.0 introduces a new dependency model where pip install tensorflow installs tf with GPU support.
Before, it would install with CPU support only; thus CircleCI looks for the NVidia driver version when initializing the
tensorflow-related tests but fails as there is no NVidia driver running.

Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-02-10 13:24:37 +01:00
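The fix itself lives in the CI configuration; purely as an illustration, hiding CUDA devices before TensorFlow loads avoids the cuInit call on a CPU-only host.

```python
import os

# Hide all CUDA devices so TensorFlow never attempts cuInit on a CPU-only
# machine. This must run before `import tensorflow`.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import tensorflow as tf  # noqa: E402

print(tf.config.list_physical_devices("GPU"))  # -> []
```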
520e7f2119 Correct docstring for xlnet 2020-02-07 16:42:35 -05:00
dd28830327 Update RoBERTa tips 2020-02-07 16:42:35 -05:00
db97930122 Update XLM-R tips 2020-02-07 16:42:35 -05:00
7046de2991 E231 2020-02-07 15:28:13 -05:00
0d3aa3c04c styling 2020-02-07 15:28:13 -05:00
d8b43600fd omission 2020-02-07 15:28:13 -05:00
ee5a6856ca distilbert-base-cased weights + Readmes + omissions 2020-02-07 15:28:13 -05:00
73368963b2 Fix importing unofficial TF models with extra optimizer weights 2020-02-07 10:25:31 -05:00
d7dabfeff5 Fix documentation in ProjectedAdaptiveLogSoftmax 2020-02-07 10:14:58 -05:00
42f08e596f [examples] rename run_lm_finetuning to run_language_modeling 2020-02-07 09:15:28 -05:00
4f7bdb0958 [examples] Fix broken markdown 2020-02-07 09:15:28 -05:00
c6c5c3fd4e style and quality 2020-02-07 08:58:06 +01:00
961c69776f @julien-c proposal for TF/PT compat in hf_buckets 2020-02-07 08:53:17 +01:00
d311f87bca cleanup 2020-02-07 00:05:28 +01:00
7d99e05f76 file_cache has options to extract archives 2020-02-07 00:03:12 +01:00
2c12464a20 Changed vocabulary save function. Variable name was inconsistent, causing an error to be thrown when passing a file name instead of a directory. 2020-02-06 16:40:07 -05:00
6fc3d34abd Fix multi-gpu evaluation in run_glue.py 2020-02-06 16:38:55 -05:00
7748cbbe7d Oopsie 2020-02-06 15:30:02 -05:00
432c12521e [docs] Add menu w/ links to other pages on hf.co 2020-02-06 15:30:02 -05:00
c069932f5d Add contributors snapshot
powered by https://github.com/sourcerer-io/hall-of-fame
2020-02-06 15:25:47 -05:00
33d3072e1c Arxiv README (#2747)
* Arxiv README

* ArXiv-NLP readme
2020-02-05 15:26:28 -05:00
eae8ee0389 [doc] model sharing: mention README.md + tweaks
cc @lysandrejik @thomwolf
2020-02-05 14:20:03 -05:00
6bb6a01765 Fix GPT2 config set to trainable
The config being set to trainable prevented the model from being saved, and who knows
what else.
2020-02-05 13:55:41 -05:00
ada24def22 [run_lm_finetuning] Tweak fix for non-long tensor, close #2728
see 1ebfeb79469d544a2bd817aa32c77e0514485ff9 and #2728

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2020-02-05 12:49:18 -05:00
2184f87003 RoBERTa TensorFlow Tests 2020-02-04 18:05:35 -05:00
e615269cb8 Correct slow test 2020-02-04 18:05:35 -05:00
5f96ebc0be Style 2020-02-04 18:05:35 -05:00
950c6a4f09 Flaubert PyTorch tests 2020-02-04 18:05:35 -05:00
d28b81dc29 RoBERTa Pytorch tests 2020-02-04 18:05:35 -05:00
d1ab1fab1b pass langs parameter to certain XLM models (#2734)
* pass langs parameter to certain XLM models

Adding an argument that specifies the language the SQuAD dataset is in so language-sensitive XLMs (e.g. `xlm-mlm-tlm-xnli15-1024`) don't default to language `0`.
Allows resolution of issue #1799.

* fixing from `make style`

* fixing style (again)
2020-02-04 17:12:42 -05:00
9e5b549b4d fix default getattr 2020-02-04 16:38:52 -05:00
25848a6094 double quotes 2020-02-04 16:38:52 -05:00
cbcb83f21d minor cleanup of test_attention_outputs 2020-02-04 16:38:52 -05:00
3bf5417258 Revert erroneous fix 2020-02-04 16:31:07 -05:00
1ebfeb7946 Cast to long when masking tokens 2020-02-04 15:56:16 -05:00
9c67196b83 Update quickstart 2020-02-04 11:11:37 -05:00
90ab15cb7a Remove redundant hidden states 2020-02-04 10:59:32 -05:00
9a50828b5c Pipelines: fix crash when modelcard is None
cc @mfuntowicz does this seem correct?
2020-02-03 17:53:39 -05:00
6c1b23554f Sample instead of greedy decoding by default in generate 2020-02-03 17:23:53 -05:00
239dd23f64 [Follow up 213]
Masked indices should have -1 and not -100. Updating documentation + scripts that were forgotten
2020-02-03 16:08:05 -05:00
522c5b5533 Added README.md to Swedish BERT models from National Library of Sweden 2020-02-03 09:09:34 -05:00
9329e59700 Add READMEs to Tensorflow versions of CamemBERT and XLM-RoBERTa 2020-02-03 09:04:34 -05:00
2ba147ecff Fix typo in examples/utils_ner.py
"%s-%d".format() -> "{}-{}".format()
2020-02-01 11:10:57 -05:00
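The bug is easy to demonstrate: `str.format()` ignores %-style placeholders, so the old code produced the literal template instead of the label.

```python
# Buggy: .format() leaves %-style placeholders untouched.
assert "%s-%d".format("dev", 1) == "%s-%d"

# Fixed: {}-style placeholders work with .format().
assert "{}-{}".format("dev", 1) == "dev-1"
```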
9773e5e0d9 CLI script to gather environment info (#2699)
* add "info" command to CLI

As a convenience, add the info directive to CLI. Running `python transformers-cli info` will return a string containing the transformers version, platform, python version, PT/TF version and GPU support

* Swap f-strings for .format

Still supporting 3.5 so can't use f-strings (sad face)

* Add reference in issue to CLI

* Add the expected fields to issue template

This way, people can still add the information manually if they want. (Though I fear they'll just ignore it.)

* Remove heading from output

* black-ify

* order of imports

Should ensure isort test passes

* use is_X_available over import..pass

* style

* fix copy-paste bug

* Rename command info -> env

Also adds the command to CONTRIBUTING.md in "Did you find a bug" section
2020-02-01 10:38:14 -05:00
ddb6f9476b [model_cards] dbmdz models
Co-Authored-By: Stefan Schweter <stefan-it@users.noreply.github.com>
2020-01-31 18:39:09 -05:00
6636826f04 [model_cards] Multilingual + Dutch SQuAD2.0
Co-Authored-By: HenrykBorzymowski <henrykborzymowski@users.noreply.github.com>
2020-01-31 18:39:09 -05:00
98dadc98e1 [model_cards] UmBERTo
Co-Authored-By: Loreto Parisi <loretoparisi@gmail.com>
Co-Authored-By: Simone Francia <francia.simone1@gmail.com>
2020-01-31 18:39:09 -05:00
d6fc34b459 [model_cards] add mine 2020-01-31 18:39:09 -05:00
d426b58b9e Patch: v2.4.1 2020-01-31 14:55:33 -05:00
1e82cd8457 Flaubert auto tokenizer + tests
cc @julien-c
2020-01-31 14:16:52 -05:00
d18d47be67 run_generation style 2020-01-31 12:05:48 -05:00
ff6f1492e8 FlauBERT load in AutoModel
The FlauBERT configuration class inherits from XLMConfig, so it is recognized as an XLM configuration when loading from AutoModels, because XLMConfig is checked before FlaubertConfig.

Changing the order solves this problem, but a test should be added.
2020-01-31 12:05:15 -05:00
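A minimal sketch of the ordering problem, with simplified stand-in classes rather than the real dispatch code:

```python
class XLMConfig: ...
class FlaubertConfig(XLMConfig): ...

def model_name_for(config) -> str:
    # The subclass must be tested before its parent: a FlaubertConfig
    # is-a XLMConfig, so the reverse order would always match XLM first.
    if isinstance(config, FlaubertConfig):
        return "FlaubertModel"
    if isinstance(config, XLMConfig):
        return "XLMModel"
    raise ValueError("unknown config")

assert model_name_for(FlaubertConfig()) == "FlaubertModel"
```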
7365f01d43 do_sample should be set to True in run_generation.py 2020-01-31 11:49:32 -05:00
3a21d6da6b Typo on markdown link in README.md 2020-01-31 10:58:49 -05:00
0aa40e9569 v2.4.0 documentation 2020-01-31 09:55:34 -05:00
8036ceb7c5 Update commands for pypi test 2020-01-31 09:48:15 -05:00
6664ea943d Release: v2.4.0 2020-01-31 09:40:32 -05:00
5a6b138b00 [Umberto] model shortcuts (#2661)
* [Umberto] model shortcuts

cc @loretoparisi @simonefrancia

see #2485

* Ensure that tokenizers will be correctly configured
2020-01-30 21:05:53 -05:00
7fe294bf07 Hotfix: same handling of non-existent files as for config 2020-01-30 20:05:04 -05:00
b85c59f997 config.architectures 2020-01-30 19:26:59 -05:00
f9bc3f5771 style tweak 2020-01-30 19:26:59 -05:00
0b13fb822a No need for a model_type here
cc @lysandrejik
2020-01-30 19:26:59 -05:00
71a382319f Correct documentation 2020-01-30 18:41:24 -05:00
01a14ebd8d Add FlauBERT to automodels 2020-01-30 18:40:22 -05:00
9fa836a73f fill_mask helper (#2576)
* fill_mask helper

* [poc] FillMaskPipeline

* Revert "[poc] FillMaskPipeline"

This reverts commit 67eeea55b0f97b46c2b828de0f4ee97d87338335.

* Revert "fill_mask helper"

This reverts commit cacc17b884e14bb6b07989110ffe884ad9e36eaa.

* README: clarify that Pipelines can also do text-classification

cf. question at the AI&ML meetup last week, @mfuntowicz

* Fix test: test feature-extraction pipeline

* Test tweaks

* Slight refactor of existing pipeline (in preparation of new FillMaskPipeline)

* Extraneous doc

* More robust way of doing this

@mfuntowicz as we don't rely on the model name anymore (see AutoConfig)

* Also add RobertaConfig as a quickfix for wrong token_type_ids

* cs

* [BIG] FillMaskPipeline
2020-01-30 18:15:42 -05:00
b43cb09aaa Add layerdrop 2020-01-30 12:05:01 -05:00
df27648bd9 Rename test_examples to test_doc_samples 2020-01-30 10:07:22 -05:00
93dccf527b Pretrained models 2020-01-30 10:04:18 -05:00
90787fed81 Style 2020-01-30 10:04:18 -05:00
73306d028b FlauBERT documentation 2020-01-30 10:04:18 -05:00
ce2f4227ab Fix failing FlauBERT test 2020-01-30 10:04:18 -05:00
f0a4fc6cd6 Add Flaubert 2020-01-30 10:04:18 -05:00
a5381495e6 Added classifier dropout rate in ALBERT 2020-01-30 09:52:34 -05:00
83446a88d9 Use _pad_token instead of pad_token_id
Requesting pad_token_id would cause an error message when it is None. Use private _pad_token instead.
2020-01-29 17:44:58 -05:00
9fde13a3ac Add check to verify existence of pad_token_id
In batch_encode_plus we have to ensure that the tokenizer has a pad_token_id so that, when padding, no None values are added as padding. That would happen with gpt2, openai, transfoxl.

closes https://github.com/huggingface/transformers/issues/2640
2020-01-29 17:44:58 -05:00
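A sketch of the guard these two commits describe (hypothetical helper name; the real check lives in batch_encode_plus):

```python
def ensure_pad_token(tokenizer) -> None:
    # GPT-2, OpenAI GPT and Transfo-XL ship without a pad token, so padding
    # would otherwise insert None ids silently. The private _pad_token is
    # checked because pad_token_id itself logs an error when it is None.
    if tokenizer._pad_token is None:
        raise ValueError(
            "Tokenizer has no padding token; add one, e.g. "
            "tokenizer.add_special_tokens({'pad_token': '[PAD]'})."
        )
```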
e63a81dd25 Style 2020-01-29 16:29:20 -05:00
217349016a Copy object instead of passing the reference 2020-01-29 16:15:39 -05:00
adb8c93134 Remove lines causing a KeyError 2020-01-29 14:01:16 -05:00
c69b082601 Update documentation 2020-01-29 12:06:13 -05:00
ca1d66734d Apply quality and style requirements once again 2020-01-29 12:06:13 -05:00
5e3c72842d bugfix on model name 2020-01-29 12:06:13 -05:00
0731fa1587 Apply quality and style requirements 2020-01-29 12:06:13 -05:00
a3998e76ae Add TF2 CamemBERT model 2020-01-29 12:06:13 -05:00
b5625f131d Style 2020-01-29 11:47:49 -05:00
44a5b4bbe7 Update documentation 2020-01-29 11:47:49 -05:00
7fc628d98e Apply style 2020-01-29 11:47:49 -05:00
64ca855617 Add TF2 XLM-RoBERTa model 2020-01-29 11:47:49 -05:00
9d87eafd11 Streamlining
- mostly stylistic streamlining
- removed 'additional context' sections. They seem to be rarely used and might cause confusion. If more details are needed, users can add them to the 'details' section
2020-01-28 10:41:10 -05:00
a3b3638f6f phrasing 2020-01-28 10:41:10 -05:00
c96ca70f25 Update ---new-benchmark.md 2020-01-28 10:41:10 -05:00
7b5eda32bb Update --new-model-addition.md
Motivate users to @-tag authors of models to increase visibility and expand the community
2020-01-28 10:41:10 -05:00
c63d91dd1c Update bug-report.md
- change references to pytorch-transformers to transformers
- link to code formatting guidelines
2020-01-28 10:41:10 -05:00
b2907cd06e Update feature-request.md
- add 'your contribution' section
- add code formatting link to 'additional context'
2020-01-28 10:41:10 -05:00
2fec88ee02 Update question-help.md
Prefer that general questions are asked on Stack Overflow
2020-01-28 10:41:10 -05:00
7e03d2bd7c update migration guide
Streamlines usages of pytorch-transformers and pytorch-pretrained-bert. Add link to the README for the migration guide.
2020-01-28 10:41:10 -05:00
335dd5e68a Default save steps 50 to 500 in all scripts 2020-01-28 09:42:11 -05:00
ea2600bd5f Absolute definitive HeisenDistilBug solve
cc @julien-c @thomwolf
2020-01-27 21:58:36 -05:00
5c3d441ee1 Fix formatting 2020-01-27 21:00:34 -05:00
f5a236c3ca Add Dutch pre-trained BERT model 2020-01-27 21:00:34 -05:00
6b4c3ee234 [run_lm_finetuning] GPT2 tokenizer doesn't have a pad_token
ping @lysandrejik
2020-01-27 20:14:02 -05:00
79815bf666 [serving] Fix typo 2020-01-27 19:58:25 -05:00
5004d5af42 [serving] Update dependencies 2020-01-27 19:58:00 -05:00
9ca21c838b Style 2020-01-27 14:49:12 -05:00
e0849a66ac adding in the doc 2020-01-27 14:27:07 -05:00
6b081f04e6 style and quality 2020-01-27 14:27:07 -05:00
0e31e06a75 Add AutoModelForPreTraining 2020-01-27 14:27:07 -05:00
ea56d305be make style 2020-01-27 12:13:32 -05:00
d440e21f5b add mapping of roberta for QA 2020-01-27 12:12:46 -05:00
875c4ae48f Definitive HeisenDistilBug fix
cc @julien-c @thomwolf
2020-01-27 12:09:58 -05:00
f09f42d4d3 Input Embeddings should be assigned
cc @julien-c
2020-01-27 11:46:00 -05:00
bac51fba3a Fix token_type_ids for XLM-R 2020-01-27 11:08:31 -05:00
babd41e7fa Code quality 2020-01-24 17:06:55 -05:00
974d083c7b Accurate model for configuration 2020-01-24 16:46:03 -05:00
983fef469c AutoModels doc 2020-01-24 16:37:30 -05:00
009fcb0ec1 Configuration utils 2020-01-24 16:37:30 -05:00
11b13e94a3 Add type to help my IDE out 2020-01-24 14:00:57 -05:00
1ce3fb5cc7 update correct eval metrics (distilbert & co) 2020-01-24 11:45:22 -05:00
62f5804608 Update the doc string for T5WithLMHeadModel
T5WithLMHeadModel's doc string claims that indices of -1 are
ignored while computing the cross-entropy loss in the forward
pass; however, indices of -1 throw an error while indices of -100
are ignored. This commit updates the doc string to be consistent
with the class's behavior.
2020-01-24 10:28:20 -05:00
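For reference, a toy check of the behavior the updated docstring describes (PyTorch's cross-entropy defaults to ignore_index=-100, which is why -1 raises instead of being skipped):

```python
import torch
import torch.nn.functional as F

logits = torch.randn(4, 10)             # (batch, vocab)
labels = torch.tensor([1, 3, -100, 5])  # the -100 entry is ignored
loss = F.cross_entropy(logits, labels)  # ignore_index defaults to -100
```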
908230d261 Pickle CamemBERT tokenizer 2020-01-24 10:08:59 -05:00
24d5ad1dcc Run the examples in slow 2020-01-23 09:38:45 -05:00
9ddf60b694 Tips + whitespaces 2020-01-23 09:38:45 -05:00
0e9899f451 Fixes 2020-01-23 09:38:45 -05:00
48ac24020d TF CTRL 2020-01-23 09:38:45 -05:00
7511f3dd89 PyTorch CTRL + Style 2020-01-23 09:38:45 -05:00
980211a63a XLM-RoBERTa 2020-01-23 09:38:45 -05:00
6bc966793a TF DistilBERT 2020-01-23 09:38:45 -05:00
db1a7f27a1 PyTorch DistilBERT 2020-01-23 09:38:45 -05:00
b28020f590 TF RoBERTa 2020-01-23 09:38:45 -05:00
3e1bc27e1b Pytorch RoBERTa 2020-01-23 09:38:45 -05:00
f44ff574d3 Camembert 2020-01-23 09:38:45 -05:00
264eb23912 TF XLM 2020-01-23 09:38:45 -05:00
ccebcae75f PyTorch XLM 2020-01-23 09:38:45 -05:00
92b3cb786d TF XLNet 2020-01-23 09:38:45 -05:00
cd656fb21a PyTorch XLNet 2020-01-23 09:38:45 -05:00
83fa8d9fb5 TF Transformer-XL 2020-01-23 09:38:45 -05:00
98edad418e PyTorch Transformer-XL 2020-01-23 09:38:45 -05:00
96d21ad06b TF OpenAI GPT 2020-01-23 09:38:45 -05:00
850795c487 Pytorch GPT 2020-01-23 09:38:45 -05:00
1487b840d3 TF GPT2 2020-01-23 09:38:45 -05:00
bd0d3fd76e GPT-2 PyTorch models + better tips for BERT 2020-01-23 09:38:45 -05:00
dbeb7fb4e6 BERT TensorFlow 2020-01-23 09:38:45 -05:00
cd77c750c5 BERT PyTorch models 2020-01-23 09:38:45 -05:00
3922a2497e TF ALBERT + TF Utilities + Fix warnings 2020-01-23 09:38:45 -05:00
00df3d4de0 ALBERT Modeling + required changes to utilities 2020-01-23 09:38:45 -05:00
f81b6c95f2 Flake8 violation 2020-01-23 09:38:45 -05:00
632675ea88 Can test examples spread over multiple blocks 2020-01-23 09:38:45 -05:00
eaa6b9afc6 Require Torch when testing examples 2020-01-23 09:38:45 -05:00
9bab9b83d2 Glossary 2020-01-23 09:38:45 -05:00
64abd3e0aa Multi-line examples can be tested + ALBERT patch for CircleCI
All tests should now work fine.
2020-01-23 09:38:45 -05:00
837577256b Automatic testing of examples
The CircleCI test should fail.
2020-01-23 09:38:45 -05:00
90b7df444f Upload CLI: on win32, use slashes, not os.sep 2020-01-22 22:41:21 -05:00
119dc50e2a Doc tweak on model sharing 2020-01-22 22:40:38 -05:00
34a3c25a30 Fix for XLMRobertaConfig inherits from RobertaConfig
hat/tip @stefan-it
2020-01-22 17:50:24 -05:00
1a8e87be4e Line-by-line text dataset (including padding) 2020-01-21 16:57:38 -05:00
b94cf7faac change order 2020-01-21 16:57:38 -05:00
2eaa8b6e56 Easier to not support this, as it could be confusing
cc @lysandrejik
2020-01-21 16:57:38 -05:00
801aaa5508 make style 2020-01-21 16:57:38 -05:00
56d4ba8ddb [run_lm_finetuning] Train from scratch 2020-01-21 16:57:38 -05:00
c7f79815e7 Cleanup unused variables 2020-01-21 11:40:24 -05:00
15579e2d55 [SQuAD v2] Code quality 2020-01-21 11:36:46 -05:00
088fa7b759 Correct segment ID for XLNet single sequence 2020-01-21 11:33:45 -05:00
073219b43f Manage impossible examples SQuAD v2 2020-01-21 11:24:43 -05:00
983c484fa2 add __getstate__ and __setstate__ to XLMRobertaTokenizer 2020-01-21 10:18:24 -05:00
cefd51c50c Fix glue processor failing on tf datasets 2020-01-20 11:46:43 -05:00
ca6ce3040d Fix style 2020-01-20 10:56:23 -05:00
908cd5ea27 Make forward asynchronous to avoid long computations timing out.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-20 10:56:23 -05:00
6e6c8c52ed Fix bad handling of env variable USE_TF / USE_TORCH leading to invalid framework being used.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-20 10:56:23 -05:00
23c6998bf4 Add lower bound to tqdm for tqdm.auto
- It appears that `tqdm` only introduced `tqdm.auto` in 4.27.
- See https://github.com/tqdm/tqdm/releases/tag/v4.27.0.
- Without the lower bound I received the following stack trace in an environment where I already had tqdm installed:
```
  File "/home/brendanr/anaconda3/envs/allennlp/lib/python3.6/site-packages/transformers/__init__.py", line 20, in <module>
    from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
  File "/home/brendanr/anaconda3/envs/allennlp/lib/python3.6/site-packages/transformers/file_utils.py", line 24, in <module>
    from tqdm.auto import tqdm
ModuleNotFoundError: No module named 'tqdm.auto'
```
2020-01-17 18:29:11 -05:00
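The commit adds the lower bound (tqdm >= 4.27); purely as an illustration, a guarded import would be another way to tolerate older tqdm:

```python
try:
    from tqdm.auto import tqdm  # tqdm.auto appeared in tqdm 4.27
except ImportError:
    from tqdm import tqdm  # older tqdm: fall back to the plain progress bar
```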
65a89a8976 Fix BasicTokenizer to respect never_split parameters (#2557)
* add failing test

* fix call to _run_split_on_punc

* format with black
2020-01-17 14:57:56 -05:00
6d5049a24d Fix typo in examples/run_squad.py
Rul -> Run
2020-01-17 11:22:51 -05:00
23a2cea8cb Tokenizer.from_pretrained: fetch all possible files remotely 2020-01-16 16:47:19 -05:00
99f9243de5 same here, try to not serialize too much if unneeded 2020-01-16 16:47:19 -05:00
9d8fd2d40e tokenizer.save_pretrained: only save file if non-empty 2020-01-16 16:47:19 -05:00
6e2c28a14a Run SQuAD warning when the doc stride may be too high 2020-01-16 13:59:26 -05:00
b8f43cb273 Merge pull request #2239 from ns-moosavi/HANS-evaluation-example
HANS evaluation
2020-01-16 13:28:25 +01:00
258ed2eaa8 adding details in readme 2020-01-16 13:21:30 +01:00
50ee59578d update formating - make flake8 happy 2020-01-16 13:21:30 +01:00
1c9333584a formating 2020-01-16 13:21:30 +01:00
e25b6fe354 updating readme 2020-01-16 13:21:30 +01:00
27c7b99015 adding details in readme - moving file 2020-01-16 13:21:30 +01:00
99d4515572 HANS evaluation 2020-01-16 13:21:30 +01:00
dc17f2a111 Merge pull request #2538 from huggingface/py3_super
💄 super
2020-01-16 13:17:15 +01:00
880854846b Merge pull request #2540 from huggingface/torch14_fix
[PyTorch 1.4] Fix failing torchscript test for xlnet
2020-01-16 13:16:59 +01:00
d9fa1bad72 Fix failing torchscript test for xlnet
model.parameters() order is apparently not stable (only for xlnet, for some reason)
2020-01-15 20:22:21 -05:00
a98b2ca8c0 Style + fixup BertJapaneseTokenizer 2020-01-15 19:05:51 -05:00
83a41d39b3 💄 super 2020-01-15 18:33:50 -05:00
cd51893d37 Merge branch 'Rexhaif-patch-1' 2020-01-15 18:25:15 -05:00
248aeaa842 Merge branch 'patch-1' of https://github.com/Rexhaif/transformers into Rexhaif-patch-1 2020-01-15 18:22:01 -05:00
c76c3cebed Add check for token_type_ids before tensorizing
Fix an issue where `prepare_for_model()` gives a `KeyError` when
`return_token_type_ids` is set to `False` and `return_tensors` is
enabled.
2020-01-15 12:31:43 -05:00
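A sketch of the guard (simplified; the real code lives in prepare_for_model):

```python
import torch

encoded = {"input_ids": [101, 2023, 2003, 102]}  # token_type_ids not requested

# Only tensorize keys that are actually present, instead of indexing
# encoded["token_type_ids"] unconditionally, which raised the KeyError.
for key in ("input_ids", "token_type_ids", "attention_mask"):
    if key in encoded:
        encoded[key] = torch.tensor([encoded[key]])
```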
eb59e9f705 Graduate sst-2 to a canonical one 2020-01-15 16:28:50 +00:00
e184ad13cf Close #2392 2020-01-15 15:43:44 +00:00
dfe012ad9d Fix misleading RoBERTa token type ids 2020-01-14 17:47:28 -05:00
c024ab98df Improve padding side documentation 2020-01-14 17:44:23 -05:00
9aeb0b9b8a Improve padding side documentation 2020-01-14 17:43:00 -05:00
715fa638a7 Merge branch 'master' into from_scratch_training 2020-01-14 18:58:21 +00:00
100e3b6f21 Bias should be resized with the weights
Created a link between the linear layer bias and the model attribute bias. This does not change anything for the user nor for the conversion scripts, but allows the `resize_token_embeddings` method to resize the bias as well as the weights of the decoder.

Added a test.
2020-01-14 13:43:45 -05:00
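A minimal sketch of the linking trick (illustrative class, not the actual model code):

```python
import torch
from torch import nn

class LMHead(nn.Module):
    def __init__(self, hidden_size: int, vocab_size: int):
        super().__init__()
        self.decoder = nn.Linear(hidden_size, vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(vocab_size))
        # Link the standalone bias to the linear layer, so resizing the
        # decoder's output embedding also resizes the bias.
        self.decoder.bias = self.bias
```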
6c32d8bb95 Size > Dimensionality + Remove final TODOs 2020-01-14 14:09:09 +01:00
760164d63b RoBERTa example 2020-01-14 14:09:09 +01:00
387217bd3e Added example usage 2020-01-14 14:09:09 +01:00
7d1bb7f256 Add missing XLNet and XLM models 2020-01-14 14:09:09 +01:00
a1cb100460 Wrap up configurations 2020-01-14 14:09:09 +01:00
c11b6fd393 Update links in all configurations 2020-01-14 14:09:09 +01:00
632682726f Updated Configurations 2020-01-14 14:09:09 +01:00
2b566c182e Merge pull request #2384 from dimagalat/master
Releasing file lock
2020-01-14 13:19:01 +01:00
764f836d52 Update test_tokenization_auto.py 2020-01-13 22:50:34 -05:00
d5831acb07 Update test_tokenization_auto.py 2020-01-13 22:47:33 -05:00
ed6cd597cc Update test_tokenization_auto.py 2020-01-13 22:46:35 -05:00
5cb463a714 Update test_tokenization_auto.py 2020-01-13 22:38:29 -05:00
afc24ea5d4 In a parallel setup this could fail 2020-01-13 23:44:08 +00:00
894812c652 Fixup mapping 2020-01-13 23:34:19 +00:00
b20f11d4ca 🔫 Python35 2020-01-13 23:20:44 +00:00
0304628590 Map configs to models and tokenizers 2020-01-13 23:11:44 +00:00
1fc855e456 [tests] Safety checks on CONFIG_MAPPING 2020-01-13 21:52:55 +00:00
3c86b6f3c5 Py35 doesn't like inline variable types 2020-01-13 20:44:33 +00:00
b803b067bf Config to Model mapping 2020-01-13 20:05:20 +00:00
896a0eb1fd Merge pull request #2459 from Perseus14/patch-4
Update pipelines.py
2020-01-13 16:02:54 +01:00
0d6c17fc1b black formatting 2020-01-13 11:18:27 +01:00
a3085020ed Added repetition penalty to PPLM example (#2436)
* Added repetition penalty

* Default PPLM repetition_penalty to neutral

* Minor modifications to comply with reviewer's suggestions. (j -> token_idx)

* Formatted code with `make style`
2020-01-10 23:00:07 -05:00
cf8a70bf68 More AutoConfig tests 2020-01-11 03:43:57 +00:00
6bb3edc300 Serialize model_type if exists 2020-01-11 03:18:56 +00:00
c6f682c1eb flake 2020-01-11 03:18:31 +00:00
4d1c98c012 AutoConfig + other Auto classes honor model_type 2020-01-11 02:46:17 +00:00
2f32dfd33b Convention: name mixins mixins 2020-01-11 01:24:29 +00:00
e83d9f1c1d cleaning - change ' to " (black requirements) 2020-01-10 19:34:25 -05:00
ebba9e929d minor spring cleaning - missing configs + processing 2020-01-10 19:14:58 -05:00
055e80cfad rm old ConfigTester 2020-01-10 21:36:18 +00:00
b1e1a9f9b2 Merge pull request #2495 from mschrimpf/patch-1
T5: move rp_bucket to relative_attention_bias' device
2020-01-10 22:18:54 +01:00
fd8423321f keep list sorted 2020-01-10 20:36:46 +00:00
0cd81fb99f [isort] declare more third-parties in case no tf install 2020-01-10 20:35:45 +00:00
90d3b787f6 move rp_bucket to relative_attention_bias' device
otherwise, `rp_bucket` will always be on cpu and fail if `self.relative_attention_bias` is on cuda
2020-01-10 15:09:10 -05:00
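The fix, sketched as a stand-alone helper rather than the in-class code:

```python
import torch
from torch import nn

def relative_bias(rp_bucket: torch.Tensor, bias_embedding: nn.Embedding) -> torch.Tensor:
    # rp_bucket is computed on CPU; move it to the embedding's device first,
    # otherwise the lookup fails whenever the model lives on CUDA.
    rp_bucket = rp_bucket.to(bias_embedding.weight.device)
    return bias_embedding(rp_bucket)
```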
84c0aa1868 num_parameters helper 2020-01-10 17:40:02 +00:00
331065e62d missing import 2020-01-10 11:42:53 +01:00
414e9e7122 indents test 2020-01-10 11:42:53 +01:00
3cdb38a7c0 indents 2020-01-10 11:42:53 +01:00
ebd45980a0 Align with run_squad + fix some errors 2020-01-10 11:42:53 +01:00
45634f87f8 fix Sampler in distributed training - evaluation 2020-01-10 11:42:53 +01:00
af1ee9e648 Move torch.nn.utils.clip_grad_norm_ 2020-01-10 11:42:53 +01:00
164c794eb3 New SQuAD API for distillation script 2020-01-10 11:42:53 +01:00
801f2ac8c7 Add PRETRAINED_INIT_CONFIGURATION to DistilBERT tokenizer 2020-01-10 11:42:21 +01:00
bfec203d4e modified: src/transformers/tokenization_utils.py 2020-01-09 12:54:28 +01:00
f599623a99 PreTrainedTokenizerFast: hotfix _convert_encoding
cc @n1t0
2020-01-08 15:46:37 -05:00
f26a353057 Update pipelines.py
Modified the QA pipeline to consider all features for each example before generating top-k answers.
The current pipeline takes only one SquadExample, one SquadFeature, one start logit list and one end logit list to retrieve the answer. This is not correct, as one SquadExample can produce multiple SquadFeatures.
2020-01-08 21:12:34 +05:30
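A sketch of the aggregation idea, with simplified span scoring and hypothetical names (start_logits and end_logits hold one list per feature):

```python
def best_answers(start_logits, end_logits, topk=3, max_len=30):
    # Score spans across *all* features of one example, not just the first;
    # a long context is split into several overlapping features.
    candidates = []
    for feat_idx, (starts, ends) in enumerate(zip(start_logits, end_logits)):
        for s, s_logit in enumerate(starts):
            for e in range(s, min(s + max_len, len(ends))):
                candidates.append((s_logit + ends[e], feat_idx, s, e))
    candidates.sort(key=lambda c: c[0], reverse=True)
    return candidates[:topk]
```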
16ce15ed4b DistilBERT token type ids removed from inputs in run_squad 2020-01-08 13:18:30 +01:00
f24232cd1b Fix error with global step in run_squad.py 2020-01-08 11:39:00 +01:00
1b59b57b57 ignore_index equal -100 in T5 model 2020-01-08 09:52:10 +01:00
569da80ced Make doc regarding masked indices more clear.
Signed-off-by: Romain Keramitas <r.keramitas@gmail.com>
2020-01-07 17:37:27 +01:00
43114b89ba spelling correction (#2434) 2020-01-07 17:25:25 +01:00
d6a677b14b Fix typograpical errors (#2438) 2020-01-07 17:21:23 +01:00
27c1b656cc Fix error with global step in run_lm_finetuning.py 2020-01-07 16:16:12 +01:00
24df44d9c7 Black version python 3.5 2020-01-07 15:53:42 +01:00
73be60c47b Quotes 2020-01-07 15:34:23 +01:00
6806f8204e fix #2410 2020-01-07 15:20:45 +01:00
176d3b3079 Add support for Albert and XLMRoberta for the Glue example (#2403)
* Add support for Albert and XLMRoberta for the Glue example
2020-01-07 14:55:55 +01:00
9261c7f771 Remove f-string device creation on PyTorch GPU pipelines.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-07 11:46:44 +01:00
91d33c798b Fix issue on pipelines where pytorch's tensors are not copied on the user-specified GPU device.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-07 11:12:31 +01:00
2926852f14 fixed formatting 2020-01-07 11:56:03 +11:00
e2810edc8f removing redundant .flush 2020-01-07 11:47:25 +11:00
c301faa92b Distributed or parallel setup 2020-01-06 18:41:08 -05:00
81d6841b4b GPU text generation: Moved the encoded_prompt to the correct device 2020-01-06 15:11:12 +01:00
dd4df80f0b Moved the encoded_prompts to correct device 2020-01-06 15:11:12 +01:00
1efc208ff3 Complete DataProcessor class 2020-01-06 15:02:25 +01:00
c45d0cf60f Improve logging message in the single sentence classification processor 2020-01-06 14:54:36 +01:00
bf89be77b9 Improve logging message in the single sentence classification processor 2020-01-06 14:54:36 +01:00
bf8d4bc674 Improve logging message in glue feature conversion 2020-01-06 14:54:36 +01:00
74755c89b9 Example snippet for BertForQuestionAnswering 2020-01-06 14:41:53 +01:00
0ffc8eaf53 Enforce target version for black.
This should stabilize formatting.
2020-01-05 12:52:14 -05:00
f01b3e6680 fix #2399 an ImportError in official example (#2400)
* fix #2399 an ImportError in official example

* style

Co-authored-by: Julien Chaumond <chaumond@gmail.com>
2020-01-05 12:50:20 -05:00
78528742f1 Fix syntax + link to community page 2020-01-05 12:43:39 -05:00
12e0aa4368 Proposition to include community models in readme 2020-01-05 12:37:11 -05:00
80faf22b4a Updating documentation for converting a TensorFlow model to reflect the new CLI convert format.
Signed-off-by: Morgan Funtowicz <morgan@huggingface.co>
2020-01-04 13:41:18 +01:00
d0e594f9db Releasing file lock 2020-01-02 09:45:48 +11:00
629b22adcf [run_lm_finetuning] mask_tokens: document types 2020-01-01 12:55:10 -05:00
594ca6dead [debug] Debug Heisenbug, the old school way. 2019-12-29 10:07:21 -05:00
0df4e62da0 [http] Tweak http user-agent (#2353) 2019-12-29 10:06:50 -05:00
f75bf05ce6 Merge pull request #2352 from huggingface/cli_tweaks
Cli tweaks
2019-12-28 15:40:00 +01:00
0d467fd6de Typo 2019-12-27 23:06:48 -05:00
d8293e84f3 [cli] upload: max number of files at the same time 2019-12-27 23:02:53 -05:00
4d6c93e923 Kill __main__ 2019-12-27 22:55:22 -05:00
9b2badf3c9 [cli] Update doc 2019-12-27 22:54:29 -05:00
f78ebc22ad [cli] Add ability to delete remote object 2019-12-27 22:53:49 -05:00
bfe870be65 Hotfix tokenizers version for sdist installs 2019-12-27 11:05:52 -05:00
74ea432847 Merge pull request #2286 from adelevie/patch-2
Typo in tokenization_utils.py
2019-12-27 10:50:47 +01:00
492bea9aa0 Merge pull request #2292 from patrickvonplaten/add_cached_past_for_language_generation
Add cached past for language generation
2019-12-27 10:33:27 +01:00
e213900fa2 Merge pull request #2290 from patrickvonplaten/fix_typo_in_doc_for_language_generation
duplicated line for repeating_words_penalty_for_language_generation
2019-12-27 10:29:06 +01:00
9f5f646442 Merge pull request #2211 from huggingface/fast-tokenizers
Fast tokenizers
2019-12-27 10:24:29 +01:00
9024b19994 Auto-format (fixes previous commit). 2019-12-27 10:13:52 +01:00
3233b58ad4 Quote square brackets in shell commands.
This ensures compatibility with zsh.

Fix #2316.
2019-12-27 08:50:25 +01:00
e6ec24fa88 Better added_tokens handling 2019-12-26 16:49:48 -05:00
599db139f9 Code style update 2019-12-26 15:13:30 -05:00
835b76a46f Handle unk_token
As we discussed, this is handled here directly 
cc @thomwolf
2019-12-26 14:42:55 -05:00
7ead04ce14 FastPreTrainedTokenizer => PreTrainedTokenizerFast 2019-12-26 14:39:39 -05:00
1f82a5d910 Update for changes in tokenizers API 2019-12-26 14:37:55 -05:00
8c67b529f6 Merge pull request #2324 from kashif/patch-1
Typo in serving.py
2019-12-26 12:38:06 +01:00
7211541ade Typo in serving.py 2019-12-26 12:21:40 +01:00
0f6017bee3 improve comments for examples 2019-12-26 00:35:11 +01:00
87c8fca9bc add example for ctrl text generation in docs 2019-12-26 00:29:19 +01:00
88def24c45 merge conflicts - renamed to previous_token singular 2019-12-26 00:27:16 +01:00
822f725a07 duplicated line for repeating_words_penalty_for_language_generation 2019-12-26 00:25:29 +01:00
fc84bd5254 adapt style to predefined style layout 2019-12-25 23:32:44 +01:00
deff792bb6 add prepare inputs for transfo_xl and xlnet 2019-12-25 23:17:24 +01:00
9398058e19 add easy tensor shape match test 2019-12-25 23:17:24 +01:00
90cda45e9e add past re-ordering for beam search 2019-12-25 23:17:24 +01:00
6bca56fdb0 check for self.config.mem_len instead of self.mem_len in _do_output_past 2019-12-25 23:17:24 +01:00
365ccd0af2 make if statements cleaner for prepare_inputs_for_generation 2019-12-25 23:17:24 +01:00
d039c679d2 better naming for if statement 2019-12-25 23:17:24 +01:00
7e0c5c731a changed do_output_past function to check for self.config.output_past instead of self.output_past 2019-12-25 23:17:24 +01:00
eeaa402cd4 rename comments 2019-12-25 23:17:24 +01:00
7bb4271291 remove ipdb debugging statements 2019-12-25 23:17:24 +01:00
267587c258 add and improve comments 2019-12-25 23:17:24 +01:00
d891fd0ae0 add past hidden key states for more efficient language generation & add prepare_inputs for gpt2 and ctrl model 2019-12-25 23:17:24 +01:00
aeef4823ab Merge pull request #2303 from patrickvonplaten/fix_error_with_repetition_penalty
fix repetition penalty error in modeling_utils.py
2019-12-25 22:39:20 +01:00
0412f3d929 Merge pull request #2291 from aaugustin/fix-flake8-F841
Fix F841 flake8 warning
2019-12-25 22:37:42 +01:00
8742c95461 Merge pull request #2289 from patrickvonplaten/fix_effective_batch_size_lang_gen_xlm
fix bug in prepare inputs for language generation for xlm for effective batch_size > 1
2019-12-25 22:30:46 +01:00
1240be3ed9 Merge pull request #2312 from vitaliyradchenko/fix_special_and_add_tokens_loading
Correct tokenization for special and added tokens
2019-12-25 20:52:30 +01:00
b262577d17 add special tokens to unique_added_tokens_encoder 2019-12-25 18:31:35 +02:00
83a2347952 fixed lack of added and special tokens 2019-12-25 18:03:19 +02:00
cea04a2443 Merge pull request #2310 from ShnitzelKiller/scatter-unfix
revert erroneous fix #2276
2019-12-25 12:43:22 +01:00
e1844d9a45 use positional arguments due to inconsistent API 2019-12-25 01:34:02 -08:00
9fb7addd4d revert erroneous fix 2019-12-24 22:26:09 -08:00
734d29b03d tokenizers is now a real dependency 2019-12-24 13:32:41 -05:00
2818e50569 Add tests for fast tokenizers 2019-12-24 13:29:01 -05:00
31c56f2e0b Fix style 2019-12-24 12:43:27 -05:00
951ae99bea BertTokenizerFast 2019-12-24 12:24:24 -05:00
041eac2d6d GPT2TokenizerFast 2019-12-24 12:24:14 -05:00
3471ff0d35 FastPreTrainedTokenizer 2019-12-24 12:23:30 -05:00
18e5bdbec5 fix repetition penalty error in modeling_utils.py 2019-12-24 17:18:05 +01:00
f18ac4c28e fix sequence length for prepare_inputs for xlnet 2019-12-24 16:43:24 +01:00
359dc43837 fix effective batch_size error in prepare_inputs also for xlnet 2019-12-24 16:33:20 +01:00
d98a384cb0 fix bug in prepare inputs for language generation for xlm for effective batch_size > 1 2019-12-24 16:29:54 +01:00
3e0cf49514 adding back last dropout in TF 2.0 T5 2019-12-24 11:30:56 +01:00
35d32308de adding back final dropout in T5 2019-12-24 11:29:49 +01:00
81db12c3ba Merge pull request #2271 from aaugustin/improve-setup-and-requirements
Improve setup and requirements
2019-12-24 11:21:20 +01:00
10724a8123 Run the slow tests every Monday morning. 2019-12-24 09:09:43 +01:00
a8d34e534e Remove [--editable] in install instructions.
Use -e only in docs targeted at contributors.

If a user copy-pastes a command line with [--editable], they will hit
an error. If they don't know the --editable option, we're giving them
a choice to make before they can move forward, but this isn't a choice
they need to make right now.
2019-12-24 08:46:08 +01:00
e74c73a85d Enable F841 warning in flake8. 2019-12-23 22:38:23 +01:00
e6c0019c80 Remove unused variables in tests. 2019-12-23 22:38:18 +01:00
495580dad1 Remove unused variables in templates. 2019-12-23 22:38:18 +01:00
71f94a8a1c Remove unused variables in src. 2019-12-23 22:38:09 +01:00
81422c4e6d Remove unused variables in examples. 2019-12-23 22:29:02 +01:00
072750f4dc Merge pull request #2288 from aaugustin/better-handle-optional-imports
Improve handling of optional imports
2019-12-23 22:28:47 +01:00
4621ad6f9d Use the same pattern as everywhere else.
This is really just for consistency.
2019-12-23 21:30:04 +01:00
a31d4a2971 Reraise ImportError when sentencepiece isn't installed.
Otherwise, the next line fails with a confusing exception because the spm
variable isn't defined.
2019-12-23 21:27:42 +01:00
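The pattern in question, sketched:

```python
import logging

logger = logging.getLogger(__name__)

try:
    import sentencepiece as spm
except ImportError:
    logger.warning("You need to install SentencePiece to use this tokenizer.")
    raise  # re-raise; otherwise the next use of `spm` dies with a NameError
```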
c8b0c1e551 Improve exception type.
ImportError isn't really appropriate when there's no import involved.
2019-12-23 21:27:38 +01:00
4c09a96096 Simplify re-raising exceptions.
Most modules use the simpler `raise` version. Normalize those that don't.
2019-12-23 21:20:54 +01:00
5565dcdd35 Remove warning when scikit-learn isn't available.
Most users don't need it.
2019-12-23 21:16:26 +01:00
8a6881822a Run some tests on Python 3.7.
This will improve version coverage.
2019-12-23 21:06:23 +01:00
7a865821d9 Remove stray egg-info directory automatically.
If a user or contributor ran `pip install -e .` on transformers < 3.0,
pip created a transformers.egg-info directory next to the transformers
directory at the root of the repository.

In transformers 3.0, the source is in a `src` subdirectory.
`pip install -e .` creates a transformers.egg-info directory there.
However, pip will still pick transformers.egg-info from the previous
location. This is a bug: https://github.com/pypa/pip/issues/5466

Users and contributors are likely to hit this problem because the
documentation for transformers 3.0 relies heavily on extra_requires
which didn't exist in earlier versions, so aren't defined in a stale
transformers.egg-info directory.

If such a directory exists, remove it. It's autogenerated, gitignored
and not supposed to contain anything of value.
2019-12-23 21:06:23 +01:00
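A sketch of such a guard at the top of setup.py (illustrative, not the exact code):

```python
import shutil
from pathlib import Path

# Remove a stale transformers.egg-info left over from the pre-`src` layout;
# pip would otherwise pick up its outdated metadata (pypa/pip#5466).
stale_egg_info = Path(__file__).parent / "transformers.egg-info"
if stale_egg_info.exists():
    shutil.rmtree(stale_egg_info)
```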
70373a5f7c Update contribution instructions.
Also provide shortcuts in a Makefile.
2019-12-23 21:05:30 +01:00
c3783399db Remove redundant requirements with transformers. 2019-12-23 19:17:27 +01:00
d79e9c9a9a Remove docs/requirements.txt.
It's superseded by the "docs" extras.
2019-12-23 19:17:07 +01:00
d73eb552e8 Remove requirements.txt.
It's redundant with setup.py and, also, incomplete (e.g. numpy).
2019-12-23 19:15:08 +01:00
9fcc532df6 Remove requirements-dev.txt.
It was generated once, likely in a non-reproducible way (pip freeze
in a contributor's local environment), and never updated.
2019-12-23 19:14:36 +01:00
76a1417f2a Include all optional dependencies in extras.
Take advantage of this to simplify the Circle CI configuration.

Don't bother with tensorboardX: it's a fallback for PyTorch < 1.1.0.
2019-12-23 19:14:31 +01:00
9fc8dcb2a0 Standardize import.
Every other file uses this pattern.
2019-12-23 18:45:42 +01:00
f2522869ea Review and update setup.py. 2019-12-23 18:45:42 +01:00
7cef764ec0 Typo in tokenization_utils.py
avoir -> avoid
2019-12-23 12:14:50 -05:00
23dad8447c Install deps from setup.py for building docs.
requirements.txt isn't up to date.
2019-12-23 17:06:32 +01:00
d8e33dbd67 Fix path to source code in docs config.
This should fix API docs, which went AWOL with yesterday's changes.
2019-12-23 16:49:35 +01:00
59b123bc50 fix tqdm logging level 2019-12-23 16:47:24 +01:00
ba2378ced5 Merge pull request #2264 from upura/fix-doclink
Fix doc link in README
2019-12-23 12:31:00 +01:00
e4e2a666c9 Merge pull request #2276 from ShnitzelKiller/scatterfix
fix error due to wrong argument name to Tensor.scatter()
2019-12-23 12:19:48 +01:00
398bb03f98 fix out-of-place call to scatter, whose named argument is source, not src 2019-12-22 23:30:52 -08:00
ce50305e5b Merge pull request #2270 from aaugustin/remove-python-2
Remove support for Python 2
2019-12-22 23:04:37 +01:00
1a948d7020 Switch from comments to annotations for types. 2019-12-22 18:56:01 +01:00
1c62e87b34 Use built-in open().
On Python 3, `open is io.open`.
2019-12-22 18:38:56 +01:00
d6eaf4e6d2 Update comments mentioning Python 2. 2019-12-22 18:38:56 +01:00
45841eaf7b Remove references to Python 2 in documentation. 2019-12-22 18:38:56 +01:00
0dddc1494d Remove py3 marker. 2019-12-22 18:38:56 +01:00
75a23d24af Remove import fallbacks. 2019-12-22 18:38:56 +01:00
798b3b3899 Remove sys.version_info[0] == 2 or 3. 2019-12-22 18:38:42 +01:00
8af25b1664 Remove six. 2019-12-22 17:56:09 +01:00
6b2200fc88 Remove u-prefixes. 2019-12-22 17:47:54 +01:00
c824d15aa1 Remove __future__ imports. 2019-12-22 17:47:54 +01:00
b6ea0f43ae Remove duplicate -v flag. 2019-12-22 17:47:27 +01:00
5daca95ddd Merge pull request #2268 from aaugustin/improve-repository-structure
Improve repository structure
2019-12-22 16:41:53 +01:00
54abc67aec Merge pull request #2255 from aaugustin/implement-best-practices
Implement some Python best practices
2019-12-22 16:31:11 +01:00
00204f2b4c Replace CommonTestCases for tokenizers with a mixin.
This is the same change as for (TF)CommonTestCases for modeling.
2019-12-22 15:35:25 +01:00
a3c5883f2c Rename file for consistency. 2019-12-22 15:35:25 +01:00
daf8bebcdd Remove unused GPTModelTester.
It isn't imported anywhere.
2019-12-22 15:35:25 +01:00
345c23a60f Replace (TF)CommonTestCases for modeling with a mixin.
I suspect the wrapper classes were created in order to prevent the
abstract base class (TF)CommonModelTester from being included in test
discovery and running, because that would fail.

I solved this by replacing the abstract base class with a mixin.

Code changes are just de-indenting and automatic reformattings
performed by black to use the extra line space.
2019-12-22 15:35:18 +01:00
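The mixin pattern in miniature (toy classes, not the real testers):

```python
import unittest

class ModelTesterMixin:
    # A plain mixin, not a TestCase: unittest discovery won't collect it,
    # so the shared tests only run on the concrete subclasses.
    def test_model_builds(self):
        self.assertIsNotNone(self.build_model())

class BertModelTest(ModelTesterMixin, unittest.TestCase):
    def build_model(self):
        return object()  # stand-in for a real model
```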
7e98e211f0 Remove unittest.main() in test modules.
This construct isn't used anymore these days.

Running python tests/test_foo.py puts the tests/ directory on
PYTHONPATH, which isn't representative of how we run tests.

Use python -m unittest tests/test_foo.py instead.
2019-12-22 14:42:03 +01:00
6be7cdda66 Move source code inside a src subdirectory.
This prevents transformers from being importable simply because the CWD
is the root of the git repository, while not being importable from other
directories. That led to inconsistent behavior, especially in examples.

Once you fetch this commit, in your dev environment, you must run:

    $ pip uninstall transformers
    $ pip install -e .
2019-12-22 14:15:13 +01:00
ced0a94204 Switch test files to the standard test_*.py scheme. 2019-12-22 14:15:13 +01:00
067395d5c5 Move tests outside of library. 2019-12-22 13:47:17 +01:00
698f9e3d7a Remove trailing whitespace in README. 2019-12-22 13:29:58 +01:00
c11b3e2926 Sort imports for optional third-party libraries.
These libraries aren't always installed in the virtual environment where
isort is running. Declaring them properly avoids mixing these
third-party imports with local imports.
2019-12-22 11:19:13 +01:00
2a34d5b71b Stabilize import order for packaging.
I don't want to consider it a dependency of transformers, but it's
usually there in local development and usually not there in CI.
2019-12-22 11:07:31 +01:00
c9270086ea Disable flake8 F841 in CI to get a passing run.
I'll fix it later.
2019-12-22 11:00:06 +01:00
577a03664d Enforce flake8 in CI. 2019-12-22 11:00:04 +01:00
7c6812645a Restore proper import for HTTPError. 2019-12-22 10:59:08 +01:00
939148b050 Fix F401 flake8 warning (x28).
Do manually what autoflake couldn't manage.
2019-12-22 10:59:08 +01:00
783a616999 Fix F401 flake8 warning (x88 / 116).
This change is mostly autogenerated with:

    $ python -m autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports examples templates transformers utils hubconf.py setup.py

I made minor changes in the generated diff.
2019-12-22 10:59:08 +01:00
80327a13ea Fix F401 flake8 warning (x152 / 268).
This change is mostly autogenerated with:

    $ python -m autoflake --in-place --recursive examples templates transformers utils hubconf.py setup.py

I made minor changes in the generated diff.
2019-12-22 10:59:08 +01:00
654e051e2a Ignore F401 flake8 warning (x326 / 594). 2019-12-22 10:59:08 +01:00
fa2ccbc081 Fix E266 flake8 warning (x90). 2019-12-22 10:59:08 +01:00
2ab78325f0 Fix F821 flake8 warning (x47).
Ignore warnings related to Python 2, because it's going away soon.
2019-12-22 10:59:07 +01:00
631be27078 Fix E722 flake8 warnings (x26). 2019-12-22 10:59:07 +01:00
b0f7db73cd Fix E741 flake8 warning (x14). 2019-12-22 10:59:07 +01:00
ea89bec185 Fix E231 flake8 warning (x9). 2019-12-22 10:59:07 +01:00
fd2f17a7a1 Fix E714 flake8 warning (x8). 2019-12-22 10:59:07 +01:00
5eab3cf6bc Fix W605 flake8 warning (x5). 2019-12-22 10:59:07 +01:00
7dce8dc7ac Fix E731 flake8 warning (x3). 2019-12-22 10:59:07 +01:00
eed46f38b7 Fix E302 flake8 warning (x3). 2019-12-22 10:59:07 +01:00
b1de7ae08a Fix F811 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
357db7098c Fix E712 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
f9c5317db2 Fix E265 flake8 warning (x1). 2019-12-22 10:59:07 +01:00
28e608a2c2 Remove trailing whitespace from all Python files.
Fixes flake8 warning W291 (x224).
2019-12-22 10:59:07 +01:00
1efa0a7552 Add black-compatible flake8 configuration. 2019-12-22 10:59:07 +01:00
d0c9fe277a Fix circular import in transformers.pipelines.
Submodules shouldn't import from their parent in general.
2019-12-22 10:59:07 +01:00
5ca054757f Update "make style" to sort imports with isort. 2019-12-22 10:59:07 +01:00
9e80fc7b2f Enforce isort in CI.
We need https://github.com/timothycrosley/isort/pull/1000 but there's no
release with this fix yet, so we'll install from GitHub.
2019-12-22 10:59:00 +01:00
158e82e061 Sort imports with isort.
This is the result of:

    $ isort --recursive examples templates transformers utils hubconf.py setup.py
2019-12-22 10:57:46 +01:00
9d00f78f16 fix doc link 2019-12-22 16:07:05 +09:00
b668a740ca Fixing incorrect link in model docstring
The docstring contains a link to the Salesforce/CTRL repo, while the model itself is Facebookresearch/mmbt. It is probably a copy/paste mistake.
2019-12-22 00:01:14 +03:00
bc1715c1e0 Add black-compatible isort configuration.
lines_after_imports = 2 is a matter of taste; I like it.
2019-12-21 17:53:18 +01:00
36883c1192 Add "make style" to format code with black. 2019-12-21 17:53:18 +01:00
6e5291a915 Enforce black in CI. 2019-12-21 17:53:18 +01:00
fa84ae26d6 Reformat source code with black.
This is the result of:

    $ black --line-length 119 examples templates transformers utils hubconf.py setup.py

There are a lot of fairly long lines in the project. As a consequence, I'm
picking the longest widely accepted line length, 119 characters.

This is also Thomas' preference, because it allows for explicit variable
names, to make the code easier to understand.
2019-12-21 17:52:29 +01:00
63e3827c6b Remove empty file.
Likely it was added by accident.
2019-12-21 15:38:08 +01:00
645713e2cb Merge pull request #2254 from huggingface/fix-tfroberta
adding positional embeds masking to TFRoBERTa
2019-12-21 15:33:22 +01:00
73f6e9817c Merge pull request #2115 from suvrat96/add_mmbt_model
[WIP] Add MMBT Model to Transformers Repo
2019-12-21 15:26:08 +01:00
77676c27d2 adding positional embeds masking to TFRoBERTa 2019-12-21 15:24:48 +01:00
344126fe58 move example to mm-imdb folder 2019-12-21 15:06:52 +01:00
5b7fb6a4a1 Merge pull request #2134 from bkkaggle/saving-and-resuming
closes #1960 Add saving and resuming functionality for remaining examples
2019-12-21 15:03:53 +01:00
6f68d559ab Merge pull request #2130 from huggingface/ignored-index-coherence
[BREAKING CHANGE] Setting all ignored index to the PyTorch standard
2019-12-21 14:55:40 +01:00
1ab25c49d3 Merge branch 'master' into pr/2115 2019-12-21 14:54:30 +01:00
b03872aae0 fix merge 2019-12-21 14:49:54 +01:00
518ba748e0 Merge branch 'master' into saving-and-resuming 2019-12-21 14:41:39 +01:00
18601c3b6e Merge pull request #2173 from erenup/master
run_squad with roberta
2019-12-21 14:33:16 +01:00
6e7102cfb3 Merge pull request #2203 from gthb/patch-1
fix: wrong architecture count in README
2019-12-21 14:31:44 +01:00
deceb00161 Merge pull request #2177 from mandubian/issue-2106
:zip: #2106 tokenizer.tokenize speed improvement (3-8x) by caching added_tokens in a Set
2019-12-21 14:31:20 +01:00
eeb70cdd77 Merge branch 'master' into saving-and-resuming 2019-12-21 14:29:59 +01:00
ed9b84816e Merge pull request #1840 from huggingface/generation_sampler
[WIP] Sampling sequence generator for transformers
2019-12-21 14:27:35 +01:00
f86ed23189 update doc 2019-12-21 14:13:06 +01:00
cfa0380515 Merge branch 'master' into generation_sampler 2019-12-21 14:12:52 +01:00
300ec3003c fixing run_generation example - using torch.no_grad 2019-12-21 14:02:19 +01:00
1c37746892 fixing run_generation 2019-12-21 13:52:49 +01:00
7e17f09fb5 Merge pull request #1803 from importpandas/fix-xlnet-squad2.0
fix run_squad.py during fine-tuning xlnet on squad2.0
2019-12-21 13:38:48 +01:00
8a2be93b4e fix merge 2019-12-21 13:31:28 +01:00
562f864038 Merge branch 'master' into fix-xlnet-squad2.0 2019-12-21 12:48:10 +01:00
8618bf15d6 Merge pull request #1736 from huggingface/fix-tf-xlnet
Fix TFXLNet
2019-12-21 12:42:05 +01:00
2fa8737c44 Merge pull request #1586 from enzoampil/include_special_tokens_in_bert_examples
Add special tokens to documentation for bert examples to resolve issue: #1561
2019-12-21 12:36:11 +01:00
f15f087143 Merge pull request #1764 from DomHudson/bug-fix-1761
Bug-fix: Roberta Embeddings Not Masked
2019-12-21 12:13:27 +01:00
fae4d1c266 Merge pull request #2217 from aaugustin/test-parallelization
Support running tests in parallel
2019-12-21 11:54:23 +01:00
b8e924e10d Restore test.
This looks like debug code accidentally committed in b18509c2.

Refs #2250.
2019-12-21 08:50:15 +01:00
767bc3ca68 Fix typo in model name.
This looks like a copy/paste mistake. Probably this test was never run.

Refs #2250.
2019-12-21 08:46:26 +01:00
343c094f21 Run examples separately from tests.
This optimizes the total run time of the Circle CI test suite.
2019-12-21 08:43:19 +01:00
80caf79d07 Prevent excessive parallelism in PyTorch.
We're already using as many processes in parallel as we have CPU cores.
Furthermore, the number of cores may be incorrectly calculated as 36
(we've seen this in pytest-xdist), which compounds the problem.

PyTorch performance craters without this.
2019-12-21 08:43:19 +01:00
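One way to cap PyTorch's intra-op parallelism per worker (illustrative; the actual change lives in the CI/test setup):

```python
import torch

# With N pytest-xdist workers, give each worker a single intra-op thread;
# otherwise N workers x N threads oversubscribes the CPUs.
torch.set_num_threads(1)
```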
bb3bfa2d29 Distribute tests from the same file to the same worker.
This should prevent two issues:

- hitting API rate limits for tests that hit the HF API
- multiplying the cost of expensive test setups
2019-12-21 08:43:19 +01:00
29cbab98f0 Parallelize tests on Circle CI.
Set the number of CPUs manually based on the Circle CI resource class,
or else we're getting 36 CPUs, which is far too much (perhaps that's
the underlying hardware and not what Circle CI allocates to us).

Don't parallelize the custom tokenizers tests because they take less
than one second to run and parallelization actually makes them slower.
2019-12-21 08:43:19 +01:00
a4c9338b83 Prevent parallel downloads of the same file with a lock.
Since the file is written to the filesystem, a filesystem lock is the
way to go here. Add a dependency on the third-party filelock library to
get cross-platform functionality.
2019-12-21 08:43:19 +01:00
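A usage sketch of the filesystem lock (`fetch` is a hypothetical download helper):

```python
import os
from filelock import FileLock

def download_once(url: str, cache_path: str) -> str:
    # The lock file lives next to the cached file; concurrent processes
    # serialize on it instead of clobbering each other's downloads.
    with FileLock(cache_path + ".lock"):
        if not os.path.exists(cache_path):
            fetch(url, cache_path)  # hypothetical helper that writes the file
    return cache_path
```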
b670c26684 Take advantage of the cache when running tests.
Caching models across test cases and across runs of the test suite makes
slow tests somewhat more bearable.

Use gettempdir() instead of /tmp in tests. This makes it easier to
change the location of the cache with semi-standard TMPDIR/TEMP/TMP
environment variables.

Fix #2222.
2019-12-21 08:43:19 +01:00
b67fa1a8d2 Download models directly to cache_dir.
This allows moving the file instead of copying it, which is more
reliable. Also it avoids writing large amounts of data to /tmp,
which may not be large enough to accommodate it.

Refs #2222.
2019-12-21 08:43:19 +01:00
286d5bb6b7 Use a random temp dir for writing pruned models in tests. 2019-12-21 08:43:19 +01:00
478e456e83 Use a random temp dir for writing file in tests. 2019-12-21 08:43:19 +01:00
12726f8556 Remove redundant torch.jit.trace in tests.
This looks like it could be expensive, so don't run it twice.
2019-12-21 08:43:19 +01:00
ac1b449cc9 [doc] move distilroberta to more appropriate place
cc @lysandrejik
2019-12-21 00:09:01 -05:00
3e52915fa7 [RoBERTa] Embeddings: fix dimensionality bug 2019-12-20 19:01:27 -05:00
228f52867c Bug fix: 1764 2019-12-20 18:27:35 -05:00
a80778f40e small refactoring (only esthetic, not functional) 2019-12-20 17:21:24 -05:00
3df1d2d144 - Create the output directory (whose name is passed by the user in the "save_directory" parameter) where the encoder and decoder will be saved, if it does not exist.
- Empty the output directory if it contains any files or subdirectories.
- Create the "encoder" directory inside "save_directory", if it does not exist.
- Create the "decoder" directory inside "save_directory", if it does not exist.
- Save the encoder and the decoder in the previous two directories, respectively (see the sketch below).
2019-12-20 17:21:24 -05:00
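The steps above, sketched as a helper with illustrative names:

```python
import os
import shutil

def prepare_save_directory(save_directory: str) -> None:
    # Recreate a clean save_directory/{encoder,decoder} layout.
    if os.path.isdir(save_directory):
        shutil.rmtree(save_directory)  # empty it if anything is already there
    os.makedirs(os.path.join(save_directory, "encoder"))
    os.makedirs(os.path.join(save_directory, "decoder"))
```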
a436574bfd Release: v2.3.0 2019-12-20 16:22:20 -05:00
d0f8b9a978 Merge pull request #2244 from huggingface/fix-tok-pipe
Fix Camembert and XLM-R `decode` method- Fix NER pipeline alignement
2019-12-20 22:10:39 +01:00
a557836a70 Merge pull request #2191 from huggingface/fix_sp_np
Numpy compatibility for sentence piece
2019-12-20 22:08:08 +01:00
655fd06853 clean up 2019-12-20 21:57:49 +01:00
e5812462fc clean up debug and less verbose tqdm 2019-12-20 21:51:48 +01:00
4775ec354b add overwrite - fix ner decoding 2019-12-20 21:47:15 +01:00
cb6d54bfda Numpy compatibility for sentence piece
convert to int earlier
2019-12-20 15:06:28 -05:00
f79a7dc661 fix NER pipeline 2019-12-20 20:57:45 +01:00
a241011057 fix pipeline NER 2019-12-20 20:43:48 +01:00
e37ca8e11a fix camembert and XLM-R tokenizer 2019-12-20 20:43:42 +01:00
ceae85ad60 fix mc loading 2019-12-20 19:52:24 +01:00
71883b6ddc update link in readme 2019-12-20 19:40:23 +01:00
8d5a47c79b Merge pull request #2243 from huggingface/fix-xlm-roberta
fixing xlm-roberta tokenizer max_length and automodels
2019-12-20 19:34:08 +01:00
79e4a6a25c update serving API 2019-12-20 19:33:12 +01:00
bbaaec046c fixing CLI pipeline 2019-12-20 19:19:20 +01:00
1c12ee0e55 fixing xlm-roberta tokenizer max_length and automodels 2019-12-20 18:28:27 +01:00
65c75fc587 Clean special tokens test 2019-12-20 11:34:16 -05:00
fb393ad994 Added test for all special tokens 2019-12-20 11:29:58 -05:00
90debb9ff2 Keep even the first of the special tokens intact while lowercasing. 2019-12-20 11:29:43 -05:00
b98ff88544 Added pipelines quick tour in README 2019-12-20 15:52:50 +01:00
3a2c4e6f63 Merge pull request #1548 from huggingface/cli
[2.2] - Command-line interface - Pipeline class
2019-12-20 15:28:29 +01:00
4e3f745ba4 add example for Model2Model in quickstart 2019-12-20 09:12:31 -05:00
db0795b5d0 defaults models for tf and pt - update tests 2019-12-20 15:07:00 +01:00
7f74084528 Fix leading axis added when saving through the command run 2019-12-20 14:47:04 +01:00
c37815f130 clean up PT <=> TF 2.0 conversion and config loading 2019-12-20 14:35:40 +01:00
73fcebf7ec update serving command 2019-12-20 13:47:35 +01:00
59941c5d1f Merge pull request #2189 from stefan-it/xlmr
Add support for XLM-RoBERTa
2019-12-20 13:26:38 +01:00
15dda5ea32 remove python 2 tests for circle-ci cc @aaugustin @julien-c @LysandreJik 2019-12-20 13:20:41 +01:00
01ffc65e9b update tests to remove unittest.patch 2019-12-20 13:16:23 +01:00
825697cad4 fix tests 2019-12-20 12:51:10 +01:00
1fa93ca1ea Clean up framework handling 2019-12-20 12:34:19 +01:00
ca6bdb28f6 fix pipelines and rename model_card => modelcard 2019-12-20 12:10:40 +01:00
61d9ee45e3 All tests are green. 2019-12-20 11:47:56 +01:00
ff36e6d8d7 Merge pull request #2231 from huggingface/requests_user_agent
[http] customizable requests user-agent
2019-12-20 10:28:10 +01:00
e516a34a15 Use BasicTokenizer to split over whitespaces. 2019-12-20 09:38:08 +01:00
9d0d1cd339 Filter out entity for NER task. 2019-12-20 09:30:37 +01:00
15d897ff4a [http] customizable requests user-agent 2019-12-19 18:29:22 -05:00
f25e9b6f77 [hf_bucket_url] support for cloudfront urls 2019-12-19 18:28:17 -05:00
a5a06a851e [doc] Param name consistency 2019-12-19 16:24:20 -05:00
1718fb9e74 Minor/basic text fixes (#2229)
* Small clarification

Matches line 431 to line 435 for additional clarity and consistency.

* Fixed minor typo

The letter "s" was previously omitted from the word "docstrings".
2019-12-19 16:23:18 -05:00
9a399ead25 Revert incorrect #1778 2019-12-19 15:45:48 -05:00
3376adc051 configuration/modeling/tokenization: add various fine-tuned XLM-RoBERTa models for English, German, Spanish and Dutch (CoNLL datasets) 2019-12-19 21:30:23 +01:00
e4baa68ddb tick-tock cc @julien-c 2019-12-19 20:37:26 +01:00
149dc376aa fix tests 2019-12-19 20:34:28 +01:00
407093b3fa Merge branch 'cli' of https://github.com/huggingface/transformers into cli 2019-12-19 20:26:51 +01:00
c7be096c39 Merge branch 'master' into cli 2019-12-19 20:26:08 +01:00
a305067f2d Removed __main__ 2019-12-19 19:41:48 +01:00
3492a6ec17 Addressing Thom's comments. 2019-12-19 19:06:44 +01:00
33adab2b91 Fix albert example 2019-12-19 12:40:43 -05:00
a1f1dce0ae Correct max position for SQUAD and TFDS 2019-12-19 12:25:55 -05:00
62c1fc3c1e Removed duplicate XLMConfig, XLMForQuestionAnswering and XLMTokenizer from import statement of run_squad.py script 2019-12-19 09:50:56 -05:00
284572efc0 Fixed typo in the link
Updated the documentation due to a typo.
2019-12-19 09:36:43 -05:00
ed6ba93912 corrected typo in example for t5 model input argument 2019-12-19 09:34:55 -05:00
81a911cce5 Doc, doc, ... doc. 2019-12-19 15:12:06 +01:00
faef6f6191 Fix logic order for USE_TF/USE_TORCH 2019-12-19 12:28:17 +01:00
5664327c24 Hide train command for now. 2019-12-19 12:27:54 +01:00
3b29322d4c Expose all the pipeline argument on serve command. 2019-12-19 12:24:17 +01:00
fc624716aa Renaming framework env variables flags from NO_ to USE_ 2019-12-19 11:49:06 +01:00
f516cf3956 Allow pipeline to write output in binary format 2019-12-19 11:42:33 +01:00
d72fa2a0f6 Fix inputs_for_model call in QuestionAnsweringPipeline accessing __dict__ on list. 2019-12-19 10:54:10 +01:00
bcc99fd92e Fix wrong automatic config allocation through AutoConfig 2019-12-19 10:32:21 +01:00
a26ce4dee1 examples: add XLM-RoBERTa to glue script 2019-12-19 02:23:01 +01:00
ec5d6c6a70 Addressing issue with NER task omitting first and last word. 2019-12-19 00:12:10 +01:00
fe9aab1055 tokenization: use S3 location for XLM-RoBERTa model 2019-12-18 23:47:48 +01:00
5c5f67a256 modeling: use S3 location for XLM-RoBERTa model 2019-12-18 23:47:00 +01:00
db90e12114 configuration: use S3 location for XLM-RoBERTa model 2019-12-18 23:46:33 +01:00
d0724d0794 Add PipedPipelineDataFormat 2019-12-18 23:27:26 +01:00
7711403bbd Expose config through the cli arguments 2019-12-18 22:59:51 +01:00
8bb166db5d Expose more information in the output of TextClassificationPipeline 2019-12-18 22:53:19 +01:00
f09d999641 docs: fix numbering 😅 2019-12-18 19:49:33 +01:00
dd7a958fd6 docs: add XLM-RoBERTa to pretrained model list (incl. all parameters) 2019-12-18 19:45:46 +01:00
d35405b7a3 docs: add XLM-RoBERTa to index page 2019-12-18 19:45:10 +01:00
3e89fca543 readme: add XLM-RoBERTa to model architecture list 2019-12-18 19:44:23 +01:00
128cfdee9b tokenization add XLM-RoBERTa base model 2019-12-18 19:28:16 +01:00
e778dd854d modeling: add XLM-RoBERTa base model 2019-12-18 19:27:34 +01:00
04b602f96f Put module import on top of the module. 2019-12-18 18:28:39 +01:00
64a971a915 auto: add XLM-RoBERTa to auto tokenization 2019-12-18 18:24:32 +01:00
036831e279 auto: add XLM-RoBERTa to auto modeling 2019-12-18 18:23:42 +01:00
41a13a6375 auto: add XLMRoBERTa to auto configuration 2019-12-18 18:20:27 +01:00
0c88c856d5 Unnest QuestionAnsweringArgumentHandler 2019-12-18 18:18:16 +01:00
8efc6dd544 fix #2214 2019-12-18 10:47:59 -05:00
a2978465a2 Merge branch 'master' into patch-1 2019-12-18 14:54:46 +00:00
01b68be34f converter: remove XLM-RoBERTa specific script (can be done with the script for RoBERTa now) 2019-12-18 12:24:46 +01:00
3d2096f516 further cleanup 2019-12-18 11:50:54 +01:00
ca31abc6d6 tokenization: *align* fairseq and spm vocab to fix some tokenization errors 2019-12-18 11:36:54 +01:00
8e5587fb79 few fixes on sampling 2019-12-18 11:32:37 +01:00
cce3089b65 Merge remote-tracking branch 'upstream/master' into xlmr 2019-12-18 11:05:16 +01:00
641a8decdc clean up code and add arbitrary number of return sequences 2019-12-18 10:43:48 +01:00
e347725d8c More fine-grained control over pipeline creation with config argument. 2019-12-18 10:41:24 +01:00
94c99db34c [FinBERT] fix incorrect url 2019-12-17 20:35:25 -05:00
7ffa817390 [s3] mv files and update links 2019-12-17 20:35:25 -05:00
c5f35e61db Uploaded files to AWS. 2019-12-17 20:35:25 -05:00
abc43ffbff Add pretrained model documentation for FinBERT. 2019-12-17 20:35:25 -05:00
8ac840ff87 Adding Finnish BERT. 2019-12-17 20:35:25 -05:00
a0d386455b Fix outdated tokenizer doc 2019-12-17 20:07:39 -05:00
ea636440d1 [roberta.conversion] Do not hardcode vocab size
and support for fairseq 0.9+
2019-12-17 18:12:22 -05:00
a4df2e0113 update roberta conversion
- update to fix conversion for the updated fairseq model
- create save directory if not exist
2019-12-17 18:12:22 -05:00
77d397202b clean up dead code 2019-12-17 23:28:46 +01:00
bbc0c86f9b beam search + single beam decoding 2019-12-17 23:27:02 +01:00
5e289f69bc regex 2019.12.17 install fails with Python 2 2019-12-17 15:54:05 -05:00
2cff4bd8f3 Fix segmentation fault 2019-12-17 15:54:05 -05:00
55397dfb9b CsvPipelineDataFormat: Fix for single-column 2019-12-17 13:10:51 -05:00
b6938916ac adding beam search 2019-12-17 17:23:36 +01:00
d303f84e7b fix: wrong architecture count in README
Just say “the following” so that this intro doesn't so easily fall out of date :)
2019-12-17 16:18:00 +00:00
2fde5a2489 Initial bunch of documentation. 2019-12-17 12:16:07 +01:00
2f1c745cde update conversion script 2019-12-17 11:47:54 +01:00
83bc5235cf Merge branch 'master' into pr/2189 2019-12-17 11:47:32 +01:00
d7c62661a3 Provide serving dependencies for tensorflow and pytorch (serving-tf, serving-torch) 2019-12-17 11:23:39 +01:00
f349826a57 model: fix cls and sep token for XLM-RoBERTa documentation 2019-12-17 10:36:04 +01:00
f061606277 Merge pull request #2164 from huggingface/cleanup-configs
[SMALL BREAKING CHANGE] Cleaning up configuration classes - Adding Model Cards
2019-12-17 09:10:16 +01:00
805c21aeba tried to fix the failed checks 2019-12-17 11:36:00 +08:00
d000195ee6 add comment for example_index and unique_id in single process 2019-12-17 11:28:34 +08:00
3c6efd0ca3 updated usage example in modeling_roberta for question and answering 2019-12-17 11:18:12 +08:00
3f5ccb183e [doc] Clarify uploads
cf 855ff0e91d (commitcomment-36452545)
2019-12-16 18:20:29 -05:00
3cb51299c3 Fix #2109 2019-12-16 16:58:44 -05:00
18a879f475 fix #2180 2019-12-16 16:44:29 -05:00
d803409215 Fix run squad evaluate during training 2019-12-16 16:31:38 -05:00
a468870fd2 refactoring generation 2019-12-16 22:22:30 +01:00
855ff0e91d [doc] Model upload and sharing
ping @lysandrejik @thomwolf

Is this clear enough? Anything we should add?
2019-12-16 12:42:22 -05:00
d064009b72 converter: fix vocab size 2019-12-16 17:23:25 +01:00
a701a0cee1 configuration: fix model name for large XLM-RoBERTa model 2019-12-16 17:17:56 +01:00
59a1aefb1c tokenization: add support for new XLM-RoBERTa model. Add wrapper around fairseq tokenization logic 2019-12-16 17:00:55 +01:00
69f4f058fa model: add support for new XLM-RoBERTa model 2019-12-16 17:00:12 +01:00
a648ff738c configuration: add support for XLM-RoBERTa model 2019-12-16 16:47:39 +01:00
9ed09cb4a3 converter: add conversion script for original XLM-RoBERTa weights to Transformers-compatible weights 2019-12-16 16:46:58 +01:00
d3549b66af module: add support for XLM-RoBERTa (__init__) 2019-12-16 16:38:39 +01:00
a096e2a88b WIP serving through HTTP internally using pipelines. 2019-12-16 16:38:02 +01:00
71b4750517 examples: add support for XLM-RoBERTa to run_ner script 2019-12-16 16:37:27 +01:00
43a4e1bbe4 Addressing issue in varargs handling for question answering. 2019-12-16 16:00:41 +01:00
46ccbb42fc Make CLI run command use integer mapping for device argument. 2019-12-16 15:49:41 +01:00
bbc707cf39 Fix non-keyworded varargs handling in DefaultArgumentHandler for pipeline. 2019-12-16 15:49:09 +01:00
9c391277cc Allow tensors placement on specific device through CLI and pipeline. 2019-12-16 15:19:13 +01:00
1bbdbacd5b update __init__ and saving 2019-12-16 14:38:20 +01:00
955d7ecb57 Refactored Pipeline with dedicated argument handler. 2019-12-16 14:34:54 +01:00
031ad4eb37 improving JSON error messages (for model card and configurations) 2019-12-16 14:20:57 +01:00
db0a9ee6e0 adding albert to TF auto models cc @LysandreJik 2019-12-16 14:08:08 +01:00
a4d07b983a dict of all config and model files cc @LysandreJik 2019-12-16 14:00:32 +01:00
d3418a94ff update tests 2019-12-16 13:52:41 +01:00
56e98ba81a add model cards cc @mfuntowicz 2019-12-16 11:07:27 +01:00
8669598abd update t5 tf 2019-12-16 09:59:36 +01:00
1b8613acb3 updating t5 config class 2019-12-16 09:51:42 +01:00
8e3b1c860f Added FeatureExtraction pipeline. 2019-12-15 01:37:52 +01:00
f1971bf303 Binding pipelines to the cli. 2019-12-15 01:37:16 +01:00
cc0135134b :zip: #2106 basic tokenizer.tokenize global speed improvement (3-8x) by simply caching added_tokens in a Set 2019-12-14 15:25:13 +01:00
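A minimal sketch of the caching idea behind this speed-up, with assumed names rather than the library's actual internals: membership checks against the added tokens happen for every piece of text, so a cached set gives O(1) lookups where a list scan is O(n).

```python
# Illustrative sketch (assumed names): build the added-token set once and
# reuse it, so each tokenize() call does O(1) membership checks instead of
# scanning a list for every word.
added_tokens = ["<special-1>", "<special-2>"]
added_tokens_set = set(added_tokens)  # the cached Set

def tokenize(text):
    # keep added tokens whole; lowercasing stands in for the base tokenizer
    return [w if w in added_tokens_set else w.lower() for w in text.split()]

print(tokenize("Hello <special-1> World"))  # ['hello', '<special-1>', 'world']
```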
dc667ce1a7 double check cc @LysandreJik 2019-12-14 09:56:27 +01:00
7140363e09 update bertabs 2019-12-14 09:44:53 +01:00
a52d56c8d9 Merge branch 'master' into cleanup-configs 2019-12-14 09:43:07 +01:00
e92bcb7eb6 Merge pull request #1739 from huggingface/t5
[WIP] Adding Google T5 model
2019-12-14 09:40:43 +01:00
cbb368ca06 distilbert tests 2019-12-14 09:31:18 +01:00
b6d4284b26 [cli] Uploads: fix + test edge case 2019-12-13 22:44:57 -05:00
a1faaf9962 deleted useless file 2019-12-14 08:57:13 +08:00
c7780700f5 Merge branch 'refs/heads/squad_roberta'
# Conflicts:
#	transformers/data/processors/squad.py
2019-12-14 08:53:59 +08:00
76f0d99f02 Merge remote-tracking branch 'refs/remotes/huggingface/master' 2019-12-14 08:45:17 +08:00
8e9526b4b5 add multiple processing 2019-12-14 08:43:58 +08:00
7bd11dda6f Release: v2.2.2 2019-12-13 16:45:30 -05:00
c3248cf122 Tests for all tokenizers 2019-12-13 16:41:44 -05:00
f2ac50cb55 better for python2.x 2019-12-13 16:41:44 -05:00
4cbdc7d910 missed space 2019-12-13 16:41:44 -05:00
dd2add9f6e more tests 2019-12-13 16:41:44 -05:00
df160af736 🐛 #2096 in tokenizer.decode, spaces are joined before added tokens rather than between all subtexts 2019-12-13 16:41:44 -05:00
5b7b78e088 🐛 #2096 in tokenizer.decode, adds a space after special tokens to return a correctly formatted string 2019-12-13 16:41:44 -05:00
866d73ca26 [cli] Upload is now compatible with folders 2019-12-13 16:39:08 -05:00
d461472948 return for SQuAD [BLACKED] 2019-12-13 15:31:52 -05:00
f24a228a93 Speed up tokenization process 2019-12-13 14:50:35 -05:00
c8ed1c82c8 [SQUAD] Load checkpoint when evaluating without training 2019-12-13 12:13:48 -05:00
5c00e344c1 update model doc - switch 3B/11B to 3b/11b 2019-12-13 16:33:29 +01:00
0b51532ce9 Reintroducing the batch_encode_plus method 2019-12-13 16:22:50 +01:00
110394b2ba Merge branch 'master' into t5 2019-12-13 16:03:32 +01:00
5a5c4349e8 Fix summarization to_cpu doc 2019-12-13 10:02:33 -05:00
8ade204098 fix tf 2019-12-13 14:48:47 +01:00
47f0e3cfb7 cleaning up configuration classes 2019-12-13 14:33:24 +01:00
8938b546bf Removed from_config 2019-12-13 14:27:04 +01:00
1ca52567a4 Allow model conversion in the pipeline allocator. 2019-12-13 14:13:14 +01:00
28e64ad5a4 Raise an exception if the pipeline allocator can't determine the tokenizer from the model. 2019-12-13 14:12:54 +01:00
be5bf7b81b Added NER pipeline. 2019-12-13 14:12:17 +01:00
80eacb8f16 Adding labels mapping for classification models in their respective config. 2019-12-13 14:10:22 +01:00
33e72b08d5 fix inner dimensions for 3B/11B models 2019-12-13 11:33:05 +01:00
9b312f9d41 initial version for roberta squad 2019-12-13 14:51:40 +08:00
40ed717232 Merge remote-tracking branch 'refs/remotes/huggingface/master' 2019-12-13 09:10:17 +08:00
7296f1010b Cleanup squad and add allow train_file and predict_file usage 2019-12-12 13:01:04 -05:00
5d67aa21ae [doc] Replicate doc from #2144 2019-12-12 12:39:41 -05:00
3fd71c4431 Update example scripts 2019-12-12 12:08:54 -05:00
fe92755b99 Fix special tokens mask in encode 2019-12-12 11:37:19 -05:00
fbf5455a86 Fix typo in examples/run_glue.py args declaration.
deay -> decay
2019-12-12 11:16:19 -05:00
f19dad61c7 fixing XLM conversion tests with dummy input 2019-12-12 14:46:30 +01:00
f69dbecc38 Expose classification labels mapping (and reverse) in model config. 2019-12-12 10:25:36 +01:00
90df44f0aa Merge pull request #2063 from guillaume-be/special_tokens_mask_value_not_used
special_tokens_mask value was unused and calculated twice
2019-12-12 08:21:46 +01:00
707f9e9241 Merge pull request #2081 from pglock/patch-1
handle string with only whitespaces as empty
2019-12-12 08:20:43 +01:00
137e20a846 Merge pull request #2075 from huggingface/check-link-validity
Check link validity
2019-12-12 08:09:12 +01:00
d5712f7cac Merge branch 'master' into check-link-validity 2019-12-12 08:00:51 +01:00
9c58b236ef Merge pull request #2144 from huggingface/from-pretrained-from-url
Allowing from_pretrained to load from url directly
2019-12-12 07:43:40 +01:00
413f41921b fix merge 2019-12-12 07:34:42 +01:00
386a93f0f8 Merge branch 'master' into from-pretrained-from-url 2019-12-12 07:31:05 +01:00
2d103546ef Merge pull request #2148 from huggingface/fix_encode_plus
Fix encode plus
2019-12-12 07:24:47 +01:00
1748fdf657 [doc] Fix rst table 2019-12-11 18:32:27 -05:00
36fc52a3b4 Update links to weights 2019-12-11 18:32:27 -05:00
371c5ddfad Py2 tests for Lysandre 2019-12-11 18:32:27 -05:00
5505cf7014 Run tests on Py2 too, for Lysandre 2019-12-11 18:32:27 -05:00
9cb97c0c0f Actually run the tests 2019-12-11 18:32:27 -05:00
95854c4a2f Actually run the tests 2019-12-11 18:32:27 -05:00
d2100428d3 Update to new test infra and only run conditionally 2019-12-11 18:32:27 -05:00
597ba7feb3 Support testing Japanese BERT tokenizers 2019-12-11 18:32:27 -05:00
6a43dc9d7d Support Python 2 2019-12-11 18:32:27 -05:00
a09da4eeb0 Add a test for Japanese BERT tokenizers 2019-12-11 18:32:27 -05:00
57b5cb3eaa Fix loading BertJapaneseTokenizer 2019-12-11 18:32:27 -05:00
c03c0dfd23 Add support for Japanese BERT models by cl-tohoku 2019-12-11 18:32:27 -05:00
4f15e5a267 Add tests.
Maybe not the best possible place for the tests, lmk.
2019-12-11 17:41:51 -05:00
18e1f751f1 TF support 2019-12-11 17:07:46 -05:00
31e5b5ff22 Fix tests + first example of doc 2019-12-11 15:22:02 -05:00
3d57c51111 Fix encode plus 2019-12-11 15:10:17 -05:00
c999a3e505 Allow from_pretrained to take a remote identifier 2019-12-11 12:29:58 -05:00
030faccb8d doc: fix pretrained models table 2019-12-11 12:19:21 -05:00
6709739a05 allowing from_pretrained to load from url directly 2019-12-11 18:15:45 +01:00
29570db25b allowing from_pretrained to load from url directly 2019-12-11 17:19:18 +01:00
2e2f9fed55 rm duplicate imports 2019-12-11 11:11:56 -05:00
c28273793e Add missing DistilBert and Roberta to AutoModelForTokenClassification 2019-12-11 15:31:45 +01:00
4c12860f7a Remove misleading documentation 2019-12-11 09:22:37 -05:00
b040bff6df Added supported model to AutoModelForTokenClassification 2019-12-11 14:13:58 +01:00
fafd4c86ec fix TF 2.0 version of T5 - update conversion script 2019-12-11 13:47:27 +01:00
6aa919469d Update run_xnli to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
89896fe04f Update run_ner to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
fdc05cd68f Update run_squad to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:31:22 -06:00
854ec5784e Update run_glue to save optimizer and scheduler states, then resume training from a checkpoint 2019-12-10 19:30:36 -06:00
9a24e0cf76 Refactored qa pipeline argument handling + unittests 2019-12-11 00:33:25 +01:00
b72f9d340e Correct index in script 2019-12-10 18:33:17 -05:00
51ae203290 Merge pull request #2129 from leopd/master
Progress indicator improvements when downloading pre-trained models.
2019-12-10 22:18:55 +01:00
ec6fb25c21 Patch documentation 2019-12-10 15:49:20 -05:00
418589244d Uniforming the ignored indices 2019-12-10 15:26:19 -05:00
58d75aa310 Progress indicator improvements when downloading pre-trained models. 2019-12-10 11:36:56 -08:00
6a73382706 Complete warning + cleanup 2019-12-10 14:33:24 -05:00
dc4e9e5cb3 DataParallel for SQuAD + fix XLM 2019-12-10 19:21:20 +00:00
67a8be8e90 fix backward in tests 2019-12-10 17:50:32 +01:00
07bc8efbc3 add greedy decoding and sampling 2019-12-10 17:27:50 +01:00
63e36007ee Make sure padding, cls and other non-context tokens cannot appear in the answer. 2019-12-10 16:47:35 +01:00
f2538c1274 all tests in torch no grad 2019-12-10 16:33:11 +01:00
a5df980c5b updating distilbert test 2019-12-10 16:01:15 +01:00
40a39ab650 Reuse recent SQuAD refactored data structure inside QA pipelines. 2019-12-10 15:59:38 +01:00
7c3a15ace9 Merge branch 'master' into t5 2019-12-10 15:36:54 +01:00
981a5c8c17 updating models urls 2019-12-10 15:36:19 +01:00
e6cff60b4c Merge pull request #2069 from huggingface/cleaner-pt-tf-conversion
clean up PT <=> TF conversion
2019-12-10 15:34:08 +01:00
4b82c485de remove misplaced summarization documentation 2019-12-10 09:13:33 -05:00
8ae1044f80 updating tests and TF 2.0 model 2019-12-10 15:11:07 +01:00
aae74065df Added QuestionAnsweringPipeline unit tests. 2019-12-10 13:37:20 +01:00
a7d3794a29 Remove token_type_ids for compatibility with DistilBert 2019-12-10 13:37:20 +01:00
fe0f552e00 Use attention_mask everywhere. 2019-12-10 13:37:20 +01:00
348e19aa21 Expose attention_masks and input_lengths arguments to batch_encode_plus 2019-12-10 13:37:18 +01:00
c2407fdd88 Enable the Tensorflow backend. 2019-12-10 13:37:14 +01:00
f116cf599c Allow hiding frameworks through environment variables (NO_TF, NO_TORCH). 2019-12-10 13:37:07 +01:00
6e61e06051 batch_encode_plus generates the encoder_attention_mask to avoid attending over padded values. 2019-12-10 13:37:07 +01:00
02110485b0 Added batching, topk, chars index and scores. 2019-12-10 13:36:55 +01:00
e1d89cb24d Added QuestionAnsweringPipeline with batch support. 2019-12-10 13:36:55 +01:00
0558c9cb9b Merge branch 'master' into t5 2019-12-10 12:58:48 +01:00
81babb227e Added download command through the cli.
It allows pre-downloading models and tokenizers.
2019-12-10 12:18:59 +01:00
31a3a73ee3 updating CLI 2019-12-10 12:18:59 +01:00
7c1697562a compatibility with sklearn and keras 2019-12-10 12:12:22 +01:00
b81ab431f2 updating AutoModels and AutoConfiguration - adding pipelines 2019-12-10 12:11:33 +01:00
2d8559731a add pipeline - train 2019-12-10 11:34:16 +01:00
72c36b9ea2 [WIP] - CLI 2019-12-10 11:33:14 +01:00
e57d00ee10 Merge pull request #1984 from huggingface/squad-refactor
[WIP] Squad refactor
2019-12-10 11:07:26 +01:00
ecabbf6d28 Merge pull request #2107 from huggingface/encoder-mask-shape
create encoder attention mask from shape of hidden states
2019-12-10 10:07:56 +01:00
608a8f5b56 updating tf 2.0 layer_norm to T5 layer norm 2019-12-10 10:01:01 +01:00
df3961121f Add MMBT Model to Transformers Repo 2019-12-09 18:36:48 -08:00
1d18930462 Harmonize no_cuda flag with other scripts 2019-12-09 20:37:55 -05:00
f7eba09007 clean for release 2019-12-09 20:37:55 -05:00
2a64107e44 improve device usage 2019-12-09 20:37:55 -05:00
c0707a85d2 add README 2019-12-09 20:37:55 -05:00
ade3cdf5ad integrate ROUGE 2019-12-09 20:37:55 -05:00
076602bdc4 prevent BERT weights from being downloaded twice 2019-12-09 20:37:55 -05:00
5909f71028 add py-rouge dependency 2019-12-09 20:37:55 -05:00
a1994a71ee simplified model and configuration 2019-12-09 20:37:55 -05:00
3a9a9f7861 default output dir to documents dir 2019-12-09 20:37:55 -05:00
693606a75c update the docs 2019-12-09 20:37:55 -05:00
c0443df593 remove beam search 2019-12-09 20:37:55 -05:00
2403a66598 give transformers API to BertAbs 2019-12-09 20:37:55 -05:00
4d18199902 cast bool tensor to long for pytorch < 1.3 2019-12-09 20:37:55 -05:00
9f75565ea8 setup training 2019-12-09 20:37:55 -05:00
4735c2af07 tweaks to the BeamSearch API 2019-12-09 20:37:55 -05:00
ba089c780b share pretrained embeddings 2019-12-09 20:37:55 -05:00
9660ba1cbd Add beam search 2019-12-09 20:37:55 -05:00
1c71ecc880 load the pretrained weights for encoder-decoder
We currently save the pretrained_weights of the encoder and decoder in
two separate directories `encoder` and `decoder`. However, for the
`from_pretrained` function to operate with automodels we need to
specify the type of model in the path to the weights.

The path to the encoder/decoder weights is handled by the
`PreTrainedEncoderDecoder` class in the `save_pretrained` function. Since
there is no easy way to infer the type of model that was initialized for
the encoder and decoder we add a parameter `model_type` to the function.
This is not an ideal solution as it is error prone, and the model type
should be carried by the Model classes somehow.

This is a temporary fix that should be changed before merging.
2019-12-09 20:37:55 -05:00
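A rough sketch of the temporary layout this message describes, with assumed names (the real `PreTrainedEncoderDecoder` internals are not shown): weights go to per-part subdirectories, and the `model_type` passed by the caller ends up in the path so automodels can later infer the class.

```python
import os

def save_encoder_decoder(model, save_dir, model_type):
    # Sketch of the temporary fix (assumed helper, not the library's code):
    # embed the model type in the weight paths, e.g. <save_dir>/bert/encoder.
    for name, part in (("encoder", model.encoder), ("decoder", model.decoder)):
        path = os.path.join(save_dir, model_type, name)
        os.makedirs(path, exist_ok=True)
        part.save_pretrained(path)
```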
07f4cd73f6 update function to add special tokens
Since I started my PR the `add_special_token_single_sequence` function
has been deprecated in favor of another; I replaced it with the new function.
2019-12-09 20:37:55 -05:00
5c877fe94a fix albert links 2019-12-09 18:53:00 -05:00
79526f82f5 Remove unnecessary epoch variable 2019-12-09 16:24:35 -05:00
9626e0458c Add functionality to continue training from last saved global_step 2019-12-09 16:24:35 -05:00
2d73591a18 Stop saving current epoch 2019-12-09 16:24:35 -05:00
0eb973b0d9 Use saved optimizer and scheduler states if available 2019-12-09 16:24:35 -05:00
a03fcf570d Save tokenizer after each epoch to be able to resume training from a checkpoint 2019-12-09 16:24:35 -05:00
f71b1bb05a Save optimizer state, scheduler state and current epoch 2019-12-09 16:24:35 -05:00
8e651f56b7 fix tf tests 2019-12-09 22:13:57 +01:00
808bb8da7e fix transfo xl tests 2019-12-09 21:48:34 +01:00
b016dd16c9 fix tests on python 3.5 2019-12-09 21:38:07 +01:00
2a4ef098d6 Add ALBERT and XLM to SQuAD script 2019-12-09 10:46:47 -05:00
00c4e39581 Merge branch 'master' into squad-refactor 2019-12-09 10:41:15 -05:00
169fea6855 updating T5 2019-12-09 16:25:33 +01:00
3520be7824 create encoder attention mask from shape of hidden states
We currently create encoder attention masks (when they're not provided)
based on the shape of the inputs to the encoder. This is obviously
wrong; sequences can be of different lengths. We now create the encoder
attention mask based on the batch_size and sequence_length of the
encoder hidden states.
2019-12-09 11:19:45 +01:00
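A sketch of the corrected behavior with assumed tensor names: the default all-ones mask must take its shape from the encoder hidden states themselves, since the decoder inputs may have a different sequence length.

```python
import torch

def default_encoder_attention_mask(encoder_hidden_states):
    # encoder_hidden_states: (batch_size, src_seq_len, hidden_size)
    # Derive the mask from the encoder output's own shape, never from the
    # (possibly shorter or longer) inputs on the decoder side.
    batch_size, src_seq_len = encoder_hidden_states.shape[:2]
    return torch.ones(batch_size, src_seq_len, device=encoder_hidden_states.device)
```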
0cb163865a Remove pytest dependency. (#2093) 2019-12-07 07:46:14 -05:00
2670b0d682 Fix bug which lowercases special tokens 2019-12-06 16:15:53 -05:00
35401fe50f Remove dependency on pytest for running tests (#2055)
* Switch to plain unittest for skipping slow tests.

Add a RUN_SLOW environment variable for running them.

* Switch to plain unittest for PyTorch dependency.

* Switch to plain unittest for TensorFlow dependency.

* Avoid leaking open files in the test suite.

This prevents spurious warnings when running tests.

* Fix unicode warning on Python 2 when running tests.

The warning was:

    UnicodeWarning: Unicode equal comparison failed to convert both arguments to Unicode - interpreting them as being unequal

* Support running PyTorch tests on a GPU.

Reverts 27e015bd.

* Tests no longer require pytest.

* Make tests pass on cuda
2019-12-06 13:57:38 -05:00
e4679cddce [cli] Uploads: add progress bar (#2078)
* [cli] Uploads: add progress bar

see https://github.com/huggingface/transformers/pull/2044#discussion_r354057827 for context

* rename + documentation

* Add auto-referential comment
2019-12-06 11:56:23 -05:00
1d87b37d10 updating 2019-12-06 15:30:09 +01:00
4cb9b60558 Merge pull request #2077 from patrickvonplaten/change_documentation_for_past_output_shape
corrected documentation for past tensor shape for ctrl and gpt2 model
2019-12-06 12:14:48 +01:00
5482822a2b Merge pull request #2046 from jplu/tf2-ner-example
Add NER TF2 example.
2019-12-06 12:12:22 +01:00
fc1bb1f867 Merge pull request #2068 from huggingface/fix-2042
Nicer error message when Bert's input is missing batch size
2019-12-06 12:06:42 +01:00
21451ec6ba handle string with only whitespaces as empty 2019-12-06 10:32:43 +01:00
f230d91b43 check the validity of links
We add a script and a CI workflow to check that all download links
present in the source code are valid.
2019-12-06 09:41:28 +01:00
d0383e4daf corrected documentation for past tensor shape for ctrl and gpt2 model 2019-12-06 01:24:22 +01:00
e9217da5ff Cleanup
Improve global visibility in the run_squad script, remove unused files, and apply fixes related to XLNet.
2019-12-05 16:01:51 -05:00
9ecd83dace Patch evaluation for impossible values + cleanup 2019-12-05 14:44:57 -05:00
35ff345fc9 update requirements 2019-12-05 12:07:04 -05:00
552c44a9b1 release distilm-bert 2019-12-05 10:14:58 -05:00
ee53de7aac Pr for pplm (#2060)
* license

* changes

* ok

* Update paper link and commands to run

* pointer to uber repo
2019-12-05 09:20:07 -05:00
f8fb4335c9 clean up a little bit PT <=> TF conversion 2019-12-05 15:19:32 +01:00
bebaa14039 Merge pull request #2045 from aaugustin/remove-dead-code
Remove dead code in tests.
2019-12-05 14:41:56 +01:00
18fb93530b fixing #2042 - Nicer error message 2019-12-05 14:36:34 +01:00
2d5d86e037 fix #2031 2019-12-05 14:06:29 +01:00
af077b15e2 Merge pull request #2065 from huggingface/fixing-camembert
Fixing camembert tokenization
2019-12-05 13:45:44 +01:00
3268ebd229 fix xlnet test 2019-12-05 13:35:29 +01:00
6c5297a423 Fixing camembert tokenization 2019-12-05 13:27:58 +01:00
9200a759d7 Add few tests on the TF optimization file with some info in the documentation. Complete the README. 2019-12-05 12:56:43 +01:00
1f179f095f Merge pull request #2011 from AdityaSoni19031997/patch-1
typo fix on the docs as per Pytorch v1.1+
2019-12-05 12:39:04 +01:00
1eaf44e713 Merge pull request #2007 from roskoN/xlnet_attention_fix
fixed XLNet attention output for both attention streams whenever target_mapping is provided
2019-12-05 12:32:39 +01:00
71e4693f08 fix #1968 2019-12-05 12:14:24 +01:00
f9f395b21c Merge pull request #1735 from ondewo/tf-do-not-use-gpu-on-import
Do not use GPU when importing transformers
2019-12-05 11:56:48 +01:00
75a97af6bc fix #1450 - add doc 2019-12-05 11:26:55 +01:00
8b388827b5 fix #1920 2019-12-05 11:18:43 +01:00
d425a4d60b Merge pull request #1870 from alexzubiaga/xlnet-for-token-classification
XLNet for Token classification
2019-12-05 09:54:09 +01:00
1eb89ddf73 Merge pull request #2044 from huggingface/cli_upload
CLI for authenticated file sharing
2019-12-05 09:44:07 +01:00
7f998b1b83 special_tokens_mask value was unused and calculated twice 2019-12-05 09:01:39 +01:00
fb0d2f1da1 preparing release distil-mBERT 2019-12-05 03:00:16 -05:00
3ba417e1a8 [cli] ls: Tabular formatting 2019-12-04 18:40:52 -05:00
ce158a076f Return dataset (pytorch) 2019-12-04 17:55:52 -05:00
7a03519975 Documentation 2019-12-04 17:24:35 -05:00
96fa9a8a70 Python 2 + Post mime-type to S3 2019-12-04 17:22:50 -05:00
33508ae310 Remove only_first 2019-12-04 16:26:45 -05:00
f7e4a7cdfa Cleanup 2019-12-04 16:24:15 -05:00
a7ca6d738b Padding side is tokenizer-dependent 2019-12-04 15:43:34 -05:00
cca75e7884 Kill the demon spawn 2019-12-04 15:42:29 -05:00
bf119c0568 TFDS dataset can now be evaluated 2019-12-04 11:34:59 -05:00
ff98b041da Fix whitespace issue 2019-12-04 16:53:06 +01:00
9ddc3f1a12 Naming update + XLNet/XLM evaluation 2019-12-04 10:37:00 -05:00
5bfcd0485e fix #1991 2019-12-04 14:53:11 +01:00
cae641ff26 Merge pull request #1846 from tamuhey/patch/iss1845
fix summary_type value of SequenceSummary
2019-12-04 13:28:39 +01:00
254ebb979c Bugfix on init file. Missing comma. 2019-12-04 10:00:25 +01:00
ecb923da9c Create a NER example similar to the Pytorch one. It takes the same options, and can be run the same way. 2019-12-04 09:43:15 +01:00
40255ab002 Remove dead code in tests. 2019-12-04 08:21:02 +01:00
e4fbf3e2cc CLI for authenticated file sharing 2019-12-04 00:52:23 -05:00
de276de1c1 Working evaluation 2019-12-03 17:15:51 -05:00
7edb51f3a5 [pplm] split classif head into its own file 2019-12-03 22:07:25 +00:00
c835bc85c2 Compute predictions 2019-12-03 15:28:16 -05:00
285b1241e3 Added SquadResult 2019-12-03 15:00:49 -05:00
8101924a68 Patch: v2.2.1 2019-12-03 11:20:26 -05:00
48cbf267c9 Use full dataset for eval (SequentialSampler in Distributed setting) 2019-12-03 11:01:37 -05:00
f434bfc623 [pplm] Update S3 links
Co-Authored-By: Piero Molino <w4nderlust@gmail.com>
2019-12-03 10:53:02 -05:00
96e83506d1 Always use SequentialSampler during evaluation
When evaluating, shouldn't we always use the SequentialSampler instead of DistributedSampler? Evaluation only runs on 1 GPU no matter what, so if you use the DistributedSampler with N GPUs, I think you'll only evaluate on 1/N of the evaluation set. That's at least what I'm finding when I run an older/modified version of this repo.
2019-12-03 10:15:39 -05:00
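A self-contained sketch of the fix this message argues for: evaluation runs on a single process, so sample the full dataset in order instead of taking a 1/N DistributedSampler shard. The dataset below is a stand-in.

```python
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

eval_dataset = TensorDataset(torch.arange(10))  # stand-in for the real eval set
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=SequentialSampler(eval_dataset),  # full set, in order, one process
    batch_size=4,
)
for (batch,) in eval_dataloader:
    pass  # every example is seen exactly once
```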
3b48806f75 [pplm] README: add setup + tweaks 2019-12-03 10:14:02 -05:00
0cb2c90890 readme
Co-Authored-By: Rosanne Liu <mimosavvy@gmail.com>
2019-12-03 10:14:02 -05:00
1efb2ae7fc [pplm] move scripts under examples/pplm/ 2019-12-03 10:14:02 -05:00
a59fdd1627 generate_text_pplm now works with batch_size > 1 2019-12-03 10:14:02 -05:00
893d0d64fe Changed order of some parameters to be more consistent. Identical results. 2019-12-03 10:14:02 -05:00
f42816e7fc Added additional check for url and path in discriminator model params 2019-12-03 10:14:02 -05:00
f10b925015 Improvements: model_path renamed to pretrained_model, tokenizer loaded from pretrained_model, pretrained_model set to the discriminator's when discrim is specified, sample = False by default but a cli parameter introduced. To obtain identical samples call the cli with --sample 2019-12-03 10:14:02 -05:00
75904dae66 Removed global variable device 2019-12-03 10:14:02 -05:00
7fd54b55a3 Added support for generic discriminators 2019-12-03 10:14:02 -05:00
b0eaff36e6 Added a +1 to epoch when saving weights 2019-12-03 10:14:02 -05:00
611961ade7 Added tqdm to preprocessing 2019-12-03 10:14:02 -05:00
afc7dcd94d Now run_pplm works on cpu. Identical output as before (when using gpu). 2019-12-03 10:14:02 -05:00
61399e5afe Cleaned perturb_past. Identical output as before. 2019-12-03 10:14:02 -05:00
ffc2935405 Fix for making unconditioned generation work. Identical output as before. 2019-12-03 10:14:02 -05:00
9f693a0c48 Cleaned generate_text_pplm. Identical output as before. 2019-12-03 10:14:02 -05:00
61a12f790d Renamed SmallConst to SMALL_CONST and introduced BIG_CONST. Identical output as before. 2019-12-03 10:14:02 -05:00
ef47b2c03a Removed commented code. Identical output as before. 2019-12-03 10:14:02 -05:00
7ea12db3f5 Removed commented code. Identical output as before. 2019-12-03 10:14:02 -05:00
08c6e456a3 Cleaned full_text_generation. Identical output as before. 2019-12-03 10:14:02 -05:00
6c9c131780 More cleanup for run_model. Identical output as before. 2019-12-03 10:14:02 -05:00
7ffe47c888 Improved device specification 2019-12-03 10:14:02 -05:00
4f2164e40e First cleanup step, changing function names and passing parameters all the way through without using args. Identical output as before. 2019-12-03 10:14:02 -05:00
821de121e8 Minor changes 2019-12-03 10:14:02 -05:00
7469d03b1c Fixed minor bug when running training on cuda 2019-12-03 10:14:02 -05:00
0b51fba20b Added script for training a discriminator for pplm to use 2019-12-03 10:14:02 -05:00
34a83faabe Let's make PPLM great again 2019-12-03 10:14:02 -05:00
d5faa74cd6 tokenizer white space: revert to previous behavior 2019-12-03 10:14:02 -05:00
0b77d66a6d rm extraneous import 2019-12-03 10:14:02 -05:00
83b1e6ac9e fix the loss backward issue
(cherry picked from commit 566468cc984c6ec7e10dfc62b5b4191781a99cd2)
2019-12-03 10:14:02 -05:00
572c24cfa2 PPLM (squashed)
Co-authored-by: piero <piero@uber.com>
Co-authored-by: Rosanne Liu <mimosavvy@gmail.com>
2019-12-03 10:14:02 -05:00
f19a78a634 Merge pull request #1903 from valohai/master
Valohai integration
2019-12-03 16:13:01 +01:00
d100ad99c0 Merge pull request #2014 from aaugustin/mark-tf-auto-model-test-as-slow
Mark tests in TFAutoModelTest as slow.
2019-12-03 16:03:48 +01:00
66fc8d25a5 Change ref to original GLUE downloader script 2019-12-03 10:49:50 +02:00
fbaf05bd92 Remove annoying tokenization message 2019-12-02 18:23:00 -05:00
e85855f2c4 Fix ALBERT exports with pretraining + sp classifier; Fix naming for ALBERT TF models 2019-12-02 18:00:19 -05:00
b3d834ae11 Reorganize ALBERT conversion script 2019-12-02 15:01:52 -05:00
f3776df0f3 WIP debugging 2019-12-02 15:47:00 +01:00
5ab93083e4 Mark tests in TFAutoModelTest as slow.
Each test forces downloading the same 536MB file, which is slow
even with a decent internet connection.
2019-12-01 18:25:15 +01:00
c356290c8d typo fix as per Pytorch v1.1+ 2019-12-01 14:08:14 +05:30
76c0bc06d5 [XLNet] Changed post-processing of attention w.r.t. target_mapping
Whenever target_mapping is provided to the input, XLNet outputs two different attention streams.
Based on that, the attention output will be one of the two:
- a list of tensors (usual case for most transformers)
- a list of 2-tuples of tensors, one tensor for each of the attention streams
Docs and unit-tests have been updated
2019-11-30 21:01:04 +01:00
b90791e950 fixed XLNet attention output for both attention streams 2019-11-30 15:57:51 +01:00
b0ee7c7df3 Added Camembert to available models 2019-11-29 14:17:02 -05:00
ecf15ebf3b Add ALBERT to AutoClasses 2019-11-29 11:25:37 -05:00
4a666885b5 reducing my level of enthusiasm 2019-11-29 09:40:50 -05:00
adb5c79ff2 update all tf.shape and tensor.shape to shape_list 2019-11-29 09:40:50 -05:00
2421e54f8c Add link to original source and license to download_glue_data.py 2019-11-29 15:39:28 +02:00
41aa0e8003 Refactor logs and fix loss bug 2019-11-29 15:33:25 +02:00
1ab8dc44b3 Merge pull request #1876 from huggingface/mean-fix
Mean does not exist in TF2
2019-11-29 09:26:33 +01:00
f0d22b6363 Merge pull request #1873 from stefan-it/distilbert-german
German DistilBERT
2019-11-29 09:25:47 +01:00
1e9ac5a7cf New -> normal 2019-11-28 17:43:47 -05:00
0b84b9fd8a Add processors to __init__ 2019-11-28 17:38:52 -05:00
f671997ef7 Interface with TFDS 2019-11-28 17:17:20 -05:00
bd41e8292a Cleanup & Evaluation now works 2019-11-28 16:03:56 -05:00
d49c43ff78 Merge pull request #1778 from eukaryote31/patch-2
from_pretrained: convert DialoGPT format
2019-11-28 16:08:37 +01:00
91caf2462c Merge pull request #1770 from huggingface/initi-encoder-mask
Only init encoder_attention_mask if stack is decoder
2019-11-28 16:06:55 +01:00
49a69d5b78 Merge pull request #1753 from digantamisra98/patch-1
Added Mish Activation Function
2019-11-28 15:24:08 +01:00
96e7ee7238 Merge pull request #1740 from huggingface/fix-ctrl-past
Fix CTRL past
2019-11-27 23:28:30 +01:00
8da47b078d fix merge tests 2019-11-27 23:11:37 +01:00
8c276b9c92 Merge branch 'master' into distilbert-german 2019-11-27 18:11:49 +01:00
3c28a2daac add add_special_tokens=True for input examples 2019-11-27 12:05:23 -05:00
a36f981d1b Merge branch 'master' into fix-ctrl-past 2019-11-27 17:25:46 +01:00
5afca00b47 Merge pull request #1724 from huggingface/fix_encode_plus
Fix encode_plus
2019-11-27 17:14:49 +01:00
49108288ba Merge pull request #1624 from Huawei-MRC-OSI/resumable_http
Add support for resumable downloads for HTTP protocol.
2019-11-27 17:11:07 +01:00
5340d1f21f Merge branch 'master' into resumable_http 2019-11-27 17:10:36 +01:00
10bd1ddb39 soft launch distilbert multilingual 2019-11-27 11:07:22 -05:00
d5478b939d add distilbert + update run_xnli wrt run_glue 2019-11-27 11:07:22 -05:00
07ab8d7af6 fix bug 2019-11-27 11:07:22 -05:00
d474022639 cleaning simple_accuracy since not used anymore 2019-11-27 11:07:22 -05:00
bcd8dc6b48 move xnli_compute_metrics to data/metrics 2019-11-27 11:07:22 -05:00
73fe2e7385 remove fstrings 2019-11-27 11:07:22 -05:00
3e7656f7ac update readme 2019-11-27 11:07:22 -05:00
abd397e954 uniformize w/ the cache_dir update 2019-11-27 11:07:22 -05:00
d75d49a51d add XnliProcessor to doc 2019-11-27 11:07:22 -05:00
d5910b312f move xnli processor (and utils) to transformers/data/processors 2019-11-27 11:07:22 -05:00
289cf4d2b7 change default for XNLI: dev --> test 2019-11-27 11:07:22 -05:00
cb7b77a8a2 fix some typos 2019-11-27 11:07:22 -05:00
84a0b522cf mbert reproducibility results 2019-11-27 11:07:22 -05:00
c4336ecbbd xnli - output_mode consistency 2019-11-27 11:07:22 -05:00
d52e98ff9a add xnli examples/README.md 2019-11-27 11:07:22 -05:00
71f71ddb3e run_xnli + utils_xnli 2019-11-27 11:07:22 -05:00
b5d884d25c Uniformize #1952 2019-11-27 11:05:55 -05:00
7fd1d42a01 Merge pull request #1592 from watkinsm/do_lower_case
Consider do_lower_case in PreTrainedTokenizer
2019-11-27 17:05:18 +01:00
21637d4924 Merge branch 'master' into do_lower_case 2019-11-27 17:04:39 +01:00
de2696f68e suggest to track repo w/ https rather than ssh 2019-11-27 11:02:28 -05:00
88b317739f Fix issue #1962: input's shape seems to cause an error in the 2.2.0 version of tf_albert_model 2019-11-27 10:38:10 -05:00
45d767297a Updated v2.2.0 doc 2019-11-27 10:12:20 -05:00
361620954a Remove TFBertForPreTraining from ALBERT doc 2019-11-27 10:11:37 -05:00
cc7968227e Updated v2.2.0 doc 2019-11-26 15:52:25 -05:00
ce02550d50 Fix pretrained models table 2019-11-26 15:47:02 -05:00
cf26a0c85e Fix pretrained models table 2019-11-26 15:40:03 -05:00
44b82c777f Updated v2.2.0 doc 2019-11-26 15:15:11 -05:00
ee4647bd5c CamemBERT & ALBERT doc 2019-11-26 15:10:51 -05:00
7c6000e412 Updated v2.2.0 doc 2019-11-26 14:55:29 -05:00
668aac45d2 Pretrained models 2019-11-26 14:52:42 -05:00
8742baa531 Improve test protocol for inputs_embeds in TF 2019-11-26 14:39:47 -05:00
cf62bdc962 Improve test protocol for inputs_embeds in TF
cc @lysandrejik
2019-11-26 14:37:32 -05:00
b632145273 Update master documentation link in README 2019-11-26 14:27:15 -05:00
ae98d45991 Release: v2.2.0 2019-11-26 14:12:44 -05:00
f2f329408d Fix input embeddings 2019-11-26 13:08:12 -05:00
bdfe21ab24 Change param order for consistency 2019-11-26 13:08:12 -05:00
c536c2a480 ALBERT Input Embeds 2019-11-26 13:08:12 -05:00
f873b55e43 Warning for ALBERT-v2 models 2019-11-26 13:08:12 -05:00
c9cb7f8a0f Torch 1.1.0 compatibility + FP16 O1 + TF checkpoints
Co-authored-by: wassname
2019-11-26 13:08:12 -05:00
b18509c208 Tests for ALBERT in TF2 + fixes 2019-11-26 13:08:12 -05:00
7bddbf5961 TFAlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
f6f382532b ALBERT in TF2 2019-11-26 13:08:12 -05:00
d9daad98c7 Re-ordering of group_idx/layer_idx + Python 2 tests 2019-11-26 13:08:12 -05:00
9d5c49546f Tests for AlbertForQuestionAnswering AlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
16263f9685 Headmasking 2019-11-26 13:08:12 -05:00
abb23a78ba Head pruning for ALBERT 2019-11-26 13:08:12 -05:00
4374eaea78 ALBERT for SQuAD 2019-11-26 13:08:12 -05:00
70d99980de ALBERT-V2 2019-11-26 13:08:12 -05:00
c110c41fdb Run GLUE and remove LAMB 2019-11-26 13:08:12 -05:00
6637a77f80 AlbertForSequenceClassification 2019-11-26 13:08:12 -05:00
0d07a23c04 LAMB implementation 2019-11-26 13:08:12 -05:00
c987545592 Converting script 2019-11-26 13:08:12 -05:00
4f3a54bfc8 ALBERT can load pre-trained models. Doesn't inherit from BERT anymore. 2019-11-26 13:08:12 -05:00
c4403006b8 External MLM head 2019-11-26 13:08:12 -05:00
b21402fc86 Python 2 tests + licence 2019-11-26 13:08:12 -05:00
c14a22272f ALBERT passes all tests 2019-11-26 13:08:12 -05:00
870320a24e Early tests 2019-11-26 13:08:12 -05:00
25a31953e8 Output Attentions + output hidden states 2019-11-26 13:08:12 -05:00
ce9eade29c Initializer range using BertPreTrainedModel 2019-11-26 13:08:12 -05:00
5680a11063 Activation function managed from the config file 2019-11-26 13:08:12 -05:00
1e5b31c388 Several fixes and improvements 2019-11-26 13:08:12 -05:00
ee20201d33 Tokenization tests + fixes + init 2019-11-26 13:08:12 -05:00
e3ea5d1d8d Docstrings 2019-11-26 13:08:12 -05:00
fedac786d4 Tokenization + small fixes 2019-11-26 13:08:12 -05:00
67b422662c Documentation + improved AlbertForMaskedLM 2019-11-26 13:08:12 -05:00
1b92564330 Reorganize and cleanup 2019-11-26 13:08:12 -05:00
12290c0d5c Handles multi layer and multi groups 2019-11-26 13:08:12 -05:00
139affaa8d Albert layer/layer groups 2019-11-26 13:08:12 -05:00
91ccbae788 Accepts multiple sizes 2019-11-26 13:08:12 -05:00
c0c2088333 ALBERT model 2019-11-26 13:08:12 -05:00
8e5d84fcc1 Fixed typo 2019-11-26 09:01:32 -05:00
0669c1fcd1 SQuAD v2 BERT + XLNet 2019-11-25 19:22:21 -05:00
5d3b8daad2 Minor bug fixes on run_ner.py 2019-11-25 16:48:03 -05:00
aa92a184d2 resize model when special tokens are present 2019-11-25 15:06:32 -05:00
07bf43074f Fix GPT2 docstring 2019-11-25 11:32:00 -05:00
fa963ecc59 if→elif 2019-11-25 10:21:03 -05:00
c8eb8157b8 fix docstrings 2019-11-25 10:21:03 -05:00
99f750d64e add Camembert models to modeling_auto 2019-11-25 10:21:03 -05:00
7485caefb0 fix #1894 2019-11-25 09:33:39 -05:00
afaa335851 [doc] Fix assets urls 2019-11-23 11:34:45 -05:00
176cd1ce1b [doc] homogenize instructions slightly 2019-11-23 11:18:54 -05:00
041a901f32 Fix typo in documentation. toto -> to 2019-11-23 10:55:16 -05:00
e0e55bc550 Manage training example & refactor the refactor 2019-11-22 16:27:45 -05:00
c3ba645237 Works for XLNet 2019-11-22 16:27:37 -05:00
a5a8a6175f Works for BERT 2019-11-22 16:27:31 -05:00
a7dafe2f41 Padding strategy (left and right) rather than boolean flag 2019-11-22 16:27:25 -05:00
9f374c8252 encode and encode_plus handle attention masks and padding 2019-11-22 16:27:15 -05:00
72e506b22e wip 2019-11-22 16:26:00 -05:00
ea52f82455 Moved some SQuAD logic to /data 2019-11-22 16:25:52 -05:00
26db31e0c0 update the documentation 2019-11-21 14:41:19 -05:00
6f70bb8c69 add instructions to run the examples 2019-11-21 14:41:19 -05:00
05d4232f63 Add valohai.yaml 2019-11-21 12:38:17 +02:00
aac3551407 Add download_glue_data.py from kamalkraj/ALBERT-TF2.0
Original source: fa90194e5f/download_glue_data.py
Original license: fa90194e5f/LICENSE (Apache-2.0)
2019-11-21 12:37:41 +02:00
2cf3447e0a Glue: log in Valohai-compatible JSON format too 2019-11-21 12:35:25 +02:00
0cdfcca24b Merge pull request #1860 from stefan-it/camembert-for-token-classification
[WIP] Add support for CamembertForTokenClassification
2019-11-21 10:56:07 +01:00
e70cdf083d Cleanup TPU bits from run_glue.py
TPU runner is currently implemented in:
https://github.com/pytorch-tpu/transformers/blob/tpu/examples/run_glue_tpu.py.

We plan to upstream this directly into `huggingface/transformers`
(either `master` or `tpu`) branch once it's been more thoroughly tested.
2019-11-20 17:54:34 -05:00
454455c695 fix #1879 2019-11-20 09:42:48 -05:00
3de31f8d28 mean does not exist in TF2 2019-11-19 18:14:14 -05:00
da06afafc8 tree-wide: add trailing comma in configuration maps 2019-11-19 21:57:00 +01:00
2e2c0375c3 distilbert: add German distilbert model to positional embedding sizes map 2019-11-19 20:41:18 +01:00
e7cf2ccd15 distillation: add German distilbert model 2019-11-19 19:55:19 +01:00
e631383d4f docs: add new German distilbert model to pretrained models 2019-11-19 19:52:40 +01:00
f21dfe36ba distilbert: add vocab for new German distilbert model 2019-11-19 19:51:31 +01:00
22333945fb distilbert: add pytorch model for new German distilbert model 2019-11-19 19:51:01 +01:00
337802783f distilbert: add configuration for new German distilbert model 2019-11-19 19:50:32 +01:00
4193aa9f81 add TFXLNetForTokenClassification implementation and unit test
add XLNetForTokenClassification implementation and unit tests
2019-11-19 12:47:54 +01:00
f3386d9383 typo "deay" -> "decay" 2019-11-18 11:50:06 -05:00
56c84863a1 camembert: add support for CamemBERT in run_ner example 2019-11-18 17:06:57 +01:00
0b3d45eb64 camembert: add implementation for save_vocabulary method 2019-11-18 15:49:44 +01:00
3916b334a8 [camembert] Acknowledge the full author list 2019-11-18 09:29:11 -05:00
44455eb5b6 Adds CamemBERT to Model architectures list 2019-11-18 09:23:14 -05:00
33753d9139 module: import CamembertForTokenClassification 2019-11-18 14:14:54 +01:00
d32ce2c8df camembert: add wrapper for CamembertForTokenClassification 2019-11-18 14:14:19 +01:00
d08a338c3b modified: transformers/modeling_utils.py 2019-11-16 18:47:37 +09:00
0477b307c7 [camembert] tokenizer: use additional_special_tokens 2019-11-16 00:11:07 -05:00
f9abf73e31 [camembert] realign w/ recent changes 2019-11-16 00:11:07 -05:00
26858f27cb [camembert] Upload to s3 + rename script 2019-11-16 00:11:07 -05:00
035fea5315 Add CamemBERT to auto files and docs 2019-11-16 00:11:07 -05:00
694d4fcbb6 Add CamemBERT classes to __init__.py 2019-11-16 00:11:07 -05:00
3e20c2e871 Update demo_camembert.py with new classes 2019-11-16 00:11:07 -05:00
f12e4d8da7 Move demo_camembert.py to examples/contrib 2019-11-16 00:11:07 -05:00
fb6c70a91d Update tokenization_camembert.py with urls 2019-11-16 00:11:07 -05:00
e44b939e71 Add configuration_camembert.py and modeling_camembert.py 2019-11-16 00:11:07 -05:00
6e72fd094c Add demo_camembert.py 2019-11-16 00:11:07 -05:00
14b3aa3b3c Add tokenization_camembert.py 2019-11-16 00:11:07 -05:00
ca99a2d500 Update example readme 2019-11-15 14:55:26 +08:00
7da3ef24cd add is_impossible tensor to model inputs when fine-tuning XLNet on SQuAD 2.0 2019-11-15 14:18:53 +08:00
74ce8de7d8 Merge pull request #1792 from stefan-it/distilbert-for-token-classification
DistilBERT for token classification
2019-11-14 22:47:53 +01:00
05db5bc1af added small comparison between BERT, RoBERTa and DistilBERT 2019-11-14 22:40:22 +01:00
9629e2c676 Merge pull request #1804 from ronakice/master
fix multi-gpu eval in torch examples
2019-11-14 22:24:05 +01:00
5b322a36db Merge pull request #1811 from huggingface/special-tokens
Fix special tokens addition in decoder #1807
2019-11-14 22:17:24 +01:00
1a237d7f42 Merge pull request #1831 from iedmrc/gpt2-tokenization-sum-func-replacement
sum() is replaced by itertools.chain.from_iterable()
2019-11-14 22:11:54 +01:00
df99f8c5a1 Merge pull request #1832 from huggingface/memory-leak-schedulers
replace LambdaLR scheduler wrappers by function
2019-11-14 22:10:31 +01:00
0be9ae7b3e Merge pull request #1833 from huggingface/max-length-warning
Token indices sequence length is longer than the specified maximum sequence length for this model
2019-11-14 22:04:49 +01:00
be7f2aacce [CI][DOC] Don't rebuild if folder exists - Correct directory. 2019-11-14 14:54:44 -05:00
8f8d69716a [CI][DOC] Don't rebuild if folder exists. 2019-11-14 14:48:21 -05:00
2276bf69b7 update the examples, docs and template 2019-11-14 20:38:02 +01:00
d7929899da Specify checkpoint in saved file for run_lm_finetuning.py 2019-11-14 10:49:00 -05:00
a67e747889 Reorganized max_len warning 2019-11-14 10:30:22 -05:00
e18f786cd5 Quickstart example showcasing past 2019-11-14 10:06:00 -05:00
022525b003 replace LambdaLR scheduler wrappers by function
Custom schedulers are currently initiated by wrapping Pytorch's LambdaLR
class and passing a method of the wrapping class to the __init__
function of LambdaLR. This approach is not appropriate for several
reasons:

1. one does not need to define a class when it only defines a
__init__() method;
2. instantiating the parent class by passing a method of the child class
creates a cyclical reference which leads to memory leaks. See issues #1742 and #1134.

In this commit we replace the wrapper classes with functions that
instantiate `LambdaLR` with a custom learning rate function. We use a
closure to specify the parameter of the latter. We also do a bit of
renaming within the function to explicit the behaviour and removed
docstrings that were subsequently not necessary.
2019-11-14 15:39:08 +01:00
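A sketch of the closure pattern this commit describes, close in shape to a linear warmup-then-decay schedule; treat the exact function as illustrative rather than the one added in the commit.

```python
from torch.optim.lr_scheduler import LambdaLR

def linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps):
    # A plain function returning LambdaLR: no wrapper class, hence no cyclic
    # reference between the scheduler and a bound method (cf. #1742, #1134).
    def lr_lambda(step):
        if step < num_warmup_steps:
            return step / max(1, num_warmup_steps)
        remaining = num_training_steps - step
        return max(0.0, remaining / max(1, num_training_steps - num_warmup_steps))
    return LambdaLR(optimizer, lr_lambda)
```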
7627dde1f8 sum() is not the leanest method to flatten a string list, so it's been replaced by itertools.chain.from_iterable() 2019-11-14 17:06:15 +03:00
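For context, a small comparison of the two flattening approaches: `sum(lists, [])` rebuilds the accumulator list on every addition and is quadratic, while `itertools.chain.from_iterable` is linear.

```python
import itertools

lists = [["a", "b"], ["c"], ["d", "e"]]
flat_slow = sum(lists, [])                              # O(n^2): copies on each step
flat_fast = list(itertools.chain.from_iterable(lists))  # O(n)
assert flat_slow == flat_fast == ["a", "b", "c", "d", "e"]
```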
74d0bcb6ff Fix special tokens addition in decoder 2019-11-12 15:27:57 -05:00
155c782a2c [inputs_embeds] All TF models + tests 2019-11-12 11:29:21 -05:00
2aef2f0bbc [common attributes] Fix previous commit for transfo-xl 2019-11-12 11:29:21 -05:00
2f17464266 [common attributes] Slightly sharper test coverage 2019-11-12 11:29:21 -05:00
9d2398fd99 Ooopsie 2019-11-12 11:29:21 -05:00
70d97ddd60 [TF models] Common attributes as per #1721 2019-11-12 11:29:21 -05:00
872403be1c This is not a @property after all 2019-11-12 11:29:21 -05:00
dd6b2e05e1 whitespace 2019-11-12 11:29:21 -05:00
d409aca326 Clarify the use of past in GPT2 and CTRL 2019-11-12 10:59:37 -05:00
7246d3c2f9 Consider do_lower_case in PreTrainedTokenizer
As pointed out in #1545, when using an uncased model, and adding
a new uncased token, the tokenizer does not correctly identify this
in the case that the input text contains the token in a cased format.

For instance, if we load bert-base-uncased into BertTokenizer, and
then use .add_tokens() to add "cool-token", we get the expected
result for .tokenize('this is a cool-token'). However, we get a
possibly unexpected result for .tokenize('this is a cOOl-Token'),
which in fact mirrors the result for the former from before the new
token was added.

This commit adds
- functionality to PreTrainedTokenizer to handle this
situation in case a tokenizer (currently Bert, DistilBert,
and XLNet) has the do_lower_case=True kwarg by:
    1) lowercasing tokens added with .add_tokens()
    2) lowercasing text at the beginning of .tokenize()
- new common test case for tokenizers

https://github.com/huggingface/transformers/issues/1545
2019-11-12 13:08:30 +02:00
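A toy illustration of the two behaviors added here (this is not the library's code; whitespace splitting stands in for the real subword tokenizer):

```python
class ToyTokenizer:
    def __init__(self, do_lower_case=True):
        self.do_lower_case = do_lower_case
        self.added_tokens = set()

    def add_tokens(self, tokens):
        # 1) lowercase tokens added with .add_tokens()
        if self.do_lower_case:
            tokens = [t.lower() for t in tokens]
        self.added_tokens.update(tokens)

    def tokenize(self, text):
        # 2) lowercase text at the beginning of .tokenize()
        if self.do_lower_case:
            text = text.lower()
        return text.split()

tok = ToyTokenizer()
tok.add_tokens(["cool-token"])
print(tok.tokenize("this is a cOOl-Token"))  # ['this', 'is', 'a', 'cool-token']
```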
2e31176557 fix multi-gpu eval 2019-11-12 05:55:11 -05:00
8aba81a0b6 fix #1789 2019-11-12 08:52:43 +01:00
94e55253ae tests: add test case for DistilBertForTokenClassification implementation 2019-11-11 16:20:15 +01:00
2b07b9e5ee examples: add DistilBert support for NER fine-tuning 2019-11-11 16:19:34 +01:00
1806eabf59 module: add DistilBertForTokenClassification import 2019-11-11 16:18:48 +01:00
1c7253cc5f modeling: add DistilBertForTokenClassification implementation 2019-11-11 16:18:16 +01:00
b5d330d118 Fix #1784 2019-11-11 10:15:14 -05:00
90f6e73a35 Add DialoGPT support for Pytorch->TF 2019-11-09 16:46:19 +00:00
ef99852961 from_pretrained: convert DialoGPT format
DialoGPT checkpoints have "lm_head.decoder.weight" instead of "lm_head.weight". 

(see: https://www.reddit.com/r/MachineLearning/comments/dt5woy/p_dialogpt_state_of_the_art_conversational_model/f6vmwuy?utm_source=share&utm_medium=web2x)
2019-11-09 16:32:40 +00:00
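A hedged sketch of the key rename (the checkpoint file name is illustrative): once the single mismatched key is moved, the state dict loads into the GPT-2 classes.

```python
import torch

# DialoGPT stores the LM head as "lm_head.decoder.weight"; GPT-2 in
# transformers expects "lm_head.weight". The file name below is a placeholder.
state_dict = torch.load("dialogpt-medium.bin", map_location="cpu")
if "lm_head.decoder.weight" in state_dict:
    state_dict["lm_head.weight"] = state_dict.pop("lm_head.decoder.weight")
```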
7a9aae1044 Fix run_bertology.py
Make imports and args.overwrite_cache match run_glue.py
2019-11-08 16:28:40 -05:00
268d4f2099 fix position biases + better tests 2019-11-08 16:41:55 +01:00
b4fcd59a5a add sentinels in tokenizer 2019-11-08 14:38:53 +01:00
15e53c4e87 maybe fix tests 2019-11-08 12:43:21 +01:00
f03c0c1423 adding models in readme and auto classes 2019-11-08 11:49:46 +01:00
4321c54125 fix tests 2019-11-08 11:49:32 +01:00
727a79b305 added TF2 model and tests - updated templates 2019-11-08 11:35:03 +01:00
cd286c2145 add condition around mask transformation 2019-11-08 11:31:16 +01:00
28d0ba35d7 only init encoder_attention_mask if stack is decoder
We currently initialize `encoder_attention_mask` when it is `None`,
whether the stack is that of an encoder or a decoder. Since this
may lead to bugs that are difficult to track down, I added a condition
that assesses whether the current stack is a decoder.
2019-11-08 11:22:19 +01:00
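A sketch of the added guard with assumed argument names: the all-ones default is created only when the stack is a decoder that actually receives encoder states.

```python
import torch

def maybe_default_encoder_mask(is_decoder, encoder_hidden_states, encoder_attention_mask):
    # Only fabricate a default cross-attention mask for decoder stacks;
    # an encoder stack should leave encoder_attention_mask untouched.
    if is_decoder and encoder_hidden_states is not None and encoder_attention_mask is None:
        batch_size, src_len = encoder_hidden_states.shape[:2]
        encoder_attention_mask = torch.ones(
            batch_size, src_len, device=encoder_hidden_states.device
        )
    return encoder_attention_mask
```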
8fda532c3c fix python 2 sentencepiece tokenization 2019-11-07 17:09:50 +01:00
ba10065c4b update model, conversion script, tests and template 2019-11-07 15:55:36 +01:00
070dcf1c02 Added Mish Activation Function
Mish is a new activation function proposed here - https://arxiv.org/abs/1908.08681
It has seen some recent success and has been adopted in SpaCy, Thinc, TensorFlow Addons and FastAI-dev.
All benchmarks recorded till now (including against ReLU, Swish and GELU) are present in the repository - https://github.com/digantamisra98/Mish
Might be a good addition to experiment with, especially in the BERT model.
2019-11-07 03:45:43 +05:30
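Mish is defined as x · tanh(softplus(x)); a self-contained PyTorch version:

```python
import torch
import torch.nn.functional as F

def mish(x):
    # Mish activation: x * tanh(softplus(x)), per https://arxiv.org/abs/1908.08681
    return x * torch.tanh(F.softplus(x))

print(mish(torch.tensor([-1.0, 0.0, 1.0])))  # smooth and non-monotonic near zero
```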
1c542df7e5 Add RoBERTa-based GPT-2 Output Detector from OpenAI
converted from https://github.com/openai/gpt-2-output-dataset/tree/master/detector

Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
Co-Authored-By: Jong Wook Kim <jongwook@nyu.edu>
Co-Authored-By: Jeff Wu <wuthefwasthat@gmail.com>
2019-11-06 16:26:31 -05:00
2f3a421018 Fix other PyTorch models 2019-11-06 14:03:47 -05:00
d5319793c4 Fix BERT 2019-11-06 14:03:47 -05:00
27e015bd54 [tests] Flag to test on cuda 2019-11-06 14:03:47 -05:00
13d9135fa5 [tests] get rid of warning
cf. https://docs.pytest.org/en/latest/example/simple.html
2019-11-06 14:03:47 -05:00
076a207935 adding tests and updating model 2019-11-06 11:52:50 +01:00
73f2c342f5 fixing template 2019-11-06 11:52:39 +01:00
3835e1e651 adding tokenizer 2019-11-06 11:52:29 +01:00
f88c104d8f [run_tf_glue] Add comment for context 2019-11-05 19:56:43 -05:00
30968d70af misc doc 2019-11-05 19:06:12 -05:00
de890ae67d Updating docblocks in optimizers.py 2019-11-05 17:31:29 -05:00
d7d36181fd GPT-2 XL 2019-11-05 13:31:58 -05:00
151e4ab4e7 Fix CTRL past 2019-11-05 16:26:51 +00:00
88e5bef58f share position biases 2019-11-05 17:02:52 +01:00
568c0ffb7e adding T5 model 2019-11-05 16:40:29 +01:00
7daacf00df Merge pull request #1695 from huggingface/models_inputs_embeds
model forwards can take an inputs_embeds param
2019-11-05 09:55:28 -05:00
a44f112fb9 add authors for models 2019-11-05 08:48:26 -05:00
60a5babd57 adding files 2019-11-05 12:01:23 +01:00
124409d075 Make dummy inputs a property of TFPreTrainedModel. 2019-11-05 11:48:45 +01:00
e99071f105 Merge pull request #1734 from orena1/patch-1
add progress bar to convert_examples_to_features
2019-11-05 11:34:20 +01:00
dfb61caf77 fix #1692 2019-11-05 11:25:13 +01:00
ba973342e3 Merge pull request #1553 from WilliamTambellini/timeSquadInference
Add speed log to examples/run_squad.py
2019-11-05 11:13:12 +01:00
8df7dfd2a7 Make dummy inputs a local variable in TFPreTrainedModel. 2019-11-05 11:09:16 +01:00
237fad339c Merge pull request #1709 from oneraghavan/master
Fixing mode in evaluate during training
2019-11-05 10:55:33 +01:00
f1e4db2aa8 Fix #1686 2019-11-05 09:38:00 +01:00
d7906165a3 add progress bar for convert_examples_to_features
It takes a considerable amount of time (~10 min) to convert the examples to features, so it is good to have a progress bar to track this
2019-11-05 10:34:27 +02:00
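The idea in its minimal form (the iterable below is a stand-in for the real SQuAD examples):

```python
from tqdm import tqdm

examples = range(100_000)  # stand-in for the real examples list
features = [ex for ex in tqdm(examples, desc="convert examples to features")]
```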
d2e2577dd3 Merge pull request #1723 from huggingface/fix-1623
Fix #1623
2019-11-05 08:36:30 +01:00
00337e9687 [inputs_embeds] All PyTorch models 2019-11-05 00:39:18 +00:00
9eddf44b7a docstring + check 2019-11-04 17:19:15 +00:00
8e11de0e86 model forwards can take an inputs_embeds param 2019-11-04 16:56:26 +00:00
68f7064a3e Add model.train() line to ReadMe training example
Co-Authored-By: Santosh-Gupta <San.Gupta.ML@gmail.com>
2019-11-04 11:52:35 -05:00
8d6b9d717c fix #1532 and encode_plus 2019-11-04 17:07:51 +01:00
c8f2712199 Merge pull request #1721 from huggingface/common_attributes
Add common getter and setter for input_embeddings & output_embeddings
2019-11-04 16:21:52 +01:00
89d6272898 Fix #1623 2019-11-04 16:21:12 +01:00
b340a910ed fix tests - flagged as slow all the tests downloading from AWS 2019-11-04 16:03:36 +01:00
f02805da6f fix tests 2019-11-04 15:42:23 +01:00
1d4d070256 Merge pull request #1549 from hlums/master
Fix token order in xlnet preprocessing for SQuAD
2019-11-04 15:37:15 +01:00
1724cee8c4 switch from properties to methods 2019-11-04 15:34:10 +01:00
9b45d0f878 Add common properties input_embeddings and output_embeddings 2019-11-04 12:28:56 +01:00
9a3b173cd3 Merge branch 'master' into master 2019-11-04 11:41:26 +01:00
ad90868627 Update example readme 2019-11-04 11:27:22 +01:00
e5b1048bae Fixing mode in evaluate during training 2019-11-03 16:14:46 +05:30
8a62835577 Merge pull request #1679 from cregouby/master
Fix https://github.com/huggingface/transformers/issues/1673
2019-11-01 22:02:24 +01:00
93d2fff071 Close #1654 2019-11-01 09:47:38 -04:00
1a2b40cb53 run_tf_glue MRPC evaluation only for MRPC 2019-10-31 18:00:51 -04:00
be36cf92fb Added mixed precision support to benchmarks.py 2019-10-31 17:24:37 -04:00
2a5663c280 Merge branch 'mataney-fix_top_k_top_p_filtering' 2019-10-31 18:28:34 +00:00
f96ce1c241 [run_generation] Fix generation with batch_size>1 2019-10-31 18:27:11 +00:00
3c1b6f594e Merge branch 'master' into fix_top_k_top_p_filtering 2019-10-31 13:53:51 -04:00
0e4cc050d6 Add support for resumable downloads for HTTP protocol. 2019-10-31 18:25:34 +03:00
ac29353abe Fix https://github.com/huggingface/transformers/issues/1673 2019-10-31 10:04:40 +01:00
fa735208c9 update readme - fix example command distil* 2019-10-30 14:27:28 -04:00
c7058d8224 Merge pull request #1608 from focox/master
Error raised by "tmp_eval_loss += tmp_eval_loss.item()" when using multi-gpu
2019-10-30 17:14:07 +01:00
22838f19fd Merge pull request #1668 from tlkh/fix-tf-xlm
Fixed training for TF XLM
2019-10-30 17:08:00 +01:00
7f84fc571a Merge pull request #1670 from huggingface/templates
Templates and explanation for adding a new model and example script
2019-10-30 17:05:58 +01:00
04c69db399 Merge pull request #1628 from huggingface/tfglue
run_tf_glue works with all tasks
2019-10-30 17:04:03 +01:00
5c6a19a94a Merge pull request #1604 from huggingface/deploy_doc
Versioning in documentation
2019-10-30 17:03:14 +01:00
3df4367244 Merge pull request #1601 from huggingface/clean-roberta
Clean roberta model & all tokenizers now add special tokens by default (breaking change)
2019-10-30 17:00:40 +01:00
6d73c92cae Merge pull request #1455 from huggingface/conditional-generation
[WIP] Sequence generation using pretrained BERT
2019-10-30 16:54:18 +01:00
36174696cc Merge branch 'master' into clean-roberta 2019-10-30 16:51:06 +01:00
228cdd6a6e Merge branch 'master' into conditional-generation 2019-10-30 16:40:35 +01:00
3cf2020c6b change kwargs processing 2019-10-30 16:27:51 +01:00
a88a0e4413 add tests to encoder-decoder model 2019-10-30 16:06:29 +01:00
3f07cd419c update test on Bert to include decoder mode 2019-10-30 15:09:53 +01:00
55fbfea369 Update CONTRIBUTING.md
Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
2019-10-30 12:25:40 +01:00
cef2a8f900 Update CONTRIBUTING.md
Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
2019-10-30 12:25:31 +01:00
328a86d2af adding links to the templates in readme and contributing 2019-10-30 11:37:55 +01:00
7f4226f9e6 adding templates 2019-10-30 11:31:56 +01:00
070507df1f format utils for summarization 2019-10-30 11:24:12 +01:00
da10de8466 fix bug with padding mask + add corresponding test 2019-10-30 11:19:58 +01:00
3b0d2fa30e rename seq2seq to encoder_decoder 2019-10-30 10:54:46 +01:00
9c1bdb5b61 revert renaming of lm_labels to ltr_lm_labels 2019-10-30 10:43:13 +01:00
842f3bf049 Fixed training for TF XLM 2019-10-30 01:32:15 +00:00
098a89f312 update docstrings; rename lm_labels to more explicit ltr_lm_labels 2019-10-29 20:08:03 +01:00
dfce409691 resolve PR comments 2019-10-29 17:10:20 +01:00
079bfb32fb Evaluation fixed. 2019-10-28 10:18:58 -04:00
438f2730a0 Evaluation code fixed. 2019-10-28 10:18:58 -04:00
4c3ac4a7d8 here's one big commit 2019-10-28 10:49:50 +01:00
932543f77e fix test of truncation function 2019-10-28 10:49:49 +01:00
a67413ccc8 extend works in-place 2019-10-28 10:49:49 +01:00
cb26b035c6 remove potential UndefinedError 2019-10-28 10:49:49 +01:00
b915ba9dfe pad sequence with 0, mask with -1 2019-10-28 10:49:49 +01:00
dc580dd4c7 add lm_labels for the LM cross-entropy 2019-10-28 10:49:49 +01:00
f873a3edb2 the decoder attends to the output of the encoder stack (last layer) 2019-10-28 10:49:00 +01:00
d36680df54 Revert changes to TF distilbert due to failed test: TFDistilBertModelTest.test_pt_tf_model_equivalence 2019-10-27 14:51:36 +08:00
ec276d6aba Add special tokens to documentation for the tensorflow model examples #1561 2019-10-27 14:00:40 +08:00
6e011690a9 Add special tokens to documentation for the rest of pytorch model examples #1561 2019-10-27 13:59:14 +08:00
beaf66b1f3 Remove break 2019-10-24 21:43:28 +00:00
bab6ad01aa run_tf_glue works with all tasks 2019-10-24 21:41:45 +00:00
ae1d03fc51 Add roberta to doc 2019-10-24 14:32:48 -04:00
4e5f88b74f Add Roberta to run_ner.py 2019-10-24 14:32:48 -04:00
b92d68421d Use roberta model and update doc strings 2019-10-24 14:32:48 -04:00
66085a1321 RoBERTa token classification
[WIP] copy paste bert token classification for roberta
2019-10-24 14:32:48 -04:00
b82bfbd0c3 Updated README to show all available documentation 2019-10-24 15:55:31 +00:00
5b6cafb11b [release] fix table weirdness 2019-10-23 10:35:16 -04:00
8ad5c591cd [RELEASE] DistilRoBERTa 2019-10-23 10:29:47 -04:00
bd847ce7d7 fixed the bug raised by "tmp_eval_loss += tmp_eval_loss.item()" when using multiple GPUs in parallel. 2019-10-23 20:27:13 +08:00
6e85bccafc Fixed typo 2019-10-22 18:07:01 -04:00
fbcc5ff9fb Change branch to master 2019-10-22 18:01:10 -04:00
69eba0ab19 Edit script path 2019-10-22 17:53:52 -04:00
bc3e57d551 Multi version doc deployment 2019-10-22 17:51:30 -04:00
ef1b8b2ae5 [CTRL] warn if generation prompt does not start with a control code
see also https://github.com/salesforce/ctrl/pull/50
2019-10-22 21:30:32 +00:00
e16d46843a Fix architectures count 2019-10-22 15:13:47 -04:00
7d709e55ed Remove 2019-10-22 14:12:33 -04:00
44286b94d3 RoBERTa doesn't print a warning when no special tokens are passed. 2019-10-22 13:46:48 -04:00
1cfd974868 Option to benchmark only one of the two libraries 2019-10-22 13:32:23 -04:00
777faa8ae7 Fix #1597 2019-10-22 11:26:42 -04:00
b8c9ea0010 Merge pull request #1580 from pminervini/master
Gradient norm clipping should be done right before calling the optimiser
2019-10-22 13:59:20 +02:00
abd7110e21 gradient norm clipping should be done right before calling the optimiser - fixing run_glue and run_ner as well 2019-10-21 19:56:52 +01:00
4d456542e9 Fix citation 2019-10-21 16:34:14 +02:00
0e64fec1ab Merge pull request #1568 from daemon/patch-1
Fix hanging when loading pretrained models
2019-10-21 14:31:57 +02:00
3a52b65795 Add special tokens to documentation for bert examples to resolve issue: #1561 2019-10-21 12:55:51 +08:00
86a630702d Merge branch 'huggingface/master' 2019-10-21 12:06:09 +08:00
3775550c4b gradient norm clipping should be done right before calling the optimiser 2019-10-20 22:33:56 +01:00
bf2c36a920 Merge pull request #1 from huggingface/master
update
2019-10-20 23:30:45 +02:00
a2c8c8ef00 Fix hanging when loading pretrained models
- Fix hanging when loading pretrained models from the cache without having internet access. This is a widespread issue on supercomputers whose internal compute nodes are firewalled.
2019-10-19 16:19:20 -04:00
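Editor's note: the hang above came from network lookups even on cache hits. A minimal sketch of forcing cache-only resolution, assuming the `local_files_only` flag that later versions of transformers expose (not part of this 2019 fix):

from transformers import BertModel

# Resolve weights strictly from the local cache; never touch the network.
# `local_files_only` is a later-era flag, used here only for illustration.
model = BertModel.from_pretrained("bert-base-uncased", local_files_only=True)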
82f6abd98a Benchmark section added to the documentation 2019-10-18 17:27:10 -04:00
7dd29ed2f1 Benchmarks example script 2019-10-18 10:53:04 -04:00
8efc0ec91a Add Benchmarks to issue templates 2019-10-18 10:45:44 -04:00
0919389d9a Add speed log to examples/run_squad.py
Add a speed estimate log (time per example)
for evaluation to examples/run_squad.py
2019-10-17 14:41:04 -07:00
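The speed estimate amounts to timing the evaluation loop and dividing by the number of examples; a minimal sketch (illustrative, not the script's exact code):

import time

examples = list(range(1000))  # stand-in for the evaluation dataset
start = time.time()
for example in examples:
    pass  # the model forward pass would run here
elapsed = time.time() - start
print(f"Evaluation done in {elapsed:.2f} s ({elapsed / len(examples):.6f} s per example)")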
fd97761c5a soft launch distilroberta 2019-10-17 15:28:58 -04:00
ecd15667f3 fix repetition penalty 2019-10-17 14:47:14 -04:00
56e2ee4ead fix model2model 2019-10-17 16:33:31 +02:00
8cd56e3036 fix data processing in script 2019-10-17 16:33:26 +02:00
578d23e061 add training pipeline (formatting temporary) 2019-10-17 14:02:27 +02:00
47a06d88a0 use two different tokenizers for story and summary 2019-10-17 13:04:26 +02:00
bfb9b540d4 add Model2Model to __init__ 2019-10-17 12:59:51 +02:00
c1bc709c35 correct the truncation and padding of dataset 2019-10-17 10:41:53 +02:00
87d60b6e19 reword explanation of encoder_attention_mask 2019-10-17 10:18:19 +02:00
638fe7f5a4 correct composition of padding and causal masks 2019-10-17 10:13:07 +02:00
4e0f24348f document the MLM modification + raise exception on MLM training with encoder-decoder 2019-10-17 09:41:53 +02:00
624a5644cc revert black formatting to conform with lib style 2019-10-17 09:27:56 +02:00
9b71fc9a18 tying weights is going to be a clusterfuck 2019-10-16 21:31:38 +02:00
95ec1d08be separate inputs into encoder & decoder inputs 2019-10-16 20:55:42 +02:00
e4e0ee14bd add separator between data import and train 2019-10-16 20:05:32 +02:00
a424892fab correct syntax error: dim() and not dims() 2019-10-16 18:24:32 +02:00
33c01368b1 remove Bert2Rnd test 2019-10-16 18:13:05 +02:00
c544194611 Remove special_tokens_mask from inputs in README
Co-authored-by: Thomas Wolf @thomwolf
2019-10-16 11:05:13 -04:00
0752069617 adapt attention masks for the decoder case
The introduction of a decoder brings 2 changes:
- We need to be able to specify a separate mask in the cross
attention to mask the positions corresponding to padding tokens in the
encoder state.
- The self-attention in the decoder needs to be causal on top of not
attending to padding tokens.
2019-10-16 16:12:22 +02:00
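A minimal sketch of the two masks described above, assuming PyTorch tensors with 1 for real tokens and 0 for padding (names are illustrative, not the repository's code):

import torch

def decoder_self_attention_mask(attention_mask: torch.Tensor) -> torch.Tensor:
    # attention_mask: (batch, tgt_len). Each target position may attend
    # only to earlier, non-padded target positions: causal AND not-padding.
    tgt_len = attention_mask.size(1)
    causal = torch.tril(torch.ones(tgt_len, tgt_len, dtype=attention_mask.dtype))
    return causal[None, :, :] * attention_mask[:, None, :]

def cross_attention_mask(encoder_attention_mask: torch.Tensor, tgt_len: int) -> torch.Tensor:
    # Every decoder position may attend to every non-padded encoder position.
    return encoder_attention_mask[:, None, :].expand(-1, tgt_len, -1)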
c5a94a6100 fix function that defines masks in XLM
the definition of `get_masks` would blow up with certain combinations of
arguments. It was just a matter of moving a definition outside of a
control structure.
2019-10-16 13:00:32 +02:00
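The bug class is generic; a toy illustration of the pattern (not XLM's actual code):

def broken(flag):
    if flag:
        value = 1
    return value  # UnboundLocalError when flag is False

def fixed(flag):
    value = 0  # definition moved outside the control structure
    if flag:
        value = 1
    return value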
488a664151 add is_decoder attribute to PretrainedConfig
We currently instantiate encoders and decoders for the seq2seq by
passing the `is_decoder` keyword argument to the `from_pretrained`
classmethod. On the other hand, the model class looks for the value
of the `is_decoder` attribute in its config.

In order for the value to propagate from the kwarg to the configuration
we simply need to define `is_decoder` as an attribute of the base
`PretrainedConfig`, with a default of `False`.
2019-10-15 21:03:32 +02:00
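A minimal sketch of the propagation this enables, assuming `BertConfig` inherits the new attribute:

from transformers import BertConfig

encoder_config = BertConfig()                 # is_decoder defaults to False
decoder_config = BertConfig(is_decoder=True)  # the kwarg lands on the config
assert not encoder_config.is_decoder
assert decoder_config.is_decoder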
4c81960b9b comment the seq2seq functions 2019-10-15 20:52:28 +02:00
6d6c326737 take path to pretrained for encoder and decoder for init 2019-10-15 16:08:27 +02:00
0d81fc853e specify in readme that both datasets are required 2019-10-15 15:26:33 +02:00
19e9964780 remove Bert2Bert from module declaration 2019-10-15 15:20:28 +02:00
1aec940587 test the full story processing 2019-10-15 15:18:07 +02:00
22e1af6859 truncation function is fully tested 2019-10-15 14:43:50 +02:00
260ac7d9a8 wip commit, switching computers 2019-10-15 12:24:35 +02:00
be916cb3fb Merge branch 'master' of https://github.com/huggingface/transformers 2019-10-15 10:37:13 +02:00
5875aaf762 install tensorboard 2019-10-15 10:36:46 +02:00
40f14ff545 Merge pull request #1513 from slayton58/amp_fp16_einsum
Force einsum to run in fp16
2019-10-15 10:25:00 +02:00
e703e4dfe1 Merge pull request #1509 from julian-pani/patch-3
remove leftover usage of DUMMY_INPUTS
2019-10-15 10:24:13 +02:00
898ce064f8 add tests on TF 2.0 & PT checkpoint => model conversion functions 2019-10-15 10:04:19 +02:00
d147671c6c Merge pull request #1508 from tlkh/master
Added performance enhancements (XLA, AMP) to examples
2019-10-15 09:57:18 +02:00
2c1d5564ad add readme information 2019-10-15 09:56:52 +02:00
08bd8f9f39 Merge pull request #1505 from e-budur/master
Fixed the sample code under the 'Quick tour' heading.
2019-10-15 09:50:36 +02:00
8aa3b753bd Merge pull request #1434 from bryant1410/patch-1
Remove unnecessary use of FusedLayerNorm in XLNet
2019-10-15 09:44:19 +02:00
621e7a2529 Merge pull request #1275 from stecklin/ner-fine-tuning
Implement fine-tuning BERT on CoNLL-2003 named entity recognition task
2019-10-15 09:35:24 +02:00
c55badcee0 Add NER finetuning details by @stefan-it in example readme 2019-10-15 09:33:52 +02:00
788e632622 [ner] Honor args.overwrite_cache 2019-10-15 09:17:31 +02:00
0f9ebb0b43 add seqeval as requirement for examples 2019-10-15 09:17:31 +02:00
66adb71734 update to transformers 2019-10-15 09:17:31 +02:00
5ff9cd158a Add option to predict on test set 2019-10-15 09:17:31 +02:00
7f5367e0b1 Add cli argument for configuring labels 2019-10-15 09:17:31 +02:00
e1d4179b64 Make file reading more robust 2019-10-15 09:17:31 +02:00
383ef96747 Implement fine-tuning BERT on CoNLL-2003 named entity recognition task 2019-10-15 09:17:31 +02:00
5adb39e757 Add option to predict on test set 2019-10-15 09:14:53 +02:00
99b189df6d Add cli argument for configuring labels 2019-10-15 09:14:53 +02:00
3e9420add1 Make file reading more robust 2019-10-15 09:14:53 +02:00
cde42c4354 Implement fine-tuning BERT on CoNLL-2003 named entity recognition task 2019-10-15 09:14:53 +02:00
74c5035808 Fix token order in xlnet preprocessing. 2019-10-14 21:27:11 +00:00
fe25eefc15 add instructions to fetch the dataset 2019-10-14 20:45:39 +02:00
412793275d delegate the padding with special tokens to the tokenizer 2019-10-14 20:45:16 +02:00
447fffb21f process the raw CNN/Daily Mail dataset
the data provided by Li Dong et al. were already tokenized, which means
that they are not compatible with all the models in the library. We
thus process the raw data directly and tokenize them using the models'
tokenizers.
2019-10-14 18:12:20 +02:00
80889a0226 Merge pull request #1512 from louismartin/fix-roberta-convert
Fix import error in script to convert fairseq roberta checkpoints
2019-10-14 17:40:32 +02:00
4e6a55751a Force einsum to fp16 2019-10-14 11:12:41 -04:00
f62f992cf7 Merge pull request #1502 from jeffxtang/master
the working example code to use BertForQuestionAnswering
2019-10-14 16:14:52 +02:00
67d10960ae load and prepare CNN/Daily Mail data
We write a function to load and preprocess the CNN/Daily Mail dataset as
provided by Li Dong et al. The issue is that this dataset has already
been tokenized by the authors, so we actually need to find the original,
plain-text dataset if we want to apply it to all models.
2019-10-14 14:11:20 +02:00
d9d387afce clean up 2019-10-14 12:14:40 +02:00
b7141a1bc6 maxi simplication 2019-10-14 12:14:08 +02:00
bfbe68f035 update forward pass 2019-10-14 12:04:23 +02:00
0ef9bc923a Cleaning up seq2seq [WIP] 2019-10-14 11:58:13 +02:00
49cba6e543 Fix import error in script to convert fairseq roberta checkpoints 2019-10-14 01:38:57 -07:00
0993586758 remove usage of DUMMY_INPUTS
Hey @thomwolf  
This change da26bae61b (diff-8ddce309e88e8eb5b4d02228fd8881daL28-L29) removed the constant, but one usage of that constant remains in the code.
2019-10-14 02:09:53 +03:00
376e65a674 Added automatic mixed precision and XLA options to run_tf_glue.py 2019-10-13 13:19:06 +00:00
86f23a1944 Minor enhancements to run_tf_glue.py 2019-10-13 10:21:35 +00:00
5a8c6e771a Fixed the sample code under the 'Quick tour' heading. 2019-10-12 14:17:17 +03:00
e76d71521c the working example code to use BertForQuestionAnswering and get an answer from a text and a question 2019-10-11 17:04:02 -07:00
d844db4005 Add citation bibtex 2019-10-11 16:55:42 -04:00
a701c9b321 CTRL to tf automodels 2019-10-11 16:05:30 -04:00
b3261e7ace read parameters from CLI, load model & tokenizer 2019-10-11 18:40:38 +02:00
d889e0b71b add base for seq2seq finetuning 2019-10-11 17:36:12 +02:00
f8e98d6779 load pretrained embeddings in Bert decoder
In Rothe et al.'s "Leveraging Pre-trained Checkpoints for Sequence
Generation Tasks", Bert2Bert is initialized with pre-trained weights for
the encoder, and only pre-trained embeddings for the decoder. The
current version of the code completely randomizes the weights of the
decoder.

We write a custom function to initialize the weights of the decoder; we
first initialize the decoder with the weights and then randomize
everything but the embeddings.
2019-10-11 16:48:11 +02:00
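A hedged sketch of that strategy: load everything pretrained, then re-randomize all decoder parameters except the embeddings (function name and details are illustrative, not the repository's code):

import torch
from transformers import BertModel

def init_decoder_embeddings_only(name: str = "bert-base-uncased") -> BertModel:
    decoder = BertModel.from_pretrained(name)  # start fully pretrained
    for param_name, param in decoder.named_parameters():
        if "embeddings" not in param_name:
            # re-randomize everything that is not an embedding
            torch.nn.init.normal_(param, mean=0.0, std=decoder.config.initializer_range)
    return decoder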
3ddce1d74c Release: 2.1.1 2019-10-11 06:37:49 -04:00
4428aefc63 Merge pull request #1488 from huggingface/pytorch-tpu
GLUE on TPU
2019-10-11 16:33:00 +02:00
3b43b01872 Merge pull request #1482 from huggingface/tf2_integration_tests
Integration of TF 2.0 models with other Keras modules
2019-10-11 16:25:43 +02:00
4b8f3e8f32 adding citation 2019-10-11 16:18:16 +02:00
18a3cef7d5 no nans 2019-10-11 16:09:42 +02:00
1f5d9513d8 fix test 2019-10-11 15:55:01 +02:00
0f9fc4fbde adding option to deactivate past/memory outputs 2019-10-11 15:47:08 +02:00
700331b5ec Merge pull request #1492 from stefan-it/bert-german-dbmdz-models
Add new BERT models for German (cased and uncased)
2019-10-11 13:01:52 +02:00
573dde9b44 Merge pull request #1405 from slayton58/xlnet_layer_reorder
Re-order XLNet attention head outputs for better perf
2019-10-11 12:10:58 +02:00
5f25a5f367 model: add support for new German BERT models (cased and uncased) from @dbmdz 2019-10-11 10:20:33 +02:00
f382a8decd convert int to str before adding to a str 2019-10-10 19:20:39 -04:00
639f4b7190 Don't save/load when on TPU 2019-10-10 19:17:25 +00:00
d4e7934ac3 GLUE on TPU 2019-10-10 19:03:06 +00:00
1e68c28670 add test for initialization of Bert2Rnd 2019-10-10 18:07:11 +02:00
2a4fef837a move Circle-CI from TF2-rc0 to official TF2 2019-10-10 15:57:35 +02:00
751e246087 using tf.print in roberta 2019-10-10 15:47:20 +02:00
fa218e648a fix syntax errors 2019-10-10 15:16:07 +02:00
c9e8c51946 fixing SequenceSummary head in TF 2.0 2019-10-10 15:16:05 +02:00
da26bae61b adding more tests on TF and pytorch serialization - updating configuration for better serialization 2019-10-10 14:30:48 +02:00
3e1cd8241e fix stupid (re)naming issue 2019-10-10 14:18:20 +02:00
81ee29ee8d remove the staticmethod used to load the config 2019-10-10 14:13:37 +02:00
bb04edb45b Add tests that TF 2.0 model can be integrated with other Keras modules 2019-10-10 13:08:24 +02:00
d7092d592c rename the attributes in the Bert Layer
Since the preloading of weights relies on the names of the class's
attributes, changing the namespace breaks loading pretrained weights on
Bert and all related models. I reverted `self_attention` to `attention`
and use `crossattention` for the decoder instead.
2019-10-10 12:51:14 +02:00
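Why the rename breaks loading, in miniature: state_dict keys are derived from attribute names, so checkpoint keys stop matching (a self-contained demo, not the repository's code):

import torch

class Old(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.attention = torch.nn.Linear(4, 4)

class Renamed(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.self_attention = torch.nn.Linear(4, 4)

result = Renamed().load_state_dict(Old().state_dict(), strict=False)
print(result.missing_keys)     # ['self_attention.weight', 'self_attention.bias']
print(result.unexpected_keys)  # ['attention.weight', 'attention.bias']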
51261167b4 prune both attention and self-attention heads 2019-10-10 12:17:22 +02:00
17177e7379 add is_decoder as an attribute to Config class 2019-10-10 12:03:58 +02:00
6596e3d566 Merge pull request #1454 from bkkaggle/pytorch-built-in-tensorboard
Change tensorboard imports to use built-in tensorboard if available
2019-10-10 11:56:55 +02:00
4bc4601192 Merge pull request #1480 from huggingface/fix_ctrl_tokenizer
Fixing CTRL tokenizer - Update error messages - XLM-MLM in run_generation
2019-10-10 11:56:20 +02:00
177a721205 move back to simple space splitting 2019-10-10 11:45:47 +02:00
df85a0ff0b replace double quotes with simple quotes 2019-10-10 11:38:26 +02:00
9ca788b2e8 merge the two Bert layers classes 2019-10-10 11:33:28 +02:00
a5997dd81a better error messages 2019-10-10 11:31:01 +02:00
edfc8f8225 Remove and do the branching in 2019-10-10 10:17:27 +02:00
09cfd12235 remove and do the branching in 2019-10-10 10:15:27 +02:00
43a237f15e switching to moses tokenizer 2019-10-10 10:11:16 +02:00
877ef2c6ca override from_pretrained in Bert2Rnd
In the seq2seq model we need to both load pretrained weights in the
encoder and initialize the decoder randomly. Because the
`from_pretrained` method defined in the base class relies on module
names to assign weights, it would also initialize the decoder with
pretrained weights. To avoid this we override the method to only
initialize the encoder with pretrained weights.
2019-10-10 10:02:18 +02:00
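The shape of that override, as a sketch (the class name comes from the commit; the body is a guess at the strategy, not the actual code):

import torch
from transformers import BertConfig, BertModel

class Bert2Rnd(torch.nn.Module):
    def __init__(self, config: BertConfig = None):
        super().__init__()
        config = config or BertConfig()
        self.encoder = BertModel(config)  # random init
        self.decoder = BertModel(config)  # random init, stays random

    @classmethod
    def from_pretrained(cls, name: str) -> "Bert2Rnd":
        model = cls()
        # Only the encoder receives pretrained weights.
        model.encoder = BertModel.from_pretrained(name)
        return model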
851ef592c5 add comment on recursive weights loading 2019-10-10 10:02:03 +02:00
036483fae5 Temporary CTRL tokenizer fix 2019-10-09 16:33:15 -04:00
9c2e0a4acf Release: 2.1.0 2019-10-09 12:14:03 -04:00
7fe98d8c18 Update CTRL documentation 2019-10-09 12:12:36 -04:00
89f86f9661 CTRL added to the documentation 2019-10-09 12:04:06 -04:00
e17ea08e24 Pycharm folder added to gitignore 2019-10-09 11:32:21 -04:00
2431fea98a Merge pull request #1383 from keskarnitish/master
Adding CTRL
2019-10-09 11:31:05 -04:00
d9e60f4f0d Merge branch 'master' into pr/1383 2019-10-09 17:25:08 +02:00
e84470ef81 Merge pull request #1384 from huggingface/encoding-qol
Quality of life enhancements in encoding + patch MLM masking
2019-10-09 11:18:24 -04:00
07d055f849 higher tolerance 2019-10-09 17:10:04 +02:00
48b438ff2a doc and conversion 2019-10-09 17:06:30 +02:00
69629c4f0f Improve naming and only do regex when necessary 2019-10-09 08:48:40 -04:00
bf34a252b8 Golden path 2019-10-09 08:48:40 -04:00
528d3f327b Improve readability and improve make less assumptions about checkpoint format 2019-10-09 08:48:40 -04:00
56301bd9e8 Extract method 2019-10-09 08:48:40 -04:00
d6c5469712 Delete older checkpoint after saving new checkpoint 2019-10-09 08:48:40 -04:00
54a31f50fb Add save_total_limit 2019-10-09 08:48:40 -04:00
c19b8e4ae0 fixing CTRL tests and OpenAI GPT tests 2019-10-09 13:51:05 +02:00
6dce6dda1b fixing TF 2.0 model - adding more severe test on pt/tf equivalence 2019-10-09 11:57:55 +02:00
c56d921dda adding TF 2.0 model 2019-10-09 11:07:43 +02:00
1c5079952f simpler distilbert mask - fix tf tests 2019-10-09 04:26:20 +02:00
58b302caf3 Merge pull request #1398 from dveselov/patch-1
Fixed typo in docs README
2019-10-09 03:52:42 +02:00
439fac723a Merge pull request #1409 from brian41005/master
Evaluation result.txt path changing #1286
2019-10-09 03:14:34 +02:00
23b7138ab4 fix #1378 and #1453 2019-10-09 01:54:44 +02:00
5ce8d29abe Change tensorboard imports to use built-in tensorboard if available 2019-10-08 16:29:43 -05:00
d688af19e5 Update link to swift-coreml-transformers
cc @lysandrejik
2019-10-08 16:37:52 -04:00
45dc04f33d tf model [WIP] 2019-10-08 17:37:17 +02:00
770b15b58c rename class in __init__ 2019-10-08 17:32:28 +02:00
248314772f fix tokenization 2019-10-08 17:19:28 +02:00
03c2c762a6 update tokenizer 2019-10-08 17:12:03 +02:00
3edfa1d6aa update model to use past 2019-10-08 17:11:58 +02:00
f4d41fe33e Merge pull request #1448 from huggingface/contributing
add contribution guidelines
2019-10-08 16:55:34 +02:00
61ed889005 remove old seq2seq file 2019-10-08 16:30:58 +02:00
8abfee9ec3 rename Bert2Bert -> Bert2Rnd 2019-10-08 16:30:58 +02:00
82628b0fc9 add a placeholder test 2019-10-08 16:30:58 +02:00
0700983090 Add BertDecoderModel and Bert2Bert classes
I am not sure what happens when the class is initialized with the
pretrained weights.
2019-10-08 16:30:58 +02:00
75feacf172 add general structure for Bert2Bert class 2019-10-08 16:30:58 +02:00
15a2fc88a6 add General attention classes
The modifications that I introduced in a previous commit did break
Bert's internal API. I reverted these changes and added more general
classes to handle the encoder-decoder attention case.

There may be a more elegant way to deal with backward compatibility (I am
not comfortable with the current state of the code), but I cannot see it
right now.
2019-10-08 16:30:58 +02:00
cd6a59d5c1 add a decoder layer for Bert 2019-10-08 16:30:58 +02:00
45de313a9e add bullet point on modifying an existing PR 2019-10-08 11:54:10 +02:00
ade05b6cef add code contribution 2019-10-07 23:20:25 +02:00
e9c09052a4 add issues and requests guidelines 2019-10-07 22:30:55 +02:00
8fcc6507ce Multilingual 2019-10-07 15:02:42 -04:00
6e3e1c959e Merge pull request #1447 from huggingface/dev-requirements
Provide requirements.txt for development dependencies
2019-10-07 18:49:26 +02:00
7ce83b4931 update weights for distilgpt2 2019-10-07 12:30:27 -04:00
9f81f1cba8 fix convert pt_to_tf2 for custom weights 2019-10-07 12:30:19 -04:00
7afd00a661 freeze dev requirements 2019-10-07 17:58:13 +02:00
a0dcefa382 generalize BertSelfAttention to take separate query, key, value
There is currently no way to specify the query, key and value separately
in the Attention module. However, the decoder's "encoder-decoder
attention" layers take the decoder's last output as a query, the
encoder's states as key and value. We thus modify the existing code so
query, key and value can be added separately.

This obviously raises some naming issues; `BertSelfAttention` is not
a self-attention module anymore. The way the residual is forwarded is
now awkward, etc. We will need to do some refactoring once the decoder is
fully implemented.
2019-10-07 17:53:58 +02:00
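The generalized signature, reduced to its core (illustrative, not the repository's exact code): self-attention passes the same states three times, cross-attention passes the decoder state as query and the encoder output as key and value.

import math
import torch

def scaled_dot_product(query, key, value, mask=None):
    scores = query @ key.transpose(-1, -2) / math.sqrt(query.size(-1))
    if mask is not None:
        scores = scores.masked_fill(mask == 0, float("-inf"))
    return torch.softmax(scores, dim=-1) @ value

# self-attention:  scaled_dot_product(hidden, hidden, hidden)
# cross-attention: scaled_dot_product(decoder_hidden, encoder_hidden, encoder_hidden)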
31adbb247c add class wireframes for Bert decoder 2019-10-07 16:43:21 +02:00
dda1adad6d rename BertLayer to BertEncoderLayer 2019-10-07 16:31:46 +02:00
0053c0e052 do some (light) housekeeping
Several packages were imported but never used, and indentation and line
spacing did not follow PEP8.
2019-10-07 16:29:15 +02:00
bd5363cc83 update CTRL configuration 2019-10-07 15:37:30 +02:00
dc89441167 update CTRL pytorch model 2019-10-07 15:37:25 +02:00
320b7a7e01 fix #1416 2019-10-07 14:26:59 +02:00
386e86e222 raise exception when class initialized with __init__ 2019-10-07 13:00:06 +02:00
4446c02b8a add wireframe for seq2seq model 2019-10-07 12:04:05 +02:00
1615360c71 Merge pull request #1438 from SeanBE/master
fix pytorch-transformers migration description in README
2019-10-07 05:02:23 -04:00
6dc6c716c5 fix pytorch-transformers migration description in README 2019-10-07 09:59:54 +01:00
904158ac4d Rephrase forward method to reduce ambiguity 2019-10-06 23:40:52 -04:00
0f65d8cbbe Fix some typos in README 2019-10-06 23:40:52 -04:00
1dea291a02 Remove unnecessary use of FusedLayerNorm in XLNet 2019-10-06 13:35:01 -04:00
f3e0218fbb Correct device assignment in run_generation 2019-10-05 21:05:16 -04:00
78ef1a9930 fixes 2019-10-04 17:59:44 -04:00
6c1d0bc066 update encode_plus - add truncation strategies 2019-10-04 17:38:38 -04:00
0820bb0555 unnecessary carriage return 2019-10-04 17:23:15 -04:00
f5891c3821 run_squad --> run_squad_w_distillation 2019-10-04 17:23:15 -04:00
764a7923ec add distillation+finetuning option in run_squad 2019-10-04 17:23:15 -04:00
bb464289ce New model addition issue template 2019-10-04 16:41:26 -04:00
92c0f2fb90 Merge remote-tracking branch 'origin/julien_multiple-choice' into encoding-qol 2019-10-04 15:48:06 -04:00
9e136ff57c Honor args.overwrite_cache (h/t @erenup) 2019-10-04 15:00:56 -04:00
7bddb45a6f Decode documentaton 2019-10-04 14:27:38 -04:00
dbed1c5d94 Adding CTRL (squashed commit)
adding conversion script

adding first draft of modeling & tokenization

adding placeholder for test files

bunch of changes

registering the tokenizer/model/etc

tests

change link; something is very VERY wrong here

weird end-of-word thingy going on

i think the tokenization works now ; wrote the unit tests

overall structure works; load weights next

the monster is alive!

works after some cleanup as well

adding emacs autosave to gitignore

currently only supporting the 48 layer one; seems to infer fine on my macbook

cleanup

fixing some documentation

fixing some documentation

tests passing?

now works on CUDA also

adding greedy?

adding greedy sampling

works well
2019-10-03 22:29:03 -07:00
b3cfd97946 Merge pull request #1373 from TimYagan/fix-css
Fixed critical css font-family issues
2019-10-03 19:04:02 -04:00
81a1e12469 Merge pull request #1313 from enzoampil/master
Add option to use a 'stop token'
2019-10-03 22:43:57 +00:00
d3f24dfad7 Merge branch 'master' into master 2019-10-03 22:43:09 +00:00
ecc4f1bdfa XLM use_lang_embedding flag in run_generation 2019-10-03 17:42:16 -04:00
c2c2ca0fdb Added XLM to run_generation, with prompt language selection. 2019-10-03 17:18:48 -04:00
1569610f2d Merge pull request #1296 from danai-antoniou/add-duplicate-tokens-error
Added ValueError for duplicates in list of added tokens
2019-10-03 17:06:17 -04:00
e1b2949ae6 DistillBert Documentation Code Example fixes 2019-10-03 15:51:33 -04:00
899883644f Fix test fails and warnings
Attention output was in bnij ordering instead of ijbn, which everything
else expects. This was an oversight on my part; the fix keeps the
attention inputs/outputs identical to the original code.

Also moved back from tensor slicing to index_select in rel_shift_bnij to
make the tracer happy.
2019-10-03 12:05:15 -04:00
e2ae9c0b73 fix links in doc index 2019-10-03 11:42:21 -04:00
aebd83230f Update naming + remove f string in run_lm_finetuning example 2019-10-03 11:31:36 -04:00
651bfb7ad5 always_truncate by default 2019-10-03 11:31:36 -04:00
5ed50a93fb LM finetuning won't mask special tokens anymore 2019-10-03 11:31:36 -04:00
cc412edd42 Supports already existing special tokens 2019-10-03 11:31:36 -04:00
2f259b228e Sequence IDS 2019-10-03 11:31:36 -04:00
7c789c337d Always truncate argument in the encode method 2019-10-03 11:31:36 -04:00
7af0777910 Update run_glue.py
add DistilBert model shortcut into ALL_MODELS
2019-10-03 15:31:11 +00:00
c1689ac301 fix name 2019-10-03 10:56:39 -04:00
4a790c40b1 update doc for distil* 2019-10-03 10:54:02 -04:00
6be46a6e64 update links to new weights 2019-10-03 10:27:11 -04:00
5f07d8f11a prepare release 2019-10-03 10:27:11 -04:00
35071007cb incoming release 🔥 update links to arxiv preprint 2019-10-03 10:27:11 -04:00
f1f23ad171 fix buf in convert_pt_chkpt_to_tf2 2019-10-03 10:27:11 -04:00
2a91f6071f update README - TODO update link to paper 2019-10-03 10:27:11 -04:00
c51e533a5f update train.py 2019-10-03 10:27:11 -04:00
a76c3f9cb0 update requirements 2019-10-03 10:27:11 -04:00
bb9c5ead54 update distiller 2019-10-03 10:27:11 -04:00
a12ab0a8db update binarized_data 2019-10-03 10:27:11 -04:00
4d6dfbd376 update extract 2019-10-03 10:27:11 -04:00
23edebc079 update extract_distilbert 2019-10-03 10:27:11 -04:00
cbfcfce205 update token_counts 2019-10-03 10:27:11 -04:00
19e4ebbe3f grouped_batch_sampler 2019-10-03 10:27:11 -04:00
594202a934 lm_seqs_dataset 2019-10-03 10:27:11 -04:00
38084507c4 add distillation_configs 2019-10-03 10:27:11 -04:00
9ffda216ec Fix missed head transpose 2019-10-03 09:23:16 -04:00
b5d73976ad Revert "fixing for roberta tokenizer decoding"
This reverts commit 22e7c4edaf007d92912df54c336d10078ac7d565.
2019-10-03 20:48:17 +08:00
22e7c4edaf fixing for roberta tokenizer decoding 2019-10-03 18:33:53 +08:00
2195c0d5f9 Evaluation result.txt path changing #1286 2019-10-03 12:49:12 +08:00
ebb32261b1 fix #1401 2019-10-02 17:52:56 -04:00
d51b589404 Re-order attention head outputs for better perf
Significant performance boost over the original ordering: on an already
somewhat optimised branch this gave me > 2x end-to-end throughput on a
squad xlnet fine-tuning task (batch 8, seq-length 612, fp16).
2019-10-02 12:18:21 -04:00
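The reorder itself is a single permutation; a hedged illustration of the layouts involved (shapes mirror the task above):

import torch

attn_bnij = torch.randn(8, 12, 612, 612)   # (batch, heads, query, key)
attn_ijbn = attn_bnij.permute(2, 3, 0, 1)  # (query, key, batch, heads), what downstream code expects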
63ed224b7c initialy -> initially 2019-10-02 15:04:18 +00:00
a95158518d Moved duplicate token check 2019-10-02 07:44:15 +01:00
d73957899a Merge branch 'master' of https://github.com/danai-antoniou/pytorch-transformers into add-duplicate-tokens-error 2019-10-02 07:38:50 +01:00
cd69bc9c87 Fixed typo in docs README 2019-10-02 03:21:55 +03:00
391db836ab fix #1260 - remove special logic for decoding pairs of sequence 2019-10-01 19:09:13 -04:00
963529e29b Merge pull request #1288 from echan00/master
Typo with LM Fine tuning script
2019-10-01 18:46:07 -04:00
f7978f70ec use format instead of f-strings 2019-10-01 18:45:38 -04:00
1e4a191366 Merge pull request #1284 from slayton58/pooler_end_logits_fp16_fix
Fix fp16 masking in PoolerEndLogits
2019-10-01 18:40:22 -04:00
c50783e388 Merge branch 'pooler_end_logits_fp16_fix' of https://github.com/slayton58/pytorch-transformers into pr/1284 2019-10-01 18:17:48 -04:00
6971556ab8 Fix syntax typo in README.md 2019-10-01 14:59:31 -04:00
b350662955 overflowing_tokens do not really make sense here, let's just return a number
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-30 16:37:09 -04:00
f5bcde0b2f [multiple-choice] Simplify and use tokenizer.encode_plus 2019-09-30 16:04:55 -04:00
5c3b32d44d Update README.md
Lines 183 - 200, fixed indentation. Line 198, replaced `tokenizer_class` with `BertTokenizer`, since `tokenizer_class` is not defined in the loop it belongs to.
2019-09-30 18:48:01 +00:00
2dc8cb8734 fix unknown imports (*ForMultipleChoice) in run_multiple_choice 2019-09-29 19:51:01 -04:00
0a4ed7192e Fixed critical css font-family issues
Fixed critical css font-family issues to ensure compatibility with multiple web browsers
2019-09-29 13:51:01 +02:00
ae50ad91ea Merge pull request #1362 from FeiWang96/doc
fix link
2019-09-28 10:26:42 +02:00
60f791631b Fix link in readme 2019-09-28 16:20:17 +08:00
a6a6d9e638 fix padding_idx of RoBERTa model 2019-09-27 19:03:55 -04:00
d8b641c839 6 -> 8 models 2019-09-27 17:22:01 -04:00
c6acbdd50a Close #1304 2019-09-27 17:02:53 -04:00
df7cd9e4e4 Merge pull request #1353 from wendingp/patch-1
Fix some typos
2019-09-27 23:00:34 +02:00
6a17b3c51b Merge pull request #1355 from agrinh/master
Fix tensorflow_dataset glue support
2019-09-27 22:59:54 +02:00
04e9a6f512 Merge pull request #1359 from dennymarcels/patch-1
Update run_lm_finetuning.py
2019-09-27 22:58:19 +02:00
9478590630 Update run_lm_finetuning.py
The method, as previously phrased, did not exist in the class.
2019-09-27 15:18:42 -03:00
795b3e76ff Add docstring for processor method 2019-09-27 17:32:28 +02:00
e31a472801 Fix tensorflow_dataset glue support
`glue_convert_examples_to_features` assumed that tensorflow_dataset
examples contain the features `'sentence1'` and `'sentence2'`. This
commit encapsulates the choice of features in the glue processor and
uses that to parse examples.
2019-09-27 17:16:02 +02:00
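A sketch of the encapsulation (an illustrative processor, not the library's exact class): the processor owns the mapping from tfds feature keys to text fields, so the conversion function no longer hard-codes them.

class MrpcProcessor:
    # which tensorflow_datasets feature keys hold the two text fields
    tfds_text_keys = ("sentence1", "sentence2")

    def get_example_from_tensor_dict(self, tensor_dict):
        a_key, b_key = self.tfds_text_keys
        return (
            tensor_dict[a_key].numpy().decode("utf-8"),
            tensor_dict[b_key].numpy().decode("utf-8"),
        )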
4f2b6579bf Fix some typos 2019-09-27 22:55:43 +08:00
ca559826c4 Merge pull request #1349 from ogabrielluiz/master
Just some typos
2019-09-27 13:08:00 +02:00
d2de5b9d8c Just some typos 2019-09-27 07:08:36 -03:00
d83d295763 Merge pull request #1337 from mgrankin/fastdataset
faster dataset building
2019-09-27 10:35:12 +02:00
f6de000305 Merge pull request #1346 from BramVanroy/documentation
Add small  note about the output of hidden states (closes #1332)
2019-09-27 10:30:07 +02:00
15749bfc10 Add small note about the output of hidden states 2019-09-27 10:01:36 +02:00
da2e47ad15 clean up a little run_tf_glue 2019-09-27 09:41:15 +02:00
528c288fa9 clean up run_tf_glue 2019-09-27 09:40:29 +02:00
702f589848 fix input in run_glue for distilbert 2019-09-27 00:20:14 -04:00
22d2fded2c [docs] Fix doc auto-deploy
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-26 18:22:45 -04:00
fc9faa8a47 [docs] Doc tweaks
Co-Authored-By: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2019-09-26 18:19:51 -04:00
ecfddc6034 Update RoBERTa and GPT-2 Tokenizer documentation (fix #1343) 2019-09-26 16:49:03 -04:00
93f0c5fc72 Repository link in the documentation 2019-09-26 11:45:00 -04:00
6c3b131516 typo in readme/doc 2019-09-26 16:23:28 +02:00
f83b35b77d Merge branch 'master' of https://github.com/huggingface/pytorch-transformers 2019-09-26 16:14:23 +02:00
4e63c90720 update installation instructions in readme 2019-09-26 16:14:21 +02:00
7e957237e4 [Doc] XLM + Torch in documentation 2019-09-26 10:08:56 -04:00
302a4813a5 Doc building requirements [TF2] 2019-09-26 09:57:30 -04:00
f71a4577b8 faster dataset building 2019-09-26 16:53:13 +03:00
a3e0dbba95 Doc building requirements [TF] 2019-09-26 09:51:14 -04:00
0f92f76ca3 CircleCI reference in README 2019-09-26 08:59:52 -04:00
4094958df2 Doc building requirements 2019-09-26 08:50:55 -04:00
7d8b395afa Doc building requirements 2019-09-26 08:49:31 -04:00
927904bc91 [doc] pytorch_transformers -> transformers 2019-09-26 08:47:15 -04:00
294edfd83d Release version in documentation 2019-09-26 08:16:12 -04:00
de5e4864cb Documentation 2019-09-26 08:04:54 -04:00
e4e35296fb update setup.py metadata 2019-09-26 13:52:24 +02:00
a9f24a16bc [FIX] fix run_generation.py to work with batch_size > 1 2019-09-25 15:53:29 +03:00
4b543c3007 Add option to use a 'stop token', which truncates the output text to everything before the first occurrence of the 'stop token' 2019-09-22 21:38:38 +08:00
2e6797cc7d Added ValueError for duplicate added tokens 2019-09-19 15:40:42 +01:00
f0340eccf9 Typo
Typo
2019-09-18 13:42:11 -07:00
ec94f4e0f8 Fix fp16 masking in PoolerEndLogits
Necessary to run xlnet (at least in squad) with `--fp16 --fp16_opt_level="O2"`, otherwise loss is immediately `NaN` and fine-tuning cannot proceed.
2019-09-18 09:30:58 -04:00
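The failure mode in two lines: a masking constant such as -1e30 overflows to -inf in float16, and the loss goes NaN; clamping to the dtype's minimum keeps the mask finite (a sketch of the principle, not the patch itself):

import torch

scores = torch.randn(2, 8, dtype=torch.float16)
mask = torch.tensor([[0, 0, 0, 1, 1, 1, 1, 1],
                     [0, 0, 0, 0, 0, 1, 1, 1]])
fill = torch.finfo(scores.dtype).min          # -65504.0 for float16: finite
masked = scores.masked_fill(mask == 1, fill)  # -1e30 would overflow to -inf here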
1111 changed files with 319249 additions and 47128 deletions


@@ -1,88 +1,404 @@
version: 2
version: 2.1
orbs:
gcp-gke: circleci/gcp-gke@1.0.4
go: circleci/go@1.3.0
# TPU REFERENCES
references:
checkout_ml_testing: &checkout_ml_testing
run:
name: Checkout ml-testing-accelerators
command: |
git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
cd ml-testing-accelerators
git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
git checkout stable
build_push_docker: &build_push_docker
run:
name: Configure Docker
command: |
gcloud --quiet auth configure-docker
cd docker/transformers-pytorch-tpu
if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
deploy_cluster: &deploy_cluster
run:
name: Deploy the job on the kubernetes cluster
command: |
go get github.com/google/go-jsonnet/cmd/jsonnet && \
export PATH=$PATH:$HOME/go/bin && \
kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
job_name=${job_name#job.batch/} && \
job_name=${job_name% created} && \
echo "Waiting on kubernetes job: $job_name" && \
i=0 && \
# 30 checks spaced 30s apart = 900s total.
max_checks=30 && \
status_code=2 && \
# Check on the job periodically. Set the status code depending on what
# happened to the job in Kubernetes. If we try max_checks times and
# still the job hasn't finished, give up and return the starting
# non-zero status code.
while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
echo "Done waiting. Job status code: $status_code" && \
pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
echo "GKE pod name: $pod_name" && \
kubectl logs -f $pod_name --container=train
echo "Done with log retrieval attempt." && \
gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
exit $status_code
delete_gke_jobs: &delete_gke_jobs
run:
name: Delete GKE Jobs
command: |
# Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
# that has been around longer than 1hr. First print all columns for
# matches, then execute the delete.
kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
jobs:
build_py3_torch_and_tf:
run_tests_torch_and_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo pip install torch
- run: sudo pip install tensorflow==2.0.0-rc0
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -sv ./transformers/tests/ --cov
- run: codecov
build_py3_torch:
- restore_cache:
keys:
- v0.4-torch_and_tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_torch:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo pip install torch
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -sv ./transformers/tests/ --cov
- run: python -m pytest -sv ./examples/
- run: codecov
build_py3_tf:
- restore_cache:
keys:
- v0.4-torch-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo pip install tensorflow==2.0.0-rc0
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest codecov pytest-cov
- run: sudo pip install tensorboardX scikit-learn
- run: python -m pytest -sv ./transformers/tests/ --cov
- run: codecov
build_py2_torch:
- restore_cache:
keys:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf ./tests/ | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_flax:
working_directory: ~/transformers
resource_class: large
parallelism: 1
docker:
- image: circleci/python:2.7
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo pip install torch
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest codecov pytest-cov
- run: python -m pytest -sv ./transformers/tests/ --cov
- run: codecov
build_py2_tf:
- restore_cache:
keys:
- v0.4-flax-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
- save_cache:
key: v0.4-flax-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax ./tests/ | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_pipelines_torch:
working_directory: ~/transformers
resource_class: large
parallelism: 1
docker:
- image: circleci/python:2.7
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo pip install tensorflow==2.0.0-rc0
- run: sudo pip install --progress-bar off .
- run: sudo pip install pytest codecov pytest-cov
- run: python -m pytest -sv ./transformers/tests/ --cov
- run: codecov
- restore_cache:
keys:
- v0.4-torch-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece]
- run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_pipelines_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_custom_tokenizers:
working_directory: ~/transformers
docker:
- image: circleci/python:3.7
environment:
RUN_CUSTOM_TOKENIZERS: yes
steps:
- checkout
- restore_cache:
keys:
- v0.4-custom_tokenizers-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[ja,testing,sentencepiece]
- run: python -m unidic download
- save_cache:
key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
- store_artifacts:
path: ~/transformers/tests_output.txt
- store_artifacts:
path: ~/transformers/reports
run_examples_torch:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-torch_examples-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,sentencepiece,testing]
- run: pip install -r examples/_tests_requirements.txt
- save_cache:
key: v0.4-torch_examples-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
- store_artifacts:
path: ~/transformers/examples_output.txt
- store_artifacts:
path: ~/transformers/reports
run_tests_git_lfs:
working_directory: ~/transformers
docker:
- image: circleci/python:3.7
resource_class: xlarge
parallelism: 1
steps:
- checkout
- run: sudo apt-get install git-lfs
- run: |
git config --global user.email "ci@dummy.com"
git config --global user.name "ci"
- run: pip install --upgrade pip
- run: pip install .[testing]
- run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
build_doc:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
steps:
- checkout
- restore_cache:
keys:
- v0.4-build_doc-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install ."[all, docs]"
- save_cache:
key: v0.4-build_doc-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: cd docs && make html SPHINXOPTS="-W"
- store_artifacts:
path: ./docs/_build
deploy_doc:
working_directory: ~/transformers
docker:
- image: circleci/python:3.5
- image: circleci/python:3.6
steps:
- add_ssh_keys:
fingerprints:
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
fingerprints:
- "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
- checkout
- run: sudo pip install --progress-bar off -r docs/requirements.txt
- run: sudo pip install --progress-bar off -r requirements.txt
- run: cd docs/source && ln -s ../../examples/README.md examples.md && cd -
- run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
- restore_cache:
keys:
- v0.4-deploy_doc-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install ."[all,docs]"
- save_cache:
key: v0.4-deploy_doc-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: ./.circleci/deploy.sh
check_code_quality:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
resource_class: medium
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-code_quality-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install isort
- run: pip install .[all,quality]
- save_cache:
key: v0.4-code_quality-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: black --check examples tests src utils
- run: isort --check-only examples tests src utils
- run: flake8 examples tests src utils
- run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
- run: python utils/check_copies.py
- run: python utils/check_table.py
- run: python utils/check_dummies.py
- run: python utils/check_repo.py
check_repository_consistency:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
resource_class: small
parallelism: 1
steps:
- checkout
- run: pip install requests
- run: python ./utils/link_tester.py
# TPU JOBS
run_examples_tpu:
docker:
- image: circleci/python:3.6
environment:
OMP_NUM_THREADS: 1
resource_class: xlarge
parallelism: 1
steps:
- checkout
- go/install
- *checkout_ml_testing
- gcp-gke/install
- gcp-gke/update-kubeconfig-with-credentials:
cluster: $GKE_CLUSTER
perform-login: true
- setup_remote_docker
- *build_push_docker
- *deploy_cluster
cleanup-gke-jobs:
docker:
- image: circleci/python:3.6
steps:
- gcp-gke/install
- gcp-gke/update-kubeconfig-with-credentials:
cluster: $GKE_CLUSTER
perform-login: true
- *delete_gke_jobs
workflow_filters: &workflow_filters
filters:
branches:
@@ -92,9 +408,28 @@ workflows:
version: 2
build_and_test:
jobs:
- build_py3_torch_and_tf
- build_py3_torch
- build_py3_tf
- build_py2_torch
- build_py2_tf
- deploy_doc: *workflow_filters
- check_code_quality
- check_repository_consistency
- run_examples_torch
- run_tests_custom_tokenizers
- run_tests_torch_and_tf
- run_tests_torch
- run_tests_tf
- run_tests_flax
- run_tests_pipelines_torch
- run_tests_pipelines_tf
- run_tests_git_lfs
- build_doc
- deploy_doc: *workflow_filters
tpu_testing_jobs:
triggers:
- schedule:
# Set to run at the first minute of every hour.
cron: "0 8 * * *"
filters:
branches:
only:
- master
jobs:
- cleanup-gke-jobs
- run_examples_tpu

.circleci/deploy.sh Executable file

@@ -0,0 +1,57 @@
cd docs
function deploy_doc(){
echo "Creating doc at commit $1 and pushing to folder $2"
git checkout $1
if [ ! -z "$2" ]
then
if [ "$2" == "master" ]; then
echo "Pushing master"
make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir/$2/
cp -r _build/html/_static .
elif ssh -oStrictHostKeyChecking=no $doc "[ -d $dir/$2 ]"; then
echo "Directory" $2 "already exists"
scp -r -oStrictHostKeyChecking=no _static/* $doc:$dir/$2/_static/
else
echo "Pushing version" $2
make clean && make html
rm -rf _build/html/_static
cp -r _static _build/html
scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
fi
else
echo "Pushing stable"
make clean && make html
rm -rf _build/html/_static
cp -r _static _build/html
scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
fi
}
# You can find the commit for each tag on https://github.com/huggingface/transformers/tags
deploy_doc "master" master
deploy_doc "b33a385" v1.0.0
deploy_doc "fe02e45" v1.1.0
deploy_doc "89fd345" v1.2.0
deploy_doc "fc9faa8" v2.0.0
deploy_doc "3ddce1d" v2.1.1
deploy_doc "3616209" v2.2.0
deploy_doc "d0f8b9a" v2.3.0
deploy_doc "6664ea9" v2.4.0
deploy_doc "fb560dc" v2.5.0
deploy_doc "b90745c" v2.5.1
deploy_doc "fbc5bf1" v2.6.0
deploy_doc "6f5a12a" v2.7.0
deploy_doc "11c3257" v2.8.0
deploy_doc "e7cfc1a" v2.9.0
deploy_doc "7cb203f" v2.9.1
deploy_doc "10d7239" v2.10.0
deploy_doc "b42586e" v2.11.0
deploy_doc "7fb8bdf" v3.0.2
deploy_doc "4b3ee9c" v3.1.0
deploy_doc "3ebb1b3" v3.2.0
deploy_doc "0613f05" v3.3.1
deploy_doc "eb0e0ce" v3.4.0
deploy_doc "818878d" v3.5.1
deploy_doc "c781171" v4.0.0
deploy_doc "bfa4ccf" # v4.1.1 Latest stable release


@@ -0,0 +1,22 @@
---
name: "\U0001F5A5 New benchmark"
about: Benchmark a part of this library and share your results
title: "[Benchmark]"
labels: ''
assignees: ''
---
# 🖥 Benchmarking `transformers`
## Benchmark
Which part of `transformers` did you benchmark?
## Set-up
What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
## Results
Put your results here!


@@ -0,0 +1,20 @@
---
name: "\U0001F31F New model addition"
about: Submit a proposal/request to implement a new Transformer-based model
title: ''
labels: New model
assignees: ''
---
# 🌟 New model addition
## Model description
<!-- Important information -->
## Open source status
* [ ] the model implementation is available: (give details)
* [ ] the model weights are available: (give details)
* [ ] who are the authors: (mention them, if possible by @gh-username)


@@ -1,25 +1,71 @@
---
name: "\U0001F41B Bug Report"
about: Submit a bug report to help us improve PyTorch Transformers
about: Submit a bug report to help us improve transformers
title: ''
labels: ''
assignees: ''
---
## 🐛 Bug
<!-- Important information -->
## Environment info
<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
Don't forget to fill out the missing fields in that output! -->
Model I am using (Bert, XLNet....):
- `transformers` version:
- Platform:
- Python version:
- PyTorch version (GPU?):
- Tensorflow version (GPU?):
- Using GPU in script?:
- Using distributed or parallel set-up in script?:
Language I am using the model on (English, Chinese....):
### Who can help
<!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
Please tag fewer than 3 people.
The problem arise when using:
* [ ] the official example scripts: (give details)
* [ ] my own modified scripts: (give details)
albert, bert, GPT2, XLM: @LysandreJik
tokenizers: @mfuntowicz
Trainer: @sgugger
Speed and Memory Benchmarks: @patrickvonplaten
Model Cards: @julien-c
TextGeneration: @TevenLeScao
examples/distillation: @VictorSanh
nlp datasets: [different repo](https://github.com/huggingface/nlp)
rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Text Generation: @patrickvonplaten @TevenLeScao
Blenderbot: @patrickvonplaten
Bart: @patrickvonplaten
Marian: @patrickvonplaten
Pegasus: @patrickvonplaten
mBART: @patrickvonplaten
T5: @patrickvonplaten
Longformer/Reformer: @patrickvonplaten
TransfoXL/XLNet: @TevenLeScao
RAG: @patrickvonplaten, @lhoestq
FSMT: @stas00
examples/seq2seq: @patil-suraj
examples/bert-loses-patience: @JetRunner
ray/raytune: @richardliaw @amogkam
tensorflow: @jplu
examples/token-classification: @stefan-it
documentation: @sgugger
-->
## Information
Model I am using (Bert, XLNet ...):
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
The tasks I am working on are:
* [ ] an official GLUE/SQUaD task: (give the name)
* [ ] my own task or dataset: (give details)
* [ ] my own task or dataset: (give details below)
## To Reproduce
## To reproduce
Steps to reproduce the behavior:
@@ -27,22 +73,10 @@ Steps to reproduce the behavior:
2.
3.
<!-- If you have a code sample, error messages, stack traces, please provide it here as well. -->
<!-- If you have code snippets, error messages, stack traces please provide them here as well.
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.-->
## Expected behavior
<!-- A clear and concise description of what you expected to happen. -->
## Environment
* OS:
* Python version:
* PyTorch version:
* PyTorch Transformers version (or branch):
* Using GPU ?
* Distributed of parallel setup ?
* Any other relevant information:
## Additional context
<!-- Add any other context about the problem here. -->
<!-- A clear and concise description of what you would expect to happen. -->


@@ -1,16 +1,25 @@
---
name: "\U0001F680 Feature Request"
about: Submit a proposal/request for a new PyTorch Transformers feature
name: "\U0001F680 Feature request"
about: Submit a proposal/request for a new transformers feature
title: ''
labels: ''
assignees: ''
---
## 🚀 Feature
# 🚀 Feature request
<!-- A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. -->
<!-- A clear and concise description of the feature proposal.
Please provide a link to the paper and code in case they exist. -->
## Motivation
<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. -->
<!-- Please outline the motivation for the proposal. Is your feature request
related to a problem? e.g., I'm always frustrated when [...]. If this is related
to another GitHub issue, please link here too. -->
## Additional context
## Your contribution
<!-- Add any other context or screenshots about the feature request here. -->
<!-- Is there any way that you could help, e.g. by submitting a PR?
Make sure to read the CONTRIBUTING.MD readme:
https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md -->


@@ -1,43 +1,58 @@
---
name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers"
about: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers
to transformers
title: ''
labels: Migration
assignees: ''
---
## 📚 Migration
# 📚 Migration
## Information
<!-- Important information -->
Model I am using (Bert, XLNet....):
Model I am using (Bert, XLNet ...):
Language I am using the model on (English, Chinese....):
Language I am using the model on (English, Chinese ...):
The problem arise when using:
* [ ] the official example scripts: (give details)
* [ ] my own modified scripts: (give details)
The problem arises when using:
* [ ] the official example scripts: (give details below)
* [ ] my own modified scripts: (give details below)
The tasks I am working on are:
* [ ] an official GLUE/SQUaD task: (give the name)
* [ ] my own task or dataset: (give details)
* [ ] my own task or dataset: (give details below)
Details of the issue:
## Details
<!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->
<!-- A clear and concise description of the migration issue.
If you have code snippets, please provide it here as well.
Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
-->
## Environment
## Environment info
<!-- You can run the command `python transformers-cli env` and copy-and-paste its output below.
Don't forget to fill out the missing fields in that output! -->
- `transformers` version:
- Platform:
- Python version:
- PyTorch version (GPU?):
- Tensorflow version (GPU?):
- Using GPU in script?:
- Using distributed or parallel set-up in script?:
<!-- IMPORTANT: which version of the former library do you use? -->
* `pytorch-transformers` or `pytorch-pretrained-bert` version (or branch):
* OS:
* Python version:
* PyTorch version:
* PyTorch Transformers version (or branch):
* Using GPU ?
* Distributed of parallel setup ?
* Any other relevant information:
## Checklist
- [ ] I have read the migration guide in the readme.
([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers);
[pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))
- [ ] I checked if a related official extension example runs on my machine.
## Additional context
<!-- Add any other context about the problem here. -->


@@ -1,8 +1,26 @@
---
name: "❓Questions & Help"
about: Start a general discussion related to PyTorch Transformers
name: "❓ Questions & Help"
about: Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/
title: ''
labels: ''
assignees: ''
---
## ❓ Questions & Help
# ❓ Questions & Help
<!-- A clear and concise description of the question. -->
<!-- The GitHub issue tracker is primarily intended for bugs, feature requests,
new models, benchmarks, and migration questions. For all other questions,
we direct you to the Hugging Face forum: https://discuss.huggingface.co/ .
-->
## Details
<!-- Description of your issue -->
<!-- You should first ask your question on the forum, and only if
you didn't get an answer after a few days ask it here on GitHub. -->
**A link to original question on the forum**:
<!-- Your issue will be closed if you don't fill this part. -->

.github/PULL_REQUEST_TEMPLATE.md vendored Normal file

@@ -0,0 +1,62 @@
# What does this PR do?
<!--
Congratulations! You've made it this far! You're not quite done yet though.
Once merged, your PR is going to appear in the release notes with the title you set, so make sure it's a great title that fully reflects the extent of your awesome contribution.
Then, please replace this with a description of the change and which issue is fixed (if applicable). Please also include relevant motivation and context. List any dependencies (if any) that are required for this change.
Once you're done, someone will review your PR shortly (see the section "Who can review?" below to tag some potential reviewers). They may suggest changes to make the code even better. If no one reviewed your PR after a week has passed, don't hesitate to post a new comment @-mentioning the same persons---sometimes notifications get lost.
-->
<!-- Remove if not applicable -->
Fixes # (issue)
## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#start-contributing-pull-requests),
Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue or the [forum](https://discuss.huggingface.co/)? Please add a link
to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
[documentation guidelines](https://github.com/huggingface/transformers/tree/master/docs), and
[here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/master/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests?
## Who can review?
Anyone in the community is free to review the PR once the tests have passed. Feel free to tag
members/contributors which may be interested in your PR.
<!-- Your PR will be replied to more quickly if you can figure out the right person to tag with @
If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
Please tag fewer than 3 people.
albert, bert, XLM: @LysandreJik
GPT2: @LysandreJik, @patrickvonplaten
tokenizers: @mfuntowicz
Trainer: @sgugger
Benchmarks: @patrickvonplaten
Model Cards: @julien-c
examples/distillation: @VictorSanh
nlp datasets: [different repo](https://github.com/huggingface/nlp)
rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
Text Generation: @patrickvonplaten, @TevenLeScao
Blenderbot, Bart, Marian, Pegasus: @patrickvonplaten
T5: @patrickvonplaten
Rag: @patrickvonplaten, @lhoestq
EncoderDecoder: @patrickvonplaten
Longformer, Reformer: @patrickvonplaten
TransfoXL, XLNet: @TevenLeScao, @patrickvonplaten
examples/seq2seq: @patil-suraj
examples/bert-loses-patience: @JetRunner
tensorflow: @jplu
examples/token-classification: @stefan-it
documentation: @sgugger
FSMT: @stas00
-->

.github/conda/build.sh vendored Normal file

@@ -0,0 +1 @@
$PYTHON setup.py install # Python command to install the script.

.github/conda/meta.yaml vendored Normal file

@@ -0,0 +1,48 @@
{% set name = "transformers" %}
package:
name: "{{ name|lower }}"
version: "{{ TRANSFORMERS_VERSION }}"
source:
path: ../../
build:
noarch: python
requirements:
host:
- python
- pip
- numpy
- dataclasses
- packaging
- filelock
- requests
- tqdm >=4.27
- sacremoses
- regex !=2019.12.17
- protobuf
- tokenizers ==0.9.4
run:
- python
- numpy
- dataclasses
- packaging
- filelock
- requests
- tqdm >=4.27
- sacremoses
- regex !=2019.12.17
- protobuf
- tokenizers ==0.9.4
test:
imports:
- transformers
about:
home: https://huggingface.co
license: Apache License 2.0
license_file: LICENSE
summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."

.github/stale.yml vendored

@@ -6,6 +6,7 @@ daysUntilClose: 7
exemptLabels:
- pinned
- security
- Feature request
# Label to use when marking an issue as stale
staleLabel: wontfix
# Comment to post when marking an issue as stale. Set to `false` to disable

.github/workflows/github-torch-hub.yml vendored Normal file

@@ -0,0 +1,46 @@
name: Torch hub integration
on:
push:
branches:
- "*"
jobs:
torch_hub_integration:
runs-on: ubuntu-latest
env:
# TODO quickfix but may need more investigation
ACTIONS_ALLOW_UNSECURE_COMMANDS: True
steps:
# no checkout necessary here.
- name: Extract branch name
run: echo "::set-env name=BRANCH::${GITHUB_REF#refs/heads/}"
- name: Check branch name
run: echo $BRANCH
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.7
- name: Loading cache
uses: actions/cache@v2
id: cache
with:
path: ~/.cache/pip
key: v0-torch_hub-${{ hashFiles('setup.py') }}
- name: Install dependencies
run: |
pip install --upgrade pip
# install torch-hub specific dependencies
pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub]
# no longer needed
pip uninstall -y transformers
- name: Torch hub list
run: |
python -c "import torch; print(torch.hub.list('huggingface/transformers:$BRANCH'))"
- name: Torch hub help
run: |
python -c "import torch; print(torch.hub.help('huggingface/transformers:$BRANCH', 'modelForSequenceClassification'))"

.github/workflows/model-templates.yml vendored Normal file

@@ -0,0 +1,67 @@
name: Model templates runner
on:
push:
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
jobs:
run_tests_templates:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v1
- name: Install Python
uses: actions/setup-python@v1
with:
python-version: 3.6
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: ~/.cache/pip
key: v1.2-tests_templates
restore-keys: |
v1.2-tests_templates-${{ hashFiles('setup.py') }}
v1.2-tests_templates
- name: Install dependencies
run: |
pip install --upgrade pip
pip install .[dev]
- name: Create model files
run: |
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
make style
python utils/check_table.py --fix_and_overwrite
python utils/check_dummies.py --fix_and_overwrite
- name: Run all non-slow tests
run: |
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
- name: Run style changes
run: |
git fetch origin master:master
make fixup
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_templates_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_templates_test_reports
path: reports

.github/workflows/release-conda.yml vendored Normal file

@@ -0,0 +1,44 @@
name: Release - Conda
on:
push:
tags:
- v*
env:
ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
jobs:
build_and_package:
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- name: Checkout repository
uses: actions/checkout@v1
- name: Install miniconda
uses: conda-incubator/setup-miniconda@v2
with:
auto-update-conda: true
auto-activate-base: false
activate-environment: "build-transformers"
channels: huggingface
- name: Setup conda env
run: |
conda install -c defaults anaconda-client conda-build
- name: Extract version
run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
- name: Build conda packages
run: |
conda info
conda list
conda-build .github/conda
- name: Upload to Anaconda
run: anaconda upload `conda-build .github/conda --output` --force

.github/workflows/self-push.yml vendored Normal file

@@ -0,0 +1,275 @@
name: Self-hosted runner (push)
on:
push:
branches:
- master
- ci_*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
# pull_request:
repository_dispatch:
jobs:
run_tests_torch_gpu:
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
# - name: Create model files
# run: |
# source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_tests_tf_gpu:
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Create model files
run: |
source .env/bin/activate
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
# transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
CUDA_VISIBLE_DEVICES: 0
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_tests_torch_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_multi_gpu_test_reports
path: reports
run_tests_tf_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all non-slow tests on GPU
env:
OMP_NUM_THREADS: 1
run: |
source .env/bin/activate
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports

.github/workflows/self-scheduled.yml vendored Normal file

@@ -0,0 +1,356 @@
# configuration notes:
#
# - `source .env/bin/activate` is currently needed to be run first thing in each step. Otherwise
# the step uses the system-wide python interpreter.
name: Self-hosted runner (scheduled)
on:
repository_dispatch:
schedule:
- cron: "0 0 * * *"
jobs:
run_all_tests_torch_gpu:
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_gpu_failures_short.txt
- name: Run examples tests on GPU
if: ${{ always() }}
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
pip install -r examples/_tests_requirements.txt
python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/examples_torch_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_gpu_test_reports
path: reports
run_all_tests_tf_gpu:
runs-on: [self-hosted, gpu, single-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_gpu_failures_short.txt
- name: Run all pipeline tests on GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_gpu_test_reports
path: reports
run_all_tests_torch_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
- name: Run all tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_multi_gpu_failures_short.txt
- name: Run examples tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_torch_multi_gpu_test_reports
path: reports
run_all_tests_tf_multi_gpu:
runs-on: [self-hosted, gpu, multi-gpu]
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: .env
key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
- name: Python version
run: |
which python
python --version
pip --version
- name: Current dir
run: pwd
- run: nvidia-smi
- name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
if: steps.cache.outputs.cache-hit != 'true'
run: |
python -m venv .env
source .env/bin/activate
which python
python --version
pip --version
- name: Install dependencies
run: |
source .env/bin/activate
pip install --upgrade pip
pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
pip install git+https://github.com/huggingface/datasets
pip list
- name: Are GPUs recognized by our DL frameworks
run: |
source .env/bin/activate
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
- name: Run all tests on multi-GPU
env:
OMP_NUM_THREADS: 1
RUN_SLOW: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_multi_gpu_failures_short.txt
- name: Run all pipeline tests on multi-GPU
if: ${{ always() }}
env:
TF_FORCE_GPU_ALLOW_GROWTH: "true"
OMP_NUM_THREADS: 1
RUN_SLOW: yes
RUN_PIPELINE_TESTS: yes
run: |
source .env/bin/activate
python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_tf_multi_gpu_test_reports
path: reports

.gitignore vendored

@@ -8,6 +8,13 @@ __pycache__/
# C extensions
*.so
# tests and logs
tests/fixtures/*
!tests/fixtures/sample_text_no_unicode.txt
logs/
lightning_logs/
lang_code_data/
# Distribution / packaging
.Python
build/
@@ -116,19 +123,42 @@ dmypy.json
.pyre/
# vscode
.vs
.vscode
# Pycharm
.idea
# TF code
tensorflow_code
# Models
models
proc_data
# examples
runs
/runs_old
/wandb
/examples/runs
/examples/**/*.args
/examples/rag/sweep
# data
/data
serialization_dir
# emacs
*.*~
debug.env
# vim
.*.swp
#ctags
tags
# pre-commit
.pre-commit*
# .lock
*.lock

CODE_OF_CONDUCT.md Normal file

@@ -0,0 +1,129 @@
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
overall community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or
advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at
feedback@huggingface.co.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series
of actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within
the community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).
[homepage]: https://www.contributor-covenant.org
For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.

CONTRIBUTING.md Normal file

@@ -0,0 +1,355 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How to contribute to transformers?
Everyone is welcome to contribute, and we value everybody's contribution. Code
is thus not the only way to help the community. Answering questions, helping
others, reaching out and improving the documentation are immensely valuable to
the community.
It also helps us if you spread the word: reference the library from blog posts
on the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply star the repo to say "thank you".
Whichever way you choose to contribute, please be mindful to respect our
[code of conduct](https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md).
## You can contribute in so many ways!
There are 4 ways you can contribute to transformers:
* Fixing outstanding issues with the existing code;
* Implementing new models;
* Contributing to the examples or to the documentation;
* Submitting issues related to bugs or desired new features.
*All are equally valuable to the community.*
## Submitting a new issue or feature request
Do your best to follow these guidelines when submitting an issue or a feature
request. It will make it easier for us to come back to you quickly and with good
feedback.
### Did you find a bug?
Transformers is robust and reliable thanks to the users who notify us of
the problems they encounter. So thank you for reporting an issue.
First, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues).
Did not find it? :( So we can act quickly on it, please follow these steps:
* Include your **OS type and version**, and the versions of **Python**, **PyTorch** and
**TensorFlow** when applicable;
* Include a short, self-contained code snippet that allows us to reproduce the bug in
less than 30s;
* Provide the *full* traceback if an exception is raised.
To get the OS and software versions automatically, you can run the following command:
```bash
transformers-cli env
```
or from the root of the repository the following command:
```bash
python src/transformers/commands/transformers_cli.py env
```
### Do you want to implement a new model?
Awesome! Please provide the following information:
* Short description of the model and link to the paper;
* Link to the implementation if it is open-source;
* Link to the model weights if they are available.
If you are willing to contribute the model yourself, let us know so we can best
guide you.
We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them
in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates) folder.
### Do you want a new feature (that is not a model)?
A world-class feature request addresses the following points:
1. Motivation first:
* Is it related to a problem/frustration with the library? If so, please explain
why. Providing a code snippet that demonstrates the problem is best.
* Is it related to something you would need for a project? We'd love to hear
about it!
* Is it something you worked on and think could benefit the community?
Awesome! Tell us what problem it solved for you.
2. Write a *full paragraph* describing the feature;
3. Provide a **code snippet** that demonstrates its future use;
4. In case this is related to a paper, please attach a link;
5. Attach any additional information (drawings, screenshots, etc.) you think may help.
If your issue is well written we're already 80% of the way there by the time you
post it.
We have added **templates** to guide you in the process of adding a new example script for training or testing the
models in the library. You can find them in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates)
folder.
## Start contributing! (Pull Requests)
Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
You will need basic `git` proficiency to be able to contribute to
`transformers`. `git` is not the easiest tool to use but it has the greatest
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.
Follow these steps to start contributing:
1. Fork the [repository](https://github.com/huggingface/transformers) by
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
under your GitHub user account.
2. Clone your fork to your local disk, and add the base repository as a remote:
```bash
$ git clone git@github.com:<your Github handle>/transformers.git
$ cd transformers
$ git remote add upstream https://github.com/huggingface/transformers.git
```
3. Create a new branch to hold your development changes:
```bash
$ git checkout -b a-descriptive-name-for-my-changes
```
**Do not** work on the `master` branch.
4. Set up a development environment by running the following command in a virtual environment:
```bash
$ pip install -e ".[dev]"
```
(If transformers was already installed in the virtual environment, remove
it with `pip uninstall transformers` before reinstalling it in editable
mode with the `-e` flag.)
To run the full test suite, you might need the additional dependency on `datasets` which requires a separate source
install:
```bash
$ git clone https://github.com/huggingface/datasets
$ cd datasets
$ pip install -e .
```
If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `datasets`
library.
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
passes:
```bash
$ make test
```
Note that this command uses the `-n auto` pytest flag, so it will start as many parallel `pytest` processes as your computer has CPU cores; if you have lots of those, a few GPUs, and not a great amount of RAM, it's likely to overload your computer. To run the test suite, you may therefore want to use this command instead:
```bash
$ python -m pytest -n 3 --dist=loadfile -s -v ./tests/
```
Adjust the value of `-n` to fit the load your hardware can support.
`transformers` relies on `black` and `isort` to format its source code
consistently. After you make changes, format them with:
```bash
$ make style
```
`transformers` also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
control runs in CI; however, you can also run the same checks locally with:
```bash
$ make quality
```
You can run the automatic style corrections and the code verifications that can't be automated, all in one go, with:
```bash
$ make fixup
```
This target is also optimized to only work with files modified by the PR you're working on.
If you're modifying documents under `docs/source`, make sure to validate that
they can still be built. This check also runs in CI. To run a local check,
make sure you have installed the documentation builder requirements by
running `pip install .[tf,torch,docs]` once from the root of this repository,
and then run:
```bash
$ make docs
```
Once you're happy with your changes, add changed files using `git add` and
make a commit with `git commit` to record your changes locally:
```bash
$ git add modified_file.py
$ git commit
```
Please write [good commit
messages](https://chris.beams.io/posts/git-commit/).
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
```bash
$ git fetch upstream
$ git rebase upstream/master
```
Push the changes to your account using:
```bash
$ git push -u origin a-descriptive-name-for-my-changes
```
6. Once you are satisfied (**and the checklist below is happy too**), go to the
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
to the project maintainers for review.
7. It's OK if maintainers ask you for changes. It happens to core contributors
too! So that everyone can see the changes in the pull request, work in your local
branch and push the changes to your fork. They will automatically appear in
the pull request.
### Checklist
1. The title of your pull request should be a summary of its contribution;
2. If your pull request addresses an issue, please mention the issue number in
the pull request description to make sure they are linked (and people
consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`. These
are useful to avoid duplicated work, and to differentiate it from PRs ready
to be merged;
4. Make sure existing tests pass;
5. Add high-coverage tests. No quality testing = no merge.
- If you are adding a new model, make sure that you use
`ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
- If you are adding new `@slow` tests, make sure they pass using
`RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
- If you are adding a new tokenizer, write tests, and make sure
`RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
CircleCI does not run the slow tests, but GitHub Actions does every night!
6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an
example.
### Tests
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the
[examples folder](https://github.com/huggingface/transformers/tree/master/examples).
We like `pytest` and `pytest-xdist` because they're faster. From the root of the
repository, here's how to run tests with `pytest` for the library:
```bash
$ python -m pytest -n auto --dist=loadfile -s -v ./tests/
```
and for the examples:
```bash
$ pip install -r examples/requirements.txt # only needed the first time
$ python -m pytest -n auto --dist=loadfile -s -v ./examples/
```
In fact, that's how `make test` and `make test-examples` are implemented (sans the `pip install` line)!
You can specify a smaller set of tests in order to test only the feature
you're working on.
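For instance, you can point `pytest` at a single test file, or filter by test name with the `-k` flag (the file and expression below are illustrative):
```bash
$ python -m pytest tests/test_modeling_bert.py -v
$ python -m pytest tests/test_modeling_bert.py -k "attention" -v
```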
By default, slow tests are skipped. Set the `RUN_SLOW` environment variable to
`yes` to run them. This will download many gigabytes of models — make sure you
have enough disk space and a good Internet connection, or a lot of patience!
```bash
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/
$ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/
```
Likewise, set the `RUN_CUSTOM_TOKENIZERS` environment variable to `yes` to run
tests for custom tokenizers, which don't run by default either.
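For example (the tokenizer test file named here is illustrative):
```bash
$ RUN_CUSTOM_TOKENIZERS=yes python -m pytest -s -v ./tests/test_tokenization_bert_japanese.py
```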
🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
`pytest`-specific features in the test suite itself.
This means `unittest` is fully supported. Here's how to run tests with
`unittest`:
```bash
$ python -m unittest discover -s tests -t . -v
$ python -m unittest discover -s examples -t examples -v
```
### Style guide
For documentation strings, `transformers` follows the [Google style](https://google.github.io/styleguide/pyguide.html).
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification)
for more information.
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
### Develop on Windows
On Windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
`git config core.autocrlf input`
One way to run the `make` command on Windows is via MSYS2:
1. [Download MSYS2](https://www.msys2.org/); we assume it is installed in C:\msys64
2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
4. Add `C:\msys64\usr\bin` to your PATH environment variable.
You can now use `make` from any terminal (PowerShell, cmd.exe, etc.) 🎉
### Syncing forked master with upstream (HuggingFace) master
To avoid pinging the upstream repository, which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
when syncing the master branch of a forked repository please follow these steps:
1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
2. If a PR is absolutely necessary, use the following steps after checking out your branch:
```
$ git checkout -b your-branch-for-syncing
$ git pull --squash --no-commit upstream master
$ git commit -m '<your message without GitHub references>'
$ git push --set-upstream origin your-branch-for-syncing
```

ISSUES.md Normal file

@@ -0,0 +1,275 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# How To Request Support
This is an open-source project, so please be mindful that, as in any other project of this kind, there is no obligation to answer every request for help.
However, we want to encourage you to ask for help whenever you think it's needed! We are happy about every question we get, because it allows us to better understand your needs and possible misunderstandings, and most importantly it is a way for you to help us make this library better. That being said, this document's main purpose is to provide guidelines on how you can formulate your requests to increase your chances of being understood and of getting support.
There are two main venues to receive support: [the forums](https://discuss.huggingface.co/) and [the GitHub issues](https://github.com/huggingface/transformers/issues).
## The Forums
[The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.
If you have difficulty deploying this library, have questions, or would like to discuss a new feature, please first consider discussing those things on the forums. Only when you feel your subject matter has been crystallized and you still need support from the library developers should you proceed to file an [issue](https://github.com/huggingface/transformers/issues).
In particular, all "Please explain" questions or objectively very user-specific feature requests belong on the forums. Here are some examples of such questions:
* "I would like to use a BertModel within a RL-Agent for a customer support service. How can I use a BertForMaskedLM in my ChatBotModel?"
* "Could you please explain why T5 has no positional embedding matrix under T5Model?"
* "How should I set my generation parameters for translation?"
* "How to train T5 on De->En translation?"
## The GitHub Issues
Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
You are not required to read the following guidelines before opening an issue. However, if you notice that your issue doesn't get any replies, chances are the developers are having difficulty with its quality. In that case, reading the following points and adjusting your issue accordingly could help.
1. Before posting an issue, first search for already posted issues, since chances are someone has already asked a similar question before you.
If you use Google your search query should be:
```
"huggingface" "transformers" your query
```
The first two quoted words tell Google to limit the search to the context of Hugging Face Transformers. The remainder is your query; most commonly this would be the error message the software fails with. We will go deeper into the details shortly.
The results of such a query will typically match GitHub issues, Hugging Face forums, StackExchange, and blogs.
If you find relevant hints, you may choose to continue the discussion there if you have follow up questions.
If what you found is similar but doesn't quite answer your problem, please, post a new issue and do include links to similar issues or forum discussions you may have found.
Let's look at some examples:
The error message tells us what went wrong. Here is an example:
```python
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/transformers/src/transformers/__init__.py", line 34, in <module>
from . import dependency_versions_check
File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
from .file_utils import is_tokenizers_available
File "/transformers/src/transformers/file_utils.py", line 40, in <module>
from tqdm.auto import tqdm
ModuleNotFoundError: No module named 'tqdm.auto'
```
and it typically includes a traceback, so that we can see the full stack of calls the program made before it failed. This gives us the context to know why the program failed.
Going back to the above example: if you received this error, look at the very last line of the error, which is:
```python
ModuleNotFoundError: No module named 'tqdm.auto'
```
Now we can use that last line as the search query in your favorite search engine:
1. first for `"huggingface" "transformers" "ModuleNotFoundError: No module named 'tqdm.auto'"`
2. if you don't find relevant results, then search for just `"ModuleNotFoundError: No module named 'tqdm.auto'"`
3. and finally, if still nothing comes up, remove the outside quotes: `ModuleNotFoundError: No module named 'tqdm.auto'`
If the error message includes any bits unique to your filesystem, always remove those from the search query, since other users will not have the same filesystem as yours. For example:
```bash
python -c 'open("/tmp/wrong_path.txt", "r")'
Traceback (most recent call last):
File "<string>", line 1, in <module>
FileNotFoundError: [Errno 2] No such file or directory: '/tmp/wrong_path.txt'
```
Here you'd search for just: `"FileNotFoundError: [Errno 2] No such file or directory"`
If the local information you removed was inside the error message itself, you may also need to remove the double quotes, since your query is no longer exact. So if the error message was something like:
```bash
ValueError: '/tmp/wrong_path.txt' cannot be found
```
then you'd search for `"ValueError" "cannot be found"`
As you search, you will notice that when you don't use quotes, search engines often return a variety of unrelated hits, which may or may not be what you want.
Experiment with different ways and find which approach gives the most satisfactory results.
2. Keep the issue short, providing the information that you think will help the developers understand your situation. Put yourself in the shoes of a person who has never seen your code and knows nothing about your custom setup. This mental exercise will help you develop an intuition for what to share and what not to.
3. If there is a software failure, always provide the full traceback, for example:
```python
$ python -c 'import transformers'
Traceback (most recent call last):
File "<string>", line 1, in <module>
File "/transformers/src/transformers/__init__.py", line 34, in <module>
from . import dependency_versions_check
File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
from .file_utils import is_tokenizers_available
File "/transformers/src/transformers/file_utils.py", line 40, in <module>
from tqdm.auto import tqdm
ModuleNotFoundError: No module named 'tqdm.auto'
```
As compared to providing just the last line of the error message, e.g.:
```python
ModuleNotFoundError: No module named 'tqdm.auto'
```
which is not sufficient.
If your application is running on more than one GPU (e.g. under `DistributedDataParallel`) and is therefore printing every log and traceback multiple times, please make sure that you paste only one copy. At times the tracebacks from parallel processes may get interleaved, so either disentangle them or change the loggers to log only for `local_rank==0` so that only one process logs things.
4. When quoting a traceback, command-line instructions, or any type of code, always enclose it in triple backticks inside the editor window, that is:
````
```
git clone https://github.com/huggingface/transformers
cd transformers
pip install .
```
````
If it's a command line with a long argument list, please consider breaking it down using backslashes and new lines. Here is an example of a good command line quote:
```bash
cd examples/seq2seq
python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \
--do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation \
--fp16 --sharded_ddp
```
If you don't break it up, one has to scroll horizontally, which often makes it quite difficult to quickly see what's happening.
The backslashes allow us to copy the command directly into the console to run it, without needing to edit it.
5. Include only the important information that you think will help the developer to quickly identify the problem.
For example, applications often create huge amounts of logs. Ask yourself whether providing all or only part of the log is useful.
Pasting 100-1000 lines of log into the issue is an immediate turn-off, since it will take a lot of time to figure out where the pertinent parts of the log are.
Attaching a full log can be helpful if it's enclosed in the following HTML code in the comment editor window:
```
<details>
<summary>Full log</summary>
<pre>
many
lines
go
here
</pre>
</details>
```
which would result in the following entry, which can be opened if desired, but otherwise takes little space.
<details>
<summary>Full log</summary>
<pre>
many
lines
go
here
</pre>
</details>
You could also provide a link to a pastebin service, but this is less beneficial since those links tend to expire quickly and future readers of your issue might not be able to access that log file anymore and may lack some context.
6. If this is an issue in your code, do try to reduce that code to a minimal example that still demonstrates the problem. Please ask on the forums if you have a hard time figuring out how to do that. Please realize that we don't have the luxury of time to try to understand all of your custom code.
If you really tried to produce a short reproducible example but couldn't figure it out, the traceback alone might give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.
Do not despair if you can't figure it out from the beginning; just share what you can, and perhaps someone else will be able to help you on the forums.
7. If you forked off some of this project's code or example applications, please do not ask us to go into your code repository and figure out what you may have done. The code is already very complex, and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with the time on their hands to make a lengthy investigation. That said, you might find someone on the forums who will be generous enough to do this for you.
8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
We understand that this is not always possible, especially when APIs change, in which case file an issue against the highest library version your environment can support.
Of course, if you upgrade the library, always retest that the problem is still there.
9. Please do not ask us to reproduce an issue with your custom data, since we don't have it. Either use some existing dataset supported by HF datasets, supply code that generates a small sample on the fly, or provide some other quick and simple way to get it.
Please do not send us any non-public domain data that may require a license or a permission to be used.
10. Do not tag multiple developers on the issue unless you know this is expected, either because you asked them and they gave you explicit permission to tag them, or because the issue template instructs you to do so.
The "who to tag for what domain" part of the issue template is there to help users direct their questions to the right developers who are designated maintainers of project's specific domains. They can then decide at their own discretion to tag other developers if they feel it'd help move the issue forward.
We currently don't have a triage service and we trust your capacity to identify the right domain and thus the persons to tag in your issue. If you are not sure, please use the forums to ask for guidance.
When in doubt, err on the side of not tagging a given person. If you tag multiple people without context or permission, don't be surprised if you get no response at all. Please remember that every time you tag someone, they get a notification, and you're taking their time without their permission. Please be sensitive to that.
If you got helped by one of the developers in the past please don't tag them in future issues, unless they are listed in the issue template for the domain you are asking about or that developer gave you an explicit permission to tag them in future issues.
If you see a certain developer doing multiple and/or recent commits into a specific area of the project that you feel is relevant to your issue, it is not a good reason to tag them. Various developers may be fixing things that prevent them from moving forward, but often their work is focused on a totally different domain. And while they may or may not know how to help you with the problem at hand, it would benefit the whole community much more if they focus on the domain of their unique expertise.
11. Use the Edit button. Take your time, and re-read and improve the wording and formatting to make your posts and comments as easy to understand as possible.
Avoid posting multiple comments in a row, as each comment generates a notification for the developers tagged in that issue. If you happen to post multiple comments in a row and nobody has followed up yet, consider merging those into one or a few comments, editing the combined content to be coherent.
If you choose to edit your older comments after others have posted follow-up comments, you need to be aware that your modifications might not be noticed, so unless it's just a typo fix, try to write a new comment flagging that something has been changed in the previous comments.
For example, the very first comment is the most important one. If, while the thread unfolds, you realize that things aren't as they seemed to you originally, you may want to edit the first post to reflect the up-to-date understanding of the issue at hand, so that it helps those who read your issue in the future quickly understand what's going on without needing to sift through dozens of comments. It also helps to indicate that the post was edited, so that those reading the thread later can understand why there might be a certain discontinuity in the information flow.
Use bullets and items if you have lists of items and the outcome improves overall readability.
Use backticks to refer to class and function names, e.g. `BartModel` and `generate`, as these stand out and improve the speed of a reader's comprehension.
Try not to use italics and bold text too much, as these often make the text more difficult to read.
12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter, it can be quite hard to find which specific comment you're referring to.
To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
For example the first link is a link to an issue, and the second to a specific comment in the same issue:
1. https://github.com/huggingface/transformers/issues/9257
2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
13. If you are replying to the last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
But if you're replying to a comment that was made several comments back, it's always good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example, your editor box will look like:
```
> How big is your gpu cluster?
Our cluster is made of 256 gpus.
```
If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
In general, the best way to figure out what works best is to learn from issues posted by other people: see which issues get great responses and which get little to no response, and observe what the posters who received great responses did differently from those who did not.
Thank you for reading this somewhat lengthy document. We would like to conclude that these are not absolute rules, but friendly advice that will help maximize the chances for us to understand what you are trying to communicate, reproduce the problem, and then resolve it to your satisfaction and the benefit of the whole community.
If after reading this document there are remaining questions on how and why or there is a need for further elucidation, please, don't hesitate to ask your question in [this thread](https://discuss.huggingface.co/t/how-to-request-support/3128).


@ -1,3 +1,4 @@
Copyright 2018- The Hugging Face team. All rights reserved.
Apache License
Version 2.0, January 2004

70
Makefile Normal file

@ -0,0 +1,70 @@
.PHONY: deps_table_update modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs

check_dirs := examples tests src utils

modified_only_fixup:
	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
	@if test -n "$(modified_py_files)"; then \
		echo "Checking/fixing $(modified_py_files)"; \
		black $(modified_py_files); \
		isort $(modified_py_files); \
		flake8 $(modified_py_files); \
	else \
		echo "No library .py files were modified"; \
	fi

# Update src/transformers/dependency_versions_table.py
deps_table_update:
	@python setup.py deps_table_update

# Check that source code meets quality standards
extra_quality_checks: deps_table_update
	python utils/check_copies.py
	python utils/check_table.py
	python utils/check_dummies.py
	python utils/check_repo.py
	python utils/style_doc.py src/transformers docs/source --max_len 119

# This target runs checks on all files
quality:
	black --check $(check_dirs)
	isort --check-only $(check_dirs)
	flake8 $(check_dirs)
	python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
	${MAKE} extra_quality_checks

# Format source code automatically and check if there are any problems left that need manual fixing
style: deps_table_update
	black $(check_dirs)
	isort $(check_dirs)
	python utils/style_doc.py src/transformers docs/source --max_len 119

# Super fast fix and check target that only works on relevant modified files since the branch was made
fixup: modified_only_fixup extra_quality_checks

# Make marked copies of snippets of code conform to the original
fix-copies:
	python utils/check_copies.py --fix_and_overwrite
	python utils/check_table.py --fix_and_overwrite
	python utils/check_dummies.py --fix_and_overwrite

# Run tests for the library
test:
	python -m pytest -n auto --dist=loadfile -s -v ./tests/

# Run tests for examples
test-examples:
	python -m pytest -n auto --dist=loadfile -s -v ./examples/

# Check that docs can build
docs:
	cd docs && make html SPHINXOPTS="-W -j 4"

701
README.md

@ -1,10 +1,26 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<p align="center">
<br>
<img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
<br>
</p>
<p align="center">
<a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
<a href="https://circleci.com/gh/huggingface/transformers">
<img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
</a>
<a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
@ -16,511 +32,236 @@
<a href="https://github.com/huggingface/transformers/releases">
<img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
</a>
<a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
<img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
</a>
</p>
<h3 align="center">
<p>State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch
<p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
</h3>
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.
### Features
🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture can be used as a standalone and modified to enable quick research experiments.
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
- High performance on NLU and NLG tasks
- Low barrier to entry for educators and practitioners
🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.
State-of-the-art NLP for everyone
- Deep learning researchers
- Hands-on practitioners
- AI/ML/NLP teachers and educators
## Online demos
Lower compute costs, smaller carbon footprint
- Researchers can share trained models instead of always retraining
- Practitioners can reduce compute time and production costs
- 8 architectures with over 30 pretrained models, some in more than 100 languages
You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) to use those models.
Choose the right framework for every part of a model's lifetime
- Train state-of-the-art models in 3 lines of code
- Deep interoperability between TensorFlow 2.0 and PyTorch models
- Move a single model between TF2.0/PyTorch frameworks at will
- Seamlessly pick the right framework for training, evaluation, production
Here are a few examples:
- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo's text generation capabilities.
| Section | Description |
|-|-|
| [Installation](#installation) | How to install the package |
| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
| [Online demo](#online-demo) | Experimenting with this repo's text generation capabilities |
| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
| [Quick tour: TF 2.0 and PyTorch](#Quick-tour-TF-2.0-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
## Quick tour
To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model training. Here is how to quickly use a pipeline to classify positive versus negative texts
```python
>>> from transformers import pipeline
# Allocate a pipeline for sentiment-analysis
>>> classifier = pipeline('sentiment-analysis')
>>> classifier('We are very happy to include pipeline into the transformers repository.')
[{'label': 'POSITIVE', 'score': 0.9978193640708923}]
```
The second line of code downloads and caches the pretrained model used by the pipeline, the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%.
Here is another example, using a pipeline for question answering, which extracts an answer from a given context:
```python
>>> from transformers import pipeline
# Allocate a pipeline for question-answering
>>> question_answerer = pipeline('question-answering')
>>> question_answerer({
... 'question': 'What is the name of the repository ?',
... 'context': 'Pipeline have been included in the huggingface/transformers repository'
... })
{'score': 0.5135612454720828, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
```
On top of the answer, the pretrained model used here returned its confidence score, along with the start and end positions of the answer in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).
To download and use any of the pretrained models on your given task, you just need these three lines of code (PyTorch version):
```python
>>> from transformers import AutoTokenizer, AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = AutoModel.from_pretrained("bert-base-uncased")
>>> inputs = tokenizer("Hello world!", return_tensors="pt")
>>> outputs = model(**inputs)
```
or for TensorFlow:
```python
>>> from transformers import AutoTokenizer, TFAutoModel
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
>>> inputs = tokenizer("Hello world!", return_tensors="tf")
>>> outputs = model(**inputs)
```
The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single text or a list of texts (as we can see on the fourth line of both code examples); a batched call is sketched below. It will output a dictionary you can directly pass to your model (which is done on the fifth line).
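For instance, here is a minimal sketch of calling the tokenizer on a list of texts; the padding/truncation flags and the printed shape are illustrative, not requirements:
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
batch = tokenizer(
    ["Hello world!", "A slightly longer second sentence."],
    padding=True,         # pad to the longest sequence in the batch
    truncation=True,      # cut anything beyond the model's maximum length
    return_tensors="pt",  # or "tf" for TensorFlow models
)
print(batch["input_ids"].shape)  # e.g. torch.Size([2, 8]) after padding
```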
The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. For instance, [this tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model in a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset; a minimal `Trainer` sketch follows.
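To make the `Trainer` mention concrete, here is a minimal fine-tuning sketch; the toy dataset, hyper-parameters and output directory are illustrative assumptions, not recommendations:
```python
import torch
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

# Two toy examples standing in for a real dataset
texts, labels = ["I love this!", "This is terrible."], [1, 0]
encodings = tokenizer(texts, padding=True, truncation=True)

class ToyDataset(torch.utils.data.Dataset):
    # Wraps the tokenizer output so the Trainer can index it
    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels
    def __getitem__(self, idx):
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    def __len__(self):
        return len(self.labels)

args = TrainingArguments(output_dir="toy_output", num_train_epochs=1,
                         per_device_train_batch_size=2)
Trainer(model=model, args=args, train_dataset=ToyDataset(encodings, labels)).train()
```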
## Why should I use transformers?
1. Easy-to-use state-of-the-art models:
- High performance on NLU and NLG tasks.
- Low barrier to entry for educators and practitioners.
- Few user-facing abstractions with just three classes to learn.
- A unified API for using all our pretrained models.
1. Lower compute costs, smaller carbon footprint:
- Researchers can share trained models instead of always retraining.
- Practitioners can reduce compute time and production costs.
- Dozens of architectures with over 2,000 pretrained models, some in more than 100 languages.
1. Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code.
- Move a single model between TF2.0/PyTorch frameworks at will.
- Seamlessly pick the right framework for training, evaluation, production.
1. Easily customize a model or an example to your needs:
- Examples for each architecture to reproduce the results by the official authors of said architecture.
- Expose the models' internals as consistently as possible.
- Model files can be used independently of the library for quick experiments.
## Why shouldn't I use transformers?
- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library.
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/master/examples) are just that: examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
## Installation
This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
### With pip
Transformers can be installed by pip as follows:
This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
First, create a virtual environment with the version of Python you're going to use and activate it.
Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform and/or [Flax installation page](https://github.com/google/flax#quick-install).
When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
```bash
pip install transformers
```
### From source
If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).
Clone the repository and run:
```bash
pip install [--editable] .
```
### With conda
Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
🤗 Transformers can be installed using conda as follows:
```bash
conda install -c huggingface transformers
```
### Tests
A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
You can run the tests from the root of the cloned repository with the commands:
```bash
python -m pytest -sv ./transformers/tests/
python -m pytest -sv ./examples/
```
### Do you want to run a Transformer model on a mobile device?
You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
It contains an example of a conversion script from a PyTorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
## Model architectures
🤗 Transformers currently provides 8 NLU/NLG architectures:
1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
3. **[GPT-2](https://blog.openai.com/better-language-models/)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5) by Victor Sanh, Lysandre Debut and Thomas Wolf.
These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Pearson R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
## Online demo
**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo's text generation capabilities.
You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
> “🦄 Write with transformer is to writing what calculators are to calculus.”
![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)
## Quick tour
Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
```python
import torch
from transformers import *
# Transformers has a unified API
# for 8 transformer architectures and 30 pretrained weights.
#           Model          | Tokenizer          | Pretrained weights shortcut
MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
          (RobertaModel,    RobertaTokenizer,    'roberta-base')]

# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`

# Let's encode some text in a sequence of hidden-states using each model:
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Load pretrained model/tokenizer
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # Encode text
    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # add_special_tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples

# Each architecture is provided with several classes for fine-tuning on down-stream tasks, e.g.
BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
                      BertForSequenceClassification, BertForMultipleChoice, BertForTokenClassification,
                      BertForQuestionAnswering]

# All the classes for an architecture can be instantiated from pretrained weights for this architecture.
# Note that additional weights added for fine-tuning are only initialized
# and need to be trained on the down-stream task
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
for model_class in BERT_MODEL_CLASSES:
    # Load pretrained model/tokenizer
    model = model_class.from_pretrained('bert-base-uncased')

    # Models can return full list of hidden-states & attentions weights at each layer
    model = model_class.from_pretrained('bert-base-uncased',
                                        output_hidden_states=True,
                                        output_attentions=True)
    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
    all_hidden_states, all_attentions = model(input_ids)[-2:]

    # Models are compatible with Torchscript
    model = model_class.from_pretrained('bert-base-uncased', torchscript=True)
    traced_model = torch.jit.trace(model, (input_ids,))

    # Simple serialization for models and tokenizers
    model.save_pretrained('./directory/to/save/')  # save
    model = model_class.from_pretrained('./directory/to/save/')  # re-load
    tokenizer.save_pretrained('./directory/to/save/')  # save
    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load

# SOTA examples for GLUE, SQUAD, text generation...
```
## Quick tour TF 2.0 training and PyTorch interoperability
Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
```python
import tensorflow as tf
import tensorflow_datasets
from pytorch_transformers import *
# Load dataset, tokenizer, model from pretrained model/vocabulary
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
data = tensorflow_datasets.load('glue/mrpc')
# Prepare dataset for GLUE as a tf.data.Dataset instance
train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
valid_dataset = valid_dataset.batch(64)
# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
# Train and evaluate using tf.keras.Model.fit()
history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
                    validation_data=valid_dataset, validation_steps=7)
# Load the TensorFlow model in PyTorch for inspection
model.save_pretrained('./save/')
pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
sentence_0 = "This research was consistent with his findings."
sentence_1 = "His findings were compatible with this research."
sentence_2 = "His findings were not compatible with this research."
inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
```
## Quick tour of the fine-tuning/usage scripts
The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
- `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning Bert, XLNet and XLM on the question answering dataset SQuAD 2.0 (*token-level classification*)
- `run_generation.py`: an example using GPT, GPT-2, Transformer-XL and XLNet for conditional language generation
- other model-specific examples (see the documentation).
Here are three quick usage examples for these scripts:
### `run_glue.py`: Fine-tuning on GLUE tasks for sequence classification
The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.
Before running any one of these GLUE tasks you should download the
[GLUE data](https://gluebenchmark.com/tasks) by running
[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
and unpack it to some directory `$GLUE_DIR`.
You should also install the additional packages required by the examples:
```shell
pip install -r ./examples/requirements.txt
```
```shell
export GLUE_DIR=/path/to/glue
export TASK_NAME=MRPC
python ./examples/run_glue.py \
--model_type bert \
--model_name_or_path bert-base-uncased \
--task_name $TASK_NAME \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/$TASK_NAME \
--max_seq_length 128 \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/$TASK_NAME/
```
where the task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
#### Fine-tuning XLNet model on the STS-B regression task
This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below).
```shell
export GLUE_DIR=/path/to/glue
python ./examples/run_glue.py \
--model_type xlnet \
--model_name_or_path xlnet-large-cased \
--do_train \
--do_eval \
--task_name=sts-b \
--data_dir=${GLUE_DIR}/STS-B \
--output_dir=./proc_data/sts-b-110 \
--max_seq_length=128 \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8 \
--gradient_accumulation_steps=1 \
--max_steps=1200 \
--model_name=xlnet-large-cased \
--overwrite_output_dir \
--overwrite_cache \
--warmup_steps=120
```
On this machine we thus have an effective batch size of 32 (4 GPUs × a per-GPU batch size of 8); please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine (for example, with 2 GPUs and a per-GPU batch size of 8, set `gradient_accumulation_steps=2`). These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
#### Fine-tuning Bert model on the MRPC classification task
This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach an F1 > 92.
```bash
python -m torch.distributed.launch --nproc_per_node 8 ./examples/run_glue.py \
--model_type bert \
--model_name_or_path bert-large-uncased-whole-word-masking \
--task_name MRPC \
--do_train \
--do_eval \
--do_lower_case \
--data_dir $GLUE_DIR/MRPC/ \
--max_seq_length 128 \
--per_gpu_eval_batch_size=8 \
--per_gpu_train_batch_size=8 \
--learning_rate 2e-5 \
--num_train_epochs 3.0 \
--output_dir /tmp/mrpc_output/ \
--overwrite_output_dir \
--overwrite_cache
```
Training with these hyper-parameters gave us the following results:
```bash
acc = 0.8823529411764706
acc_and_f1 = 0.901702786377709
eval_loss = 0.3418912578906332
f1 = 0.9210526315789473
global_step = 174
loss = 0.07231863956341798
```
### `run_squad.py`: Fine-tuning on SQuAD for question-answering
This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and the Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
```bash
python -m torch.distributed.launch --nproc_per_node=8 ./examples/run_squad.py \
--model_type bert \
--model_name_or_path bert-large-uncased-whole-word-masking \
--do_train \
--do_eval \
--do_lower_case \
--train_file $SQUAD_DIR/train-v1.1.json \
--predict_file $SQUAD_DIR/dev-v1.1.json \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ../models/wwm_uncased_finetuned_squad/ \
--per_gpu_eval_batch_size=3 \
--per_gpu_train_batch_size=3
```
Training with these hyper-parameters gave us the following results:
```bash
python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
{"exact_match": 86.91579943235573, "f1": 93.1532499015869}
```
This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
A conditional generation script is also included to generate text from a prompt.
The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
Here is how to run the script with the small version of OpenAI GPT-2 model:
```shell
python ./examples/run_generation.py \
--model_type=gpt2 \
--length=20 \
--model_name_or_path=gpt2
```
## Migrating from pytorch-transformers to transformers
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
If you used to call the models with keyword names for keyword arguments, e.g. `model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
If you used to call the models with positional inputs for keyword arguments, e.g. `model(input_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments; a minimal sketch of the difference follows.
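Here is a minimal sketch of the two calling styles, using BERT as an arbitrary example (variable names are illustrative):
```python
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
enc = tokenizer("Hello world!", return_tensors="pt")

# Safe: keyword arguments are unaffected by any signature reordering
out_kw = model(enc["input_ids"], attention_mask=enc["attention_mask"],
               token_type_ids=enc["token_type_ids"])

# Fragile: positional arguments depend on the exact signature order,
# which changed for some models; double-check before relying on this
out_pos = model(enc["input_ids"], enc["attention_mask"], enc["token_type_ids"])
```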
## Migrating from pytorch-pretrained-bert to transformers
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
### Models always output `tuples`
The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
```python
# Let's load our model
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
# If you used to have this line in pytorch-pretrained-bert:
loss = model(input_ids, labels=labels)
# Now just use this line in transformers to extract the loss from the output tuple:
outputs = model(input_ids, labels=labels)
loss = outputs[0]
# In transformers you can also have access to the logits:
loss, logits = outputs[:2]
# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
outputs = model(input_ids, labels=labels)
loss, logits, attentions = outputs
```
### Serialization
Breaking changes in the `from_pretrained()` method:
1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules, as illustrated in the sketch after this list.
2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
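Here is a minimal sketch of the first point (the checkpoint name is just an example):
```python
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
print(model.training)  # False: the model is loaded in evaluation mode by default

model.train()   # re-activate dropout modules before fine-tuning
model.eval()    # switch back to evaluation mode for inference
```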
Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
Here is an example:
```python
### Let's load a model and tokenizer
from transformers import BertForSequenceClassification, BertTokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
### Do some stuff to our model and tokenizer
# Ex: add new tokens to the vocabulary and embeddings of our model
tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
model.resize_token_embeddings(len(tokenizer))
# Train our model
train(model)
### Now let's save our model and tokenizer to a directory
model.save_pretrained('./my_saved_model_directory/')
tokenizer.save_pretrained('./my_saved_model_directory/')
### Reload the model and the tokenizer
model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
```
### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
- it only implements weight decay correction,
- schedules are now external (see below),
- gradient clipping is now also external (see below).
The new optimizer `AdamW` matches the PyTorch `Adam` optimizer API and lets you use standard PyTorch or apex methods for the schedule and clipping.
The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
Here is a conversion example from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
```python
# Parameters:
lr = 1e-3
max_grad_norm = 1.0
num_total_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_total_steps) # 0.1
### Previously BertAdam optimizer was instantiated like this:
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_total_steps)
### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    optimizer.step()

### In Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_total=num_total_steps)  # PyTorch scheduler

### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
```
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
## Models architectures
**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each of them):
1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable).
These implementations have been tested on several datasets (see the example scripts) and should match the performance of the original implementations. You can find more details on performance in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
## Learn more
| Section | Description |
|-|-|
| [Documentation](https://huggingface.co/transformers/) | Full API documentation and tutorials |
| [Task summary](https://huggingface.co/transformers/task_summary.html) | Tasks supported by 🤗 Transformers |
| [Preprocessing tutorial](https://huggingface.co/transformers/preprocessing.html) | Using the `Tokenizer` class to prepare data for the models |
| [Training and fine-tuning](https://huggingface.co/transformers/training.html) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/master/examples) | Example scripts for fine-tuning models on a wide range of tasks |
| [Model sharing and uploading](https://huggingface.co/transformers/model_sharing.html) | Upload and share your fine-tuned models with the community |
| [Migration](https://huggingface.co/transformers/migration.html) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` |
## Citation
We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
```bibtex
@inproceedings{wolf-etal-2020-transformers,
title = "Transformers: State-of-the-Art Natural Language Processing",
author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
month = oct,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
pages = "38--45"
}
```

View File

@ -1,7 +0,0 @@
FROM pytorch/pytorch:latest
RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext
RUN pip install transformers
WORKDIR /workspace

View File

@ -0,0 +1,26 @@
FROM ubuntu:18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
jupyter \
tensorflow-cpu \
torch
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -0,0 +1,31 @@
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
jupyter \
tensorflow \
torch
RUN git clone https://github.com/NVIDIA/apex
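# Build apex from source: the --cpp_ext and --cuda_ext options compile NVIDIA's fused C++/CUDA extensions used for mixed-precision training.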
RUN cd apex && \
python3 setup.py install && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -0,0 +1,25 @@
FROM ubuntu:18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
jupyter \
torch
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -0,0 +1,30 @@
FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
mkl \
torch
RUN git clone https://github.com/NVIDIA/apex
RUN cd apex && \
python3 setup.py install && \
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -0,0 +1,65 @@
FROM google/cloud-sdk:slim
# Build args.
ARG GITHUB_REF=refs/heads/master
# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
# wheels available; see below.
ENV PYTHON_VERSION=3.6
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
cmake \
git \
curl \
ca-certificates
# Install conda and python.
# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
chmod +x ~/miniconda.sh && \
~/miniconda.sh -b && \
rm ~/miniconda.sh
ENV PATH=/root/miniconda3/bin:$PATH
RUN conda create -y --name container python=$PYTHON_VERSION
# Run the rest of commands within the new conda env.
# Use absolute path to appease Codefactor.
SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
RUN conda install -y python=$PYTHON_VERSION mkl
RUN pip uninstall -y torch && \
# Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
gsutil cp "gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
gsutil cp "gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
gsutil cp "gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" . && \
pip install "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
pip install "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
pip install "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
rm "torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
rm "torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
rm "torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl" && \
apt-get install -y libomp5
ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
# Install huggingface/transformers at the current PR, plus dependencies.
RUN git clone https://github.com/huggingface/transformers.git && \
cd transformers && \
git fetch origin $GITHUB_REF:CI && \
git checkout CI && \
cd .. && \
pip install ./transformers && \
pip install -r ./transformers/examples/requirements.txt && \
pip install pytest
RUN python -c "import torch_xla; print(torch_xla.__version__)"
RUN python -c "import transformers as trf; print(trf.__version__)"
RUN conda init bash
COPY docker-entrypoint.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/docker-entrypoint.sh
ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
CMD ["bash"]

View File

@ -0,0 +1,38 @@
local base = import 'templates/base.libsonnet';
local tpus = import 'templates/tpus.libsonnet';
local utils = import "templates/utils.libsonnet";
local volumes = import "templates/volumes.libsonnet";
local bertBaseCased = base.BaseTest {
frameworkPrefix: "hf",
modelName: "bert-base-cased",
mode: "example",
configMaps: [],
timeout: 3600, # 1 hour, in seconds
image: std.extVar('image'),
imageTag: std.extVar('image-tag'),
tpuSettings+: {
softwareVersion: "pytorch-nightly",
},
accelerator: tpus.v3_8,
volumeMap+: {
datasets: volumes.PersistentVolumeSpec {
name: "huggingface-cluster-disk",
mountPath: "/datasets",
},
},
command: utils.scriptCommand(
|||
python -m pytest -s transformers/examples/test_xla_examples.py -v
test_exit_code=$?
echo "\nFinished running commands.\n"
test $test_exit_code -eq 0
|||
),
};
bertBaseCased.oneshotJob

View File

@ -0,0 +1,32 @@
apiVersion: v1
kind: PersistentVolume
metadata:
name: huggingface-cluster-disk
spec:
storageClassName: ""
capacity:
storage: 500Gi
accessModes:
- ReadOnlyMany
claimRef:
namespace: default
name: huggingface-cluster-disk-claim
gcePersistentDisk:
pdName: huggingface-cluster-disk
fsType: ext4
readOnly: true
---
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: huggingface-cluster-disk-claim
spec:
# Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
# A nil storageClassName value uses the default StorageClass. For details, see
# https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
storageClassName: ""
accessModes:
- ReadOnlyMany
resources:
requests:
storage: 1Ki

View File

@ -0,0 +1,8 @@
#!/bin/bash
source ~/.bashrc
echo "running docker-entrypoint.sh"
conda activate container
echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
echo "printed TPU info"
export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
exec "$@"#!/bin/bash

View File

@ -0,0 +1,25 @@
FROM ubuntu:18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
mkl \
tensorflow-cpu
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -0,0 +1,25 @@
FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
LABEL maintainer="Hugging Face"
LABEL repository="transformers"
RUN apt update && \
apt install -y bash \
build-essential \
git \
curl \
ca-certificates \
python3 \
python3-pip && \
rm -rf /var/lib/apt/lists
RUN python3 -m pip install --no-cache-dir --upgrade pip && \
python3 -m pip install --no-cache-dir \
mkl \
tensorflow
WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .
CMD ["/bin/bash"]

View File

@ -1,25 +1,49 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Generating the documentation
To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
you can install them with the following command, at the root of the code repository:
```bash
pip install -e ".[docs]"
```
---
**NOTE**
You only need to generate the documentation to inspect it locally (if you're planning changes and want to
check how they look before committing, for instance). You don't have to commit the built documentation.
---
## Packages installed
Here's an overview of all the packages installed. If you ran the previous command installing all packages, you do
not need to run the following commands.
Building it requires the package `sphinx` that you can
install using:
```bash
pip install -U sphinx
```
You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
[Read The Docs](https://readthedocs.org/). You can install it using the following command:
```bash
pip install sphinx_rtd_theme
```
@ -34,23 +58,19 @@ pip install recommonmark
## Building the documentation
Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
command to generate it:
```bash
ln -s ../../examples/README.md source/examples.md
```
Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
```bash
make html
```
A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
browser.
---
**NOTE**
If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
directory before rebuilding. Run the following command to clean and build:
```bash
make clean && make html
```
@ -65,3 +85,176 @@ It should build the static app that will be available under `/docs/_build/html`
Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
## Preview the documentation in a pull request
Once you have made your pull request, you can check what the documentation will look like after it's merged by
following these steps:
- Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to
expand them).
- Click on "details" next to the `ci/circleci: build_doc` check.
- In the new window, click on the "Artifacts" tab.
- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a
preview.
## Writing Documentation - Specification
The `huggingface/transformers` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is
mostly written in ReStructuredText
([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html),
[Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)).
### Adding a new tutorial
Adding a new tutorial or section is done in two steps:
- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/index.rst` on the correct toc-tree.
Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
four.
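For illustration, a minimal toc-tree entry in `./source/index.rst` could look like the following sketch (the file name `my_new_tutorial` and the caption are hypothetical):

```
.. toctree::
    :maxdepth: 2
    :caption: Using 🤗 Transformers

    my_new_tutorial
```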
### Adding a new model
When adding a new model:
- Create a file `xxx.rst` under `./source/model_doc` (don't hesitate to copy an existing file as a template).
- Link that file in `./source/index.rst` on the `model_doc` toc-tree.
- Write a short overview of the model:
- Overview with paper & authors
- Paper abstract
- Tips and tricks and how to use it best
- Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and
every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow.
The order is generally:
- Configuration
- Tokenizer
- PyTorch base model
- PyTorch head models
- TensorFlow base model
- TensorFlow head models
These classes should be added using the RST syntax. Usually as follows:
```
XXXConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XXXConfig
:members:
```
This will include every public method of the configuration that is documented. If for some reason you don't want a
method to be displayed in the documentation, you can instead explicitly list the methods that should be in the docs:
```
XXXTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.XXXTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
```
### Writing source documentation
Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as
an object using the :obj: syntax: :obj:\`like so\`. Note that argument names and objects like True, None or any strings
should usually be put in `code`.
When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
linked by Sphinx: :class:\`~transformers.XXXClass\`
When mentioning a function, it is recommended to use the :func: syntax as the mentioned function will be automatically
linked by Sphinx: :func:\`~transformers.function\`.
When mentioning a method, it is recommended to use the :meth: syntax as the mentioned method will be automatically
linked by Sphinx: :meth:\`~transformers.XXXClass.method\`.
Links should be done as so (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__
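Putting these together, a docstring sentence might look like the following sketch (the class and method named here are real library objects, chosen purely for illustration):

```
This method relies on :class:`~transformers.BertTokenizer`; see
:meth:`~transformers.PreTrainedTokenizer.encode` for details, or the
`glossary <../glossary.html#input-ids>`__.
```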
#### Defining arguments in a method
Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation.
The argument should be followed by its type, with its shape if it is a tensor, and a line return.
Another indentation is necessary before writing the description of the argument.
Here's an example showcasing everything so far:
```
Args:
input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using :class:`~transformers.AlbertTokenizer`.
See :meth:`~transformers.PreTrainedTokenizer.encode` and
:meth:`~transformers.PreTrainedTokenizer.__call__` for details.
`What are input IDs? <../glossary.html#input-ids>`__
```
For optional arguments or arguments with defaults, we use the following syntax. Imagine we have a function with the
following signature:
```
def my_function(x: str = None, a: float = 1):
```
then its documentation should look like this:
```
Args:
x (:obj:`str`, `optional`):
This argument controls ...
a (:obj:`float`, `optional`, defaults to 1):
This argument is used to ...
```
Note that we always omit the "defaults to :obj:\`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `input_ids`).
#### Writing a multi-line code block
Multi-line code blocks can be useful for displaying examples. They are done like so:
```
Example::
# first line of code
# second line
# etc
```
The `Example` string at the beginning can be replaced by anything as long as there are two colons following it.
We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
that the results stay consistent with the library.
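For instance, a doctest-style example could look like the following sketch (the model downloaded by `pipeline` and the exact output are illustrative):

```
Example::

    >>> from transformers import pipeline
    >>> classifier = pipeline("sentiment-analysis")
    >>> classifier("I love this library!")[0]["label"]
    'POSITIVE'
```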
#### Writing a return block
The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.
Here's an example for tuple return, comprising several objects:
```
Returns:
:obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
```
Here's an example for a single value return:
```
Returns:
:obj:`List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```

View File

@ -1,29 +0,0 @@
alabaster==0.7.12
Babel==2.7.0
certifi==2019.6.16
chardet==3.0.4
commonmark==0.9.0
docutils==0.14
future==0.17.1
idna==2.8
imagesize==1.1.0
Jinja2==2.10.1
MarkupSafe==1.1.1
packaging==19.0
Pygments==2.4.2
pyparsing==2.4.0
pytz==2019.1
recommonmark==0.5.0
requests==2.22.0
six==1.12.0
snowballstemmer==1.9.0
Sphinx==2.1.2
sphinx-rtd-theme==0.4.3
sphinxcontrib-applehelp==1.0.1
sphinxcontrib-devhelp==1.0.1
sphinxcontrib-htmlhelp==1.0.2
sphinxcontrib-jsmath==1.0.1
sphinxcontrib-qthelp==1.0.2
sphinxcontrib-serializinghtml==1.1.3
urllib3==1.25.3
sphinx-markdown-tables==0.0.9

View File

@ -9,4 +9,8 @@
.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
color: #6670FF;
}
.highlight .gp {
color: #FB8D68;
}

View File

@ -1,4 +1,111 @@
huggingface.css
/* Our DOM objects */
/* Colab dropdown */
table.center-aligned-table td {
text-align: center;
}
table.center-aligned-table th {
text-align: center;
vertical-align: middle;
}
.colab-dropdown {
position: relative;
display: inline-block;
}
.colab-dropdown-content {
display: none;
position: absolute;
background-color: #f9f9f9;
min-width: 117px;
box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
z-index: 1;
}
.colab-dropdown-content button {
color: #6670FF;
background-color: #f9f9f9;
font-size: 12px;
border: none;
min-width: 117px;
padding: 5px 5px;
text-decoration: none;
display: block;
}
.colab-dropdown-content button:hover {background-color: #eee;}
.colab-dropdown:hover .colab-dropdown-content {display: block;}
/* Version control */
.version-button {
background-color: #6670FF;
color: white;
border: none;
padding: 5px;
font-size: 15px;
cursor: pointer;
}
.version-button:hover, .version-button:focus {
background-color: #A6B0FF;
}
.version-dropdown {
display: none;
background-color: #6670FF;
min-width: 160px;
overflow: auto;
font-size: 15px;
}
.version-dropdown a {
color: white;
padding: 3px 4px;
text-decoration: none;
display: block;
}
.version-dropdown a:hover {
background-color: #A6B0FF;
}
.version-show {
display: block;
}
/* Framework selector */
.framework-selector {
display: flex;
flex-direction: row;
justify-content: flex-end;
margin-right: 30px;
}
.framework-selector > button {
background-color: white;
color: #6670FF;
border: 1px solid #6670FF;
padding: 5px;
}
.framework-selector > button.selected{
background-color: #6670FF;
color: white;
border: 1px solid #6670FF;
padding: 5px;
}
/* Copy button */
a.copybtn {
margin: 3px;
}
/* The literal code blocks */
.rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
@ -18,6 +125,7 @@ huggingface.css
/* The research field on top of the toc tree */
.wy-side-nav-search{
padding-top: 0;
background-color: #6670FF;
}
@ -26,6 +134,12 @@ huggingface.css
background-color: #6670FF;
}
/* The section headers in the toc tree */
.wy-menu-vertical p.caption{
background-color: #4d59ff;
line-height: 40px;
}
/* The selected items in the toc tree */
.wy-menu-vertical li.current{
background-color: #A6B0FF;
@ -44,11 +158,11 @@ huggingface.css
/* The text items on the toc tree */
.wy-menu-vertical a {
color: #FFFFDD;
font-family: Calibre-Light;
font-family: Calibre-Light, sans-serif;
}
.wy-menu-vertical header, .wy-menu-vertical p.caption{
color: white;
font-family: Calibre-Light;
font-family: Calibre-Light, sans-serif;
}
/* The color inside the selected toc tree block */
@ -85,7 +199,7 @@ a {
border-right: solid 2px #FB8D68;
border-left: solid 2px #FB8D68;
color: #FB8D68;
font-family: Calibre-Light;
font-family: Calibre-Light, sans-serif;
border-top: none;
font-style: normal !important;
}
@ -136,14 +250,14 @@ a {
/* class and method names in doc */
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
font-family: Calibre;
font-family: Calibre, sans-serif;
font-size: 20px !important;
}
/* class name in doc*/
.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
margin-right: 10px;
font-family: Calibre-Medium;
font-family: Calibre-Medium, sans-serif;
}
/* Method and class parameters */
@ -160,17 +274,17 @@ a {
/* FONTS */
body{
font-family: Calibre;
font-family: Calibre, sans-serif;
font-size: 16px;
}
h1 {
font-family: Calibre-Thin;
font-family: Calibre-Thin, sans-serif;
font-size: 70px;
}
h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
font-family: Calibre-Medium;
font-family: Calibre-Medium, sans-serif;
}
@font-face {
@ -197,3 +311,40 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
font-weight:400;
}
/**
* Nav Links to other parts of huggingface.co
*/
div.menu {
position: absolute;
top: 0;
right: 0;
padding-top: 20px;
padding-right: 20px;
z-index: 1000;
}
div.menu a {
font-size: 14px;
letter-spacing: 0.3px;
text-transform: uppercase;
color: white;
-webkit-font-smoothing: antialiased;
background: linear-gradient(0deg, #6671ffb8, #9a66ffb8 50%);
padding: 10px 16px 6px 16px;
border-radius: 3px;
margin-left: 12px;
position: relative;
}
div.menu a:active {
top: 1px;
}
@media (min-width: 768px) and (max-width: 1750px) {
.wy-breadcrumbs {
margin-top: 32px;
}
}
@media (max-width: 768px) {
div.menu {
display: none;
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

Binary image changed (before: 14 KiB, after: 7.6 KiB)

docs/source/benchmarks.rst (new file, 361 lines)
View File

@ -0,0 +1,361 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Benchmarks
=======================================================================================================================
Let's take a look at how 🤗 Transformer models can be benchmarked, at some best practices, and at the benchmarks that are already available.
A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found :prefix_link:`here
<notebooks/05-benchmark.ipynb>`.
How to benchmark 🤗 Transformer models
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow flexible
benchmarking of 🤗 Transformer models. The benchmark classes let us measure the `peak memory usage` and `required time`
for both `inference` and `training`.
.. note::
Here, `inference` is defined as a single forward pass, and `training` as a single forward pass and
backward pass.
The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an
object of type :class:`~transformers.PyTorchBenchmarkArguments` and
:class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation.
:class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data
classes and contain all relevant configurations for their corresponding benchmark class. The following example shows
how a BERT model of type `bert-base-uncased` can be benchmarked.
.. code-block::
>>> ## PYTORCH CODE
>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
>>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
>>> benchmark = PyTorchBenchmark(args)
>>> ## TENSORFLOW CODE
>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
>>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
>>> benchmark = TensorFlowBenchmark(args)
Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and
``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the
`model hub <https://huggingface.co/models>`__. The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define
the size of the ``input_ids`` on which the model is benchmarked. There are many more parameters that can be configured
via the benchmark argument data classes. For more detail on these, one can directly consult the files
``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch)
and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow). Alternatively, running the following shell
commands from the repository root will print out a descriptive list of all configurable parameters for PyTorch and
Tensorflow respectively.
.. code-block:: bash
## PYTORCH CODE
python examples/benchmarking/run_benchmark.py --help
## TENSORFLOW CODE
python examples/benchmarking/run_benchmark_tf.py --help
An instantiated benchmark object can then simply be run by calling ``benchmark.run()``.
.. code-block::
>>> ## PYTORCH CODE
>>> results = benchmark.run()
>>> print(results)
==================== INFERENCE - SPEED - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Time in s
--------------------------------------------------------------------------------
bert-base-uncased 8 8 0.006
bert-base-uncased 8 32 0.006
bert-base-uncased 8 128 0.018
bert-base-uncased 8 512 0.088
--------------------------------------------------------------------------------
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
--------------------------------------------------------------------------------
bert-base-uncased 8 8 1227
bert-base-uncased 8 32 1281
bert-base-uncased 8 128 1307
bert-base-uncased 8 512 1539
--------------------------------------------------------------------------------
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: PyTorch
- use_torchscript: False
- framework_version: 1.4.0
- python_version: 3.6.10
- system: Linux
- cpu: x86_64
- architecture: 64bit
- date: 2020-06-29
- time: 08:58:43.371351
- fp16: False
- use_multiprocessing: True
- only_pretrain_model: False
- cpu_ram_mb: 32088
- use_gpu: True
- num_gpus: 1
- gpu: TITAN RTX
- gpu_ram_mb: 24217
- gpu_power_watts: 280.0
- gpu_performance_state: 2
- use_tpu: False
>>> ## TENSORFLOW CODE
>>> results = benchmark.run()
>>> print(results)
==================== INFERENCE - SPEED - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Time in s
--------------------------------------------------------------------------------
bert-base-uncased 8 8 0.005
bert-base-uncased 8 32 0.008
bert-base-uncased 8 128 0.022
bert-base-uncased 8 512 0.105
--------------------------------------------------------------------------------
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
--------------------------------------------------------------------------------
bert-base-uncased 8 8 1330
bert-base-uncased 8 32 1330
bert-base-uncased 8 128 1330
bert-base-uncased 8 512 1770
--------------------------------------------------------------------------------
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: Tensorflow
- use_xla: False
- framework_version: 2.2.0
- python_version: 3.6.10
- system: Linux
- cpu: x86_64
- architecture: 64bit
- date: 2020-06-29
- time: 09:26:35.617317
- fp16: False
- use_multiprocessing: True
- only_pretrain_model: False
- cpu_ram_mb: 32088
- use_gpu: True
- num_gpus: 1
- gpu: TITAN RTX
- gpu_ram_mb: 24217
- gpu_power_watts: 280.0
- gpu_performance_state: 2
- use_tpu: False
By default, the `time` and the `required memory` for `inference` are benchmarked. In the example output above the first
two sections show the results corresponding to `inference time` and `inference memory`. In addition, all relevant
information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc., is printed
out in the third section under `ENVIRONMENT INFORMATION`. This information can optionally be saved in a `.csv` file
when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and
:class:`~transformers.TensorFlowBenchmarkArguments` respectively. In this case, every section is saved in a separate
`.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
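As a minimal sketch (reusing the arguments from above; :obj:`save_to_csv` is the only addition, and the default output file locations can be checked in ``benchmark_args_utils.py``):

.. code-block::

    >>> ## PYTORCH CODE
    >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
    >>> args = PyTorchBenchmarkArguments(
    ...     models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32], save_to_csv=True
    ... )
    >>> benchmark = PyTorchBenchmark(args)
    >>> results = benchmark.run()  # each result section is also written to its own .csv file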
Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can
alternatively benchmark an arbitrary configuration of any available model class. In this case, a :obj:`list` of
configurations must be inserted with the benchmark args as follows.
.. code-block::
>>> ## PYTORCH CODE
>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig
>>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
>>> config_base = BertConfig()
>>> config_384_hid = BertConfig(hidden_size=384)
>>> config_6_lay = BertConfig(num_hidden_layers=6)
>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
>>> benchmark.run()
==================== INFERENCE - SPEED - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Time in s
--------------------------------------------------------------------------------
bert-base 8 8 0.006
bert-base 8 32 0.006
bert-base 8 128 0.018
bert-base 8 512 0.088
bert-384-hid 8 8 0.006
bert-384-hid 8 32 0.006
bert-384-hid 8 128 0.011
bert-384-hid 8 512 0.054
bert-6-lay 8 8 0.003
bert-6-lay 8 32 0.004
bert-6-lay 8 128 0.009
bert-6-lay 8 512 0.044
--------------------------------------------------------------------------------
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
--------------------------------------------------------------------------------
bert-base 8 8 1277
bert-base 8 32 1281
bert-base 8 128 1307
bert-base 8 512 1539
bert-384-hid 8 8 1005
bert-384-hid 8 32 1027
bert-384-hid 8 128 1035
bert-384-hid 8 512 1255
bert-6-lay 8 8 1097
bert-6-lay 8 32 1101
bert-6-lay 8 128 1127
bert-6-lay 8 512 1359
--------------------------------------------------------------------------------
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: PyTorch
- use_torchscript: False
- framework_version: 1.4.0
- python_version: 3.6.10
- system: Linux
- cpu: x86_64
- architecture: 64bit
- date: 2020-06-29
- time: 09:35:25.143267
- fp16: False
- use_multiprocessing: True
- only_pretrain_model: False
- cpu_ram_mb: 32088
- use_gpu: True
- num_gpus: 1
- gpu: TITAN RTX
- gpu_ram_mb: 24217
- gpu_power_watts: 280.0
- gpu_performance_state: 2
- use_tpu: False
>>> ## TENSORFLOW CODE
>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig
>>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
>>> config_base = BertConfig()
>>> config_384_hid = BertConfig(hidden_size=384)
>>> config_6_lay = BertConfig(num_hidden_layers=6)
>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
>>> benchmark.run()
==================== INFERENCE - SPEED - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Time in s
--------------------------------------------------------------------------------
bert-base 8 8 0.005
bert-base 8 32 0.008
bert-base 8 128 0.022
bert-base 8 512 0.106
bert-384-hid 8 8 0.005
bert-384-hid 8 32 0.007
bert-384-hid 8 128 0.018
bert-384-hid 8 512 0.064
bert-6-lay 8 8 0.002
bert-6-lay 8 32 0.003
bert-6-lay 8 128 0.011
bert-6-lay 8 512 0.074
--------------------------------------------------------------------------------
==================== INFERENCE - MEMORY - RESULT ====================
--------------------------------------------------------------------------------
Model Name Batch Size Seq Length Memory in MB
--------------------------------------------------------------------------------
bert-base 8 8 1330
bert-base 8 32 1330
bert-base 8 128 1330
bert-base 8 512 1770
bert-384-hid 8 8 1330
bert-384-hid 8 32 1330
bert-384-hid 8 128 1330
bert-384-hid 8 512 1540
bert-6-lay 8 8 1330
bert-6-lay 8 32 1330
bert-6-lay 8 128 1330
bert-6-lay 8 512 1540
--------------------------------------------------------------------------------
==================== ENVIRONMENT INFORMATION ====================
- transformers_version: 2.11.0
- framework: Tensorflow
- use_xla: False
- framework_version: 2.2.0
- python_version: 3.6.10
- system: Linux
- cpu: x86_64
- architecture: 64bit
- date: 2020-06-29
- time: 09:38:15.487125
- fp16: False
- use_multiprocessing: True
- only_pretrain_model: False
- cpu_ram_mb: 32088
- use_gpu: True
- num_gpus: 1
- gpu: TITAN RTX
- gpu_ram_mb: 24217
- gpu_power_watts: 280.0
- gpu_performance_state: 2
- use_tpu: False
Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations
of the :obj:`BertModel` class. This feature can be especially helpful when deciding which configuration the model
should be trained with.
Benchmark best practices
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This section lists a couple of best practices one should be aware of when benchmarking a model.
- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user
specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the
shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate
memory measurement it is recommended to run each memory benchmark in a separate process by making sure
:obj:`no_multi_processing` is set to :obj:`False`.
- One should always state the environment information when sharing the results of a model benchmark. Results can vary
heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very
useful for the community.
Sharing your benchmark
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Previously all available core models (10 at the time) were benchmarked for `inference time`, across many different
settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were
done across CPUs (except for TensorFlow XLA) and GPUs.
The approach is detailed in the `following blogpost
<https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2>`__ and the results are
available `here
<https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.
With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community
:prefix_link:`here <examples/benchmarking/README.md>`.

View File

@ -1,18 +1,38 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BERTology
-----------------------------------------------------------------------------------------------------------------------
There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT
(that some call "BERTology"). Some good examples of this field are:
* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
https://arxiv.org/abs/1905.05950
* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
Manning: https://arxiv.org/abs/1906.04341
In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
help people access the inner representations, mainly adapted from the great work of Paul Michel
(https://arxiv.org/abs/1905.10650):
* accessing all the hidden-states of BERT/GPT/GPT-2,
* accessing all the attention weights for each head of BERT/GPT/GPT-2,
* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
in https://arxiv.org/abs/1905.10650.
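As a minimal sketch of the first two features (the output sizes assume ``bert-base-uncased``, which has 12 layers):

.. code-block::

    >>> from transformers import BertModel, BertTokenizer
    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    >>> model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)
    >>> outputs = model(**tokenizer("Hello world", return_tensors="pt"))
    >>> len(outputs.hidden_states), len(outputs.attentions)  # embeddings + 12 layers, one attention map per layer
    (13, 12)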
To help you understand and use these features, we have added a specific example script: :prefix_link:`bertology.py
<examples/research_projects/bertology/run_bertology.py>` which extracts information from and prunes a model pre-trained
on GLUE.

View File

@ -14,20 +14,23 @@
#
import os
import sys
sys.path.insert(0, os.path.abspath('../../src'))
# -- Project information -----------------------------------------------------
project = u'transformers'
copyright = u'2020, The Hugging Face Team, Licensed under the Apache License, Version 2.0'
author = u'huggingface'
# The short X.Y version
version = u''
# The full version, including alpha/beta/rc tags
release = u'4.2.0'
# Prefix link to point to master, comment this during version release and uncomment below line
extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
# Prefix link to always point to corresponding version, uncomment this during version release
# extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}
# -- General configuration ---------------------------------------------------
@ -40,11 +43,13 @@ release = u'1.2.0'
# ones.
extensions = [
'sphinx.ext.autodoc',
'sphinx.ext.extlinks',
'sphinx.ext.coverage',
'sphinx.ext.napoleon',
'recommonmark',
'sphinx.ext.viewcode',
'sphinx_markdown_tables',
'sphinx_copybutton'
]
# Add any paths that contain templates here, relative to this directory.
@ -74,6 +79,9 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = None
# Remove the prompt when copying examples
copybutton_prompt_text = r">>> |\.\.\. "
copybutton_prompt_is_regexp = True
# -- Options for HTML output -------------------------------------------------
@ -105,6 +113,12 @@ html_static_path = ['_static']
#
# html_sidebars = {}
# This must be the name of an image file (path relative to the configuration
# directory) that is the favicon of the docs. Modern browsers use this as
# the icon for tabs, windows and bookmarks. It should be a Windows-style
# icon file (.ico).
html_favicon = 'favicon.ico'
# -- Options for HTMLHelp output ---------------------------------------------
@ -181,8 +195,8 @@ epub_title = project
epub_exclude_files = ['search.html']
def setup(app):
app.add_css_file('css/huggingface.css')
app.add_css_file('css/code-snippets.css')
app.add_js_file('js/custom.js')
# -- Extension configuration -------------------------------------------------

docs/source/contributing.md (new symbolic link)
View File

@ -0,0 +1 @@
../../CONTRIBUTING.md

View File

@ -1,18 +1,51 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Converting Tensorflow Checkpoints
=======================================================================================================================
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into
models that can be loaded using the ``from_pretrained`` methods of the library.
.. note::
Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
transformers >= 2.3.0 installation.
The documentation below reflects the **transformers-cli convert** command format.
BERT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
<https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the
:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
<src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py
<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ ,
`run_bert_classifier.py
<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and
`run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\
).
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install
tensorflow``\ ). The rest of the repository only requires PyTorch.
Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
@ -20,75 +53,109 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
You can download Google's pre-trained models for the conversion `here
<https://github.com/google-research/bert#pre-trained-models>`__.
ALBERT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
<src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.
The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying
configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you
will need to have TensorFlow and PyTorch installed.
Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
.. code-block:: shell
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
You can download Google's pre-trained models for the conversion `here
<https://github.com/google-research/albert#pre-trained-models>`__.
OpenAI GPT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
is saved in the same format as the OpenAI pretrained model (see `here
<https://github.com/openai/finetune-transformer-lm>`__\ )
.. code-block:: shell
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
OpenAI GPT-2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here
<https://github.com/openai/gpt-2>`__\ ):
.. code-block:: shell
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
Transformer-XL
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here
<https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ ):
.. code-block:: shell
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
transformers-cli convert --model_type transfo_xl \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config TRANSFO_XL_CONFIG] \
[--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
XLNet
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLNet model:
.. code-block:: shell
export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export XLNET_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
--tf_checkpoint $XLNET_CHECKPOINT_PATH \
--config $XLNET_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK]
XLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLM model:
.. code-block:: shell
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config XLM_CONFIG] \
[--finetuning_task_name XLM_FINETUNED_TASK]
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Fine-tuning with custom datasets
=======================================================================================================================
.. note::
The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
<https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here since this tutorial is
meant to illustrate how to work with your own data. A brief introduction can be found at the end of the tutorial in
the section ":ref:`nlplib`".
This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
show examples of reading in several data formats, preprocessing the data for several types of tasks, and then preparing
the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow.
We include several examples, each of which demonstrates a different type of common downstream task:
- :ref:`seq_imdb`
- :ref:`tok_ner`
- :ref:`qa_squad`
- :ref:`resources`
.. _seq_imdb:
Sequence Classification with IMDb Reviews
-----------------------------------------------------------------------------------------------------------------------
.. note::
This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and
can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
Let's start by downloading the dataset from the `Large Movie Review Dataset
<http://ai.stanford.edu/~amaas/data/sentiment/>`_ webpage.
.. code-block:: bash
wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
tar -xf aclImdb_v1.tar.gz
This data is organized into ``pos`` and ``neg`` folders with one text file per example. Let's write a function that can
read this in.
.. code-block:: python
from pathlib import Path
def read_imdb_split(split_dir):
split_dir = Path(split_dir)
texts = []
labels = []
for label_dir in ["pos", "neg"]:
for text_file in (split_dir/label_dir).iterdir():
texts.append(text_file.read_text())
labels.append(0 if label_dir == "neg" else 1)
return texts, labels
train_texts, train_labels = read_imdb_split('aclImdb/train')
test_texts, test_labels = read_imdb_split('aclImdb/test')
We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and
tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:
.. code-block:: python
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
Alright, we've read in our dataset. Now let's tackle tokenization. We'll eventually train a classifier using
pre-trained DistilBert, so let's use the DistilBert tokenizer.
.. code-block:: python
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
Now we can simply pass our texts to the tokenizer. We'll pass ``truncation=True`` and ``padding=True``, which will
ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's
maximum input length. This will allow us to feed batches of sequences into the model at the same time.
.. code-block:: python
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)
Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a
``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input
encodings and labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data
can be easily batched such that each key in the batch encoding corresponds to a named parameter of the
:meth:`~transformers.DistilBertForSequenceClassification.forward` method of the model we will train.
.. code-block:: python
## PYTORCH CODE
import torch
class IMDbDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)
## TENSORFLOW CODE
import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((
dict(train_encodings),
train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
dict(val_encodings),
val_labels
))
test_dataset = tf.data.Dataset.from_tensor_slices((
dict(test_encodings),
test_labels
))
Now that our datasets are ready, we can fine-tune a model either with the 🤗
:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See :doc:`training
<training>`.
.. _ft_trainer:
Fine-tuning with Trainer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The steps above prepared the datasets in the way the trainer expects. Now all we need to do is create a model
to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` and
instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
.. code-block:: python
## PYTORCH CODE
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total number of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset # evaluation dataset
)
trainer.train()
## TENSORFLOW CODE
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
training_args = TFTrainingArguments(
output_dir='./results', # output directory
num_train_epochs=3, # total number of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
with training_args.strategy.scope():
model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = TFTrainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=train_dataset, # training dataset
eval_dataset=val_dataset # evaluation dataset
)
trainer.train()
.. _ft_native:
Fine-tuning with native PyTorch/TensorFlow
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
We can also train with native PyTorch or TensorFlow:
.. code-block:: python
## PYTORCH CODE
import torch
from torch.utils.data import DataLoader
from transformers import DistilBertForSequenceClassification, AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):
for batch in train_loader:
optim.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
labels = batch['labels'].to(device)
outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
loss = outputs[0]
loss.backward()
optim.step()
model.eval()
## TENSORFLOW CODE
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss) # can also use any keras loss fn
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3)  # the batch size is set on the dataset, not in fit
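Since the PyTorch loop above ends with ``model.eval()`` but never actually evaluates, here is a minimal evaluation
sketch reusing the objects defined above (the batch size and the accuracy metric are illustrative choices, not part of
the recipe itself):

.. code-block:: python

    ## PYTORCH CODE
    val_loader = DataLoader(val_dataset, batch_size=64)
    correct, total = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = outputs[0].argmax(dim=-1)  # logits come first when no labels are passed
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    print(correct / total)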
.. _tok_ner:
Token Classification with W-NUT Emerging Entities
-----------------------------------------------------------------------------------------------------------------------
.. note::
This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_),
and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
token. We'll demonstrate how to do this with `Named Entity Recognition
<http://nlpprogress.com/english/named_entity_recognition.html>`_, which involves identifying tokens which correspond to
a predefined set of "entities". Specifically, we'll use the `W-NUT Emerging and Rare entities
<http://noisy-text.github.io/2017/emerging-rare-entities.html>`_ corpus. The data is given as a collection of
pre-tokenized documents where each token is assigned a tag.
Let's start by downloading the data.
.. code-block:: bash
wget http://noisy-text.github.io/2017/files/wnut17train.conll
In this case, we'll just download the train set, which is a single text file. Each line of the file contains either (1)
a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a function to read
this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token strings, and
``tag_docs`` which is a list of lists of tag strings.
.. code-block:: python
from pathlib import Path
import re
def read_wnut(file_path):
file_path = Path(file_path)
raw_text = file_path.read_text().strip()
raw_docs = re.split(r'\n\t?\n', raw_text)
token_docs = []
tag_docs = []
for doc in raw_docs:
tokens = []
tags = []
for line in doc.split('\n'):
token, tag = line.split('\t')
tokens.append(token)
tags.append(tag)
token_docs.append(tokens)
tag_docs.append(tags)
return token_docs, tag_docs
texts, tags = read_wnut('wnut17train.conll')
Just to see what this data looks like, let's take a look at a segment of the first document.
.. code-block:: python
>>> print(texts[0][10:17], tags[0][10:17], sep='\n')
['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']
``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions
of the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
any entity.
Now that we've read the data in, let's create a train/validation split:
.. code-block:: python
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)
Next, let's create encodings for our tokens and tags. For the tags, we can start by just creating a simple mapping
which we'll use in a moment:
.. code-block:: python
unique_tags = set(tag for doc in tags for tag in doc)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing with
ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model to
return information about the tokens which are split by the wordpiece tokenization process, which we will need in a
moment.
.. code-block:: python
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
model below.
Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens in
the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in the
vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens ``['@',
'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer splits a
token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in 🤗
Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
``@huggingface`` is ``3`` (indexing ``B-corporation``), we would set the labels of ``['@', 'hugging', '##face']`` to
``[3, -100, -100]``.
Let's write a function to do this. This is where we will use the ``offset_mapping`` from the tokenizer as mentioned
above. For each sub-token returned by the tokenizer, the offset mapping gives us a tuple indicating the sub-token's
start position and end position relative to the original token it was split from. That means that if the first position
in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at it, we can
also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must be a
special token like ``[PAD]`` or ``[CLS]``.
.. note::
Due to a recently fixed bug, -1 must be used instead of -100 when using TensorFlow in 🤗 Transformers <= 3.02.
.. code-block:: python
import numpy as np
def encode_tags(tags, encodings):
labels = [[tag2id[tag] for tag in doc] for doc in tags]
encoded_labels = []
for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
# create an array filled with -100 (the label index to ignore)
doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
arr_offset = np.array(doc_offset)
# set labels whose first offset position is 0 and the second is not 0
doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
encoded_labels.append(doc_enc_labels.tolist())
return encoded_labels
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
The hard part is now done. Just as in the sequence classification example above, we can create a dataset object:
.. code-block:: python
## PYTORCH CODE
import torch
class WNUTDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = WNUTDataset(train_encodings, train_labels)
val_dataset = WNUTDataset(val_encodings, val_labels)
## TENSORFLOW CODE
import tensorflow as tf
train_encodings.pop("offset_mapping") # we don't want to pass this to the model
val_encodings.pop("offset_mapping")
train_dataset = tf.data.Dataset.from_tensor_slices((
dict(train_encodings),
train_labels
))
val_dataset = tf.data.Dataset.from_tensor_slices((
dict(val_encodings),
val_labels
))
Now load in a token classification model and specify the number of labels:
.. code-block:: python
## PYTORCH CODE
from transformers import DistilBertForTokenClassification
model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
## TENSORFLOW CODE
from transformers import TFDistilBertForTokenClassification
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
The data and model are both ready to go. You can train the model either with
:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow, exactly as in the
sequence classification example above.
- :ref:`ft_trainer`
- :ref:`ft_native`
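For instance, a minimal :class:`~transformers.Trainer` setup mirroring the sequence classification example could look
like the sketch below, reusing the ``model``, ``train_dataset`` and ``val_dataset`` objects defined above (the
hyperparameters are illustrative):

.. code-block:: python

    ## PYTORCH CODE
    from transformers import Trainer, TrainingArguments

    training_args = TrainingArguments(
        output_dir='./results',          # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        weight_decay=0.01,               # strength of weight decay
        logging_dir='./logs',            # directory for storing logs
    )

    trainer = Trainer(
        model=model,                     # the token classification model loaded above
        args=training_args,
        train_dataset=train_dataset,     # WNUTDataset defined above
        eval_dataset=val_dataset,
    )
    trainer.train()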
.. _qa_squad:
Question Answering with SQuAD 2.0
-----------------------------------------------------------------------------------------------------------------------
.. note::
This dataset can be explored in the Hugging Face model hub (`SQuAD V2
<https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 NLP library with
``load_dataset("squad_v2")``.
Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
involves answering a question about a passage by highlighting the segment of the passage that answers the question.
This involves fine-tuning a model which predicts a start position and an end position in the passage. We will use the
`Stanford Question Answering Dataset (SQuAD) 2.0 <https://rajpurkar.github.io/SQuAD-explorer/>`_.
We will start by downloading the data:
.. code-block:: bash
mkdir squad
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json
Each split is in a structured json file with a number of questions and answers for each passage (or context). We'll
take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated since
there are multiple questions per context):
.. code-block:: python
import json
from pathlib import Path
def read_squad(path):
path = Path(path)
with open(path, 'rb') as f:
squad_dict = json.load(f)
contexts = []
questions = []
answers = []
for group in squad_dict['data']:
for passage in group['paragraphs']:
context = passage['context']
for qa in passage['qas']:
question = qa['question']
for answer in qa['answers']:
contexts.append(context)
questions.append(question)
answers.append(answer)
return contexts, questions, answers
train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with the
correct answer as well as an integer indicating the character at which the answer begins. In order to train a model on
this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token* positions the
answer begins and ends.
First, let's get the *character* position at which the answer ends in the passage (we are given the starting position).
Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.
.. code-block:: python
def add_end_idx(answers, contexts):
for answer, context in zip(answers, contexts):
gold_text = answer['text']
start_idx = answer['answer_start']
end_idx = start_idx + len(gold_text)
# sometimes squad answers are off by a character or two; fix this
if context[start_idx:end_idx] == gold_text:
answer['answer_end'] = end_idx
elif context[start_idx-1:end_idx-1] == gold_text:
answer['answer_start'] = start_idx - 1
answer['answer_end'] = end_idx - 1 # When the gold label is off by one character
elif context[start_idx-2:end_idx-2] == gold_text:
answer['answer_start'] = start_idx - 2
answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters
add_end_idx(train_answers, train_contexts)
add_end_idx(val_answers, val_contexts)
Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions. Next,
let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode them together
as sequence pairs.
.. code-block:: python
from transformers import DistilBertTokenizerFast
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast Tokenizers,
we can use the built-in :func:`~transformers.BatchEncoding.char_to_token` method.
.. code-block:: python
def add_token_positions(encodings, answers):
start_positions = []
end_positions = []
for i in range(len(answers)):
start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
# if start position is None, the answer passage has been truncated
if start_positions[-1] is None:
start_positions[-1] = tokenizer.model_max_length
# if end position is None, the 'char_to_token' function points to the space before the correct token -> add + 1
if end_positions[-1] is None:
end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)
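As a quick, optional sanity check (with a hypothetical example index ``i``), we can decode the tokens between a start
and end position and compare them to the annotated answer text:

.. code-block:: python

    i = 0  # hypothetical example index
    start = train_encodings['start_positions'][i]
    end = train_encodings['end_positions'][i]
    print(tokenizer.decode(train_encodings['input_ids'][i][start:end + 1]))
    print(train_answers[i]['text'])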
Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for training. In
PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of ``(inputs_dict, labels_dict)`` to the
``from_tensor_slices`` method.
.. code-block:: python
## PYTORCH CODE
import torch
class SquadDataset(torch.utils.data.Dataset):
def __init__(self, encodings):
self.encodings = encodings
def __getitem__(self, idx):
return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
def __len__(self):
return len(self.encodings.input_ids)
train_dataset = SquadDataset(train_encodings)
val_dataset = SquadDataset(val_encodings)
## TENSORFLOW CODE
import tensorflow as tf
train_dataset = tf.data.Dataset.from_tensor_slices((
{key: train_encodings[key] for key in ['input_ids', 'attention_mask']},
{key: train_encodings[key] for key in ['start_positions', 'end_positions']}
))
val_dataset = tf.data.Dataset.from_tensor_slices((
{key: val_encodings[key] for key in ['input_ids', 'attention_mask']},
{key: val_encodings[key] for key in ['start_positions', 'end_positions']}
))
Now we can use a DistilBert model with a QA head for training:
.. code-block:: python
## PYTORCH CODE
from transformers import DistilBertForQuestionAnswering
model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
## TENSORFLOW CODE
from transformers import TFDistilBertForQuestionAnswering
model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
The data and model are both ready to go. You can train the model with
:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` exactly as in the sequence classification example
above. If using native PyTorch, replace ``labels`` with ``start_positions`` and ``end_positions`` in the training
example. If using Keras's ``fit``, we need to make a minor modification to handle this example since it involves
multiple model outputs.
- :ref:`ft_trainer`
.. code-block:: python
## PYTORCH CODE
import torch
from torch.utils.data import DataLoader
from transformers import AdamW
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
optim = AdamW(model.parameters(), lr=5e-5)
for epoch in range(3):
for batch in train_loader:
optim.zero_grad()
input_ids = batch['input_ids'].to(device)
attention_mask = batch['attention_mask'].to(device)
start_positions = batch['start_positions'].to(device)
end_positions = batch['end_positions'].to(device)
outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
loss = outputs[0]
loss.backward()
optim.step()
model.eval()
## TENSORFLOW CODE
import tensorflow as tf
# Keras will expect a tuple when dealing with labels
train_dataset = train_dataset.map(lambda x, y: (x, (y['start_positions'], y['end_positions'])))
# Keras will assign a separate loss for each output and add them together. So we'll just use the standard CE loss
# instead of using the built-in model.compute_loss, which expects a dict of outputs and averages the two terms.
# Note that this means the loss will be 2x what it would be when using TFTrainer, since we're adding instead of averaging.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.distilbert.return_dict = False # if using 🤗 Transformers >3.02, make sure outputs are tuples
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=loss) # can also use any keras loss fn
model.fit(train_dataset.shuffle(1000).batch(16), epochs=3)  # the batch size is set on the dataset, not in fit
.. _resources:
Additional Resources
-----------------------------------------------------------------------------------------------------------------------
- `How to train a new language model from scratch using Transformers and Tokenizers
<https://huggingface.co/blog/how-to-train>`_. Blog post showing the steps to load in Esperanto data and train a
masked language model from scratch.
- :doc:`Preprocessing <preprocessing>`. Docs page on data preprocessing.
- :doc:`Training <training>`. Docs page on training and fine-tuning.
.. _nlplib:
Using the 🤗 NLP Datasets & Metrics library
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the `hub
<https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview, we
will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.
Start by downloading the dataset:
.. code-block:: python
from nlp import load_dataset
train = load_dataset("imdb", split="train")
Each dataset has multiple columns corresponding to different features. Let's see what our columns are.
.. code-block:: python
>>> print(train.column_names)
['label', 'text']
Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column to
``labels`` to match the model's input arguments.
.. code-block:: python
train = train.map(lambda batch: tokenizer(batch["text"], truncation=True, padding=True), batched=True)
train.rename_column_("label", "labels")
Lastly, we can use the ``set_format`` method to determine which columns, and in what data format, we want to access
dataset elements.
.. code-block:: python
## PYTORCH CODE
>>> train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
>>> {key: val.shape for key, val in train[0].items()}
{'labels': torch.Size([]), 'input_ids': torch.Size([512]), 'attention_mask': torch.Size([512])}
## TENSORFLOW CODE
>>> train.set_format("tensorflow", columns=["input_ids", "attention_mask", "labels"])
>>> {key: val.shape for key, val in train[0].items()}
{'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}
We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for a
more thorough introduction.
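As a final illustration, this formatted dataset plugs straight into the native PyTorch training loop from the first
example. A minimal sketch, with the model and hyperparameters taken from that example:

.. code-block:: python

    ## PYTORCH CODE
    import torch
    from torch.utils.data import DataLoader
    from transformers import AdamW, DistilBertForSequenceClassification

    train_loader = DataLoader(train, batch_size=16, shuffle=True)  # the formatted dataset from above
    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
    optim = AdamW(model.parameters(), lr=5e-5)

    model.train()
    for batch in train_loader:
        optim.zero_grad()
        outputs = model(batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs[0]
        loss.backward()
        optim.step()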
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Glossary
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
General terms
-----------------------------------------------------------------------------------------------------------------------
- autoencoding models: see MLM
- autoregressive models: see CLM
- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
tokens at a certain timestep.
- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
by masking some tokens randomly, and has to predict the original text.
- multimodal: a task that combines texts with another kind of inputs (for instance images).
- NLG: natural language generation, all tasks related to generating text (for instance talking with transformers,
translation)
- NLP: natural language processing, a generic way to say "deal with texts".
- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
the whole text, individual words)
- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
masking some words and trying to predict them (see MLM).
- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
- token: a part of a sentence, usually a word, but can also be a subword (uncommon words are often split into
subwords) or a punctuation symbol.
Model inputs
-----------------------------------------------------------------------------------------------------------------------
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.
.. _input-ids:
Input IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence = "A Titan RTX has 24GB of VRAM"
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
.. code-block::
>>> tokenized_sequence = tokenizer.tokenize(sequence)
The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash
prefix is added for "RA" and "M":
.. code-block::
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
the sentence to the tokenizer, which leverages the Rust implementation of `huggingface/tokenizers
<https://github.com/huggingface/tokenizers>`__ for peak performance.
.. code-block::
>>> inputs = tokenizer(sequence)
The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":
.. code-block::
>>> encoded_sequence = inputs["input_ids"]
>>> print(encoded_sequence)
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
IDs the model sometimes uses.
If we decode the previous sequence of ids,
.. code-block::
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
we will see
.. code-block::
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
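If for some reason we don't want the special tokens, we can ask the tokenizer to skip them (shown purely for
illustration, via the ``add_special_tokens`` argument):

.. code-block::

    >>> encoded_sequence_no_special = tokenizer(sequence, add_special_tokens=False)["input_ids"]
    >>> print(tokenizer.decode(encoded_sequence_no_special))
    A Titan RTX has 24GB of VRAM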
.. _attention-mask:
Attention mask
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The attention mask is an optional argument used when batching sequences together. This argument indicates to the model
which tokens should be attended to, and which should not.
For example, consider these two sequences:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "This is a short sequence."
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
The encoded versions have different lengths:
.. code-block::
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
of the second one, or the second one needs to be truncated down to the length of the first one.
In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
it to pad like this:
.. code-block::
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
.. code-block::
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
position of the padded indices so that the model does not attend to them. For the :class:`~transformers.BertTokenizer`,
:obj:`1` indicates a value that should be attended to, while :obj:`0` indicates a padded value. This attention mask is
in the dictionary returned by the tokenizer under the key "attention_mask":
.. code-block::
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
.. _token-type-ids:
Token Type IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some models' purpose is to do sequence classification or question answering. These require two different sequences to
be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the
classifier (``[CLS]``) and separator (``[SEP]``) tokens. For example, the BERT model builds its two sequence input as
such:
.. code-block::
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
arguments (and not a list, like before) like this:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "HuggingFace is based in NYC"
>>> sequence_b = "Where is HuggingFace based?"
>>> encoded_dict = tokenizer(sequence_a, sequence_b)
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
which will return:
.. code-block::
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
This is enough for some models to understand where one sequence ends and where another begins. However, other models,
such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
the two types of sequence in the model.
The tokenizer returns this mask as the "token_type_ids" entry:
.. code-block::
>>> encoded_dict['token_type_ids']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
The first sequence, the "context" used for the question, has all its tokens represented by a :obj:`0`, whereas the
second sequence, corresponding to the "question", has all its tokens represented by a :obj:`1`.
Some models, like :class:`~transformers.XLNetModel`, use an additional token represented by a :obj:`2`.
.. _position-ids:
Position IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
the list of tokens.
They are an optional parameter. If no ``position_ids`` are passed to the model, the IDs are automatically created as
absolute positional embeddings.
Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
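As an illustration, explicit position IDs can also be passed alongside the input IDs; the sketch below simply
reproduces the default absolute positions for a BERT model:

.. code-block::

    >>> import torch
    >>> from transformers import BertModel, BertTokenizer

    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    >>> model = BertModel.from_pretrained("bert-base-cased")

    >>> inputs = tokenizer("A Titan RTX has 24GB of VRAM", return_tensors="pt")
    >>> # equivalent to what the model would create automatically if position_ids were omitted
    >>> position_ids = torch.arange(inputs["input_ids"].size(1)).unsqueeze(0)
    >>> outputs = model(**inputs, position_ids=position_ids)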
.. _labels:
Labels
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
predictions and the expected value (the label).
These labels are different according to the model head, for example:
- For sequence classification models (e.g., :class:`~transformers.BertForSequenceClassification`), the model expects a
tensor of dimension :obj:`(batch_size)` with each value of the batch corresponding to the expected label of the
entire sequence.
- For token classification models (e.g., :class:`~transformers.BertForTokenClassification`), the model expects a tensor
of dimension :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual
token.
- For masked language modeling (e.g., :class:`~transformers.BertForMaskedLM`), the model expects a tensor of dimension
:obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
- For sequence to sequence tasks (e.g., :class:`~transformers.BartForConditionalGeneration`,
:class:`~transformers.MBartForConditionalGeneration`), the model expects a tensor of dimension :obj:`(batch_size,
tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
training, both `BART` and `T5` will make the appropriate `decoder_input_ids` and decoder attention masks internally.
They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
the documentation of each model for more information on each specific model's labels.
The base models (e.g., :class:`~transformers.BertModel`) do not accept labels, as these are the base transformer
models, simply outputting features.
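For instance, here is a minimal sketch of passing labels to a sequence classification model so that it computes the
loss itself (the label value is arbitrary):

.. code-block::

    >>> import torch
    >>> from transformers import BertForSequenceClassification, BertTokenizer

    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    >>> model = BertForSequenceClassification.from_pretrained("bert-base-cased")

    >>> inputs = tokenizer("A Titan RTX has 24GB of VRAM", return_tensors="pt")
    >>> labels = torch.tensor([1])  # one expected label per sequence in the batch
    >>> outputs = model(**inputs, labels=labels)
    >>> loss = outputs[0]  # the loss comes first when labels are provided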
.. _decoder-input-ids:
Decoder input IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
way specific to each model.
Most encoder-decoder models (BART, T5) create their :obj:`decoder_input_ids` on their own from the :obj:`labels`. In
such models, passing the :obj:`labels` is the preferred way to handle training.
Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
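A minimal sketch with BART (the checkpoint name is only illustrative), letting the model build its own
:obj:`decoder_input_ids` from the :obj:`labels`:

.. code-block::

    >>> from transformers import BartForConditionalGeneration, BartTokenizer

    >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

    >>> article = "PG&E stated it scheduled the blackouts in response to forecasts for high winds."
    >>> summary = "PG&E scheduled the blackouts."
    >>> inputs = tokenizer(article, return_tensors="pt")
    >>> labels = tokenizer(summary, return_tensors="pt")["input_ids"]

    >>> # no decoder_input_ids are passed: the model creates them by shifting the labels
    >>> outputs = model(input_ids=inputs["input_ids"], labels=labels)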
.. _feed-forward-chunking:
Feed Forward Chunking
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
``bert-base-uncased``).
For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n =
sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically
**equivalent** result.
For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.
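As an illustration, the chunk size is exposed on the configuration as :obj:`chunk_size_feed_forward` (assuming a model
whose layers call :func:`~.transformers.apply_chunking_to_forward`, such as BERT or Reformer):

.. code-block::

    >>> from transformers import BertConfig, BertModel

    >>> # chunk_size_feed_forward=0 (the default) disables chunking; a positive value
    >>> # computes that many output embeddings at a time to trade time for memory
    >>> config = BertConfig(chunk_size_feed_forward=64)
    >>> model = BertModel(config)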
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Transformers
=======================================================================================================================
State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0.
🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
Language Generation (NLG) with over 32 pretrained models in 100+ languages and deep interoperability between
TensorFlow 2.0 and PyTorch.
This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`_.
Features
-----------------------------------------------------------------------------------------------------------------------
- As easy to use as pytorch-transformers
- As powerful and concise as Keras
- High performance on NLU and NLG tasks
- Low barrier to entry for educators and practitioners
State-of-the-art NLP for everyone:
- Deep learning researchers
- Hands-on practitioners
- AI/ML/NLP teachers and educators
Lower compute costs, smaller carbon footprint:
- Researchers can share trained models instead of always retraining
- Practitioners can reduce compute time and production costs
- 8 architectures with over 30 pretrained models, some in more than 100 languages
Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code
- Deep interoperability between TensorFlow 2.0 and PyTorch models
- Move a single model between TF2.0/PyTorch frameworks at will
- Seamlessly pick the right framework for training, evaluation, production
Experimental support for Flax with a few models right now, expected to grow in the coming months.
`All the model checkpoints <https://huggingface.co/models>`__ are seamlessly integrated from the huggingface.co `model
hub <https://huggingface.co>`__ where they are uploaded directly by `users <https://huggingface.co/users>`__ and
`organizations <https://huggingface.co/organizations>`__.
Current number of checkpoints: |checkpoints|
.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen
Contents
-----------------------------------------------------------------------------------------------------------------------
The documentation is organized in five parts:
- **GET STARTED** contains a quick tour, the installation instructions and some useful information about our philosophy
and a glossary.
- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
transformers models.
- The last three sections contain the documentation of each public class and function, grouped in:
- **MAIN CLASSES** for the main classes exposing the important APIs of the library.
- **MODELS** for the classes and functions related to each model implemented in the library.
- **INTERNAL HELPERS** for the classes and functions we use internally.
The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
and conversion utilities for the following models:
..
This list is updated automatically from the README with `make fix-copies`. Do not update manually!
1. :doc:`ALBERT <model_doc/albert>` (from Google Research and the Toyota Technological Institute at Chicago) released
with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
<https://arxiv.org/abs/1909.11942>`__, by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush
Sharma, Radu Soricut.
2. :doc:`BART <model_doc/bart>` (from Facebook) released with the paper `BART: Denoising Sequence-to-Sequence
Pre-training for Natural Language Generation, Translation, and Comprehension
<https://arxiv.org/pdf/1910.13461.pdf>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
Tixier, Michalis Vazirgiannis.
4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
Kenton Lee and Kristina Toutanova.
5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
Narayan, Aliaksei Severyn.
6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
8. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
9. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
Lav R. Varshney, Caiming Xiong and Richard Socher.
10. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
Weizhu Chen.
11. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
12. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
`DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
version of DistilBERT.
13. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
14. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
15. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
16. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
17. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
and Ilya Sutskever.
18. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
Luan, Dario Amodei** and Ilya Sutskever**.
19. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
20. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
<https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
21. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
22. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
by Hao Tan and Mohit Bansal.
23. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
Translator Team.
24. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
25. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
Jianfeng Lu, Tie-Yan Liu.
26. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
27. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
Mohammad Saleh and Peter J. Liu.
28. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
29. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
30. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper `RoBERTa: A Robustly
Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal,
Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
31. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
Krishna, and Kurt W. Keutzer.
32. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
33. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
34. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
35. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
36. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
37. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
Zettlemoyer and Veselin Stoyanov.
38. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
.. _bigtable:
The table below represents the current support in the library for each of those models, whether they have a Python
tokenizer (called "slow"), a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in
PyTorch, TensorFlow and/or Flax.
..
This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
.. rst-class:: center-aligned-table
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
+=============================+================+================+=================+====================+==============+
| ALBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BART | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BERT | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Bert Generation | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BlenderbotSmall | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DPR | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DeBERTa | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| DistilBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ELECTRA | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| FairSeq Machine-Translation | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| FlauBERT | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Funnel Transformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LED | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LayoutLM | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Longformer | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Marian | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Pegasus | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RAG | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| mBART | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| mT5 | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
.. toctree::
:maxdepth: 2
:caption: Get started
quicktour
installation
philosophy
glossary
.. toctree::
:maxdepth: 2
:caption: Using 🤗 Transformers
task_summary
model_summary
preprocessing
training
model_sharing
tokenizer_summary
multilingual
.. toctree::
:maxdepth: 2
:caption: Advanced guides
pretrained_models
examples
custom_datasets
notebooks
converting_tensorflow_models
migration
contributing
testing
serialization
.. toctree::
:maxdepth: 2
:caption: Research
bertology
torchscript
perplexity
benchmarks
.. toctree::
:maxdepth: 2
:caption: Main Classes
main_classes/callback
main_classes/configuration
main_classes/logging
main_classes/model
main_classes/optimizer_schedules
main_classes/output
main_classes/pipelines
main_classes/processors
main_classes/tokenizer
main_classes/trainer
.. toctree::
:maxdepth: 2
:caption: Models
model_doc/albert
model_doc/auto
model_doc/bart
model_doc/barthez
model_doc/bert
model_doc/bertweet
model_doc/bertgeneration
model_doc/blenderbot
model_doc/blenderbot_small
model_doc/camembert
model_doc/ctrl
model_doc/deberta
model_doc/dialogpt
model_doc/distilbert
model_doc/dpr
model_doc/electra
model_doc/encoderdecoder
model_doc/flaubert
model_doc/fsmt
model_doc/funnel
model_doc/herbert
model_doc/layoutlm
model_doc/led
model_doc/longformer
model_doc/lxmert
model_doc/marian
model_doc/mbart
model_doc/mobilebert
model_doc/mpnet
model_doc/mt5
model_doc/gpt
model_doc/gpt2
model_doc/pegasus
model_doc/phobert
model_doc/prophetnet
model_doc/rag
model_doc/reformer
model_doc/retribert
model_doc/roberta
model_doc/squeezebert
model_doc/t5
model_doc/tapas
model_doc/transformerxl
model_doc/xlm
model_doc/xlmprophetnet
model_doc/xlmroberta
model_doc/xlnet
.. toctree::
:maxdepth: 2
:caption: Internal Helpers
internal/modeling_utils
internal/pipelines_utils
internal/tokenization_utils
internal/trainer_utils
internal/generation_utils

docs/source/installation.md Normal file

@ -0,0 +1,138 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Installation
🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
to use and activate it.
Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
must install it from source.
## Installation with pip
First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
[Flax installation page](https://github.com/google/flax#quick-install)
regarding the specific install command for your platform.
When TensorFlow 2.0 and/or PyTorch have been installed, 🤗 Transformers can be installed using pip as follows:
```bash
pip install transformers
```
Alternatively, for CPU-support only, you can install 🤗 Transformers and PyTorch in one line with:
```bash
pip install transformers[torch]
```
or 🤗 Transformers and TensorFlow 2.0 in one line with:
```bash
pip install transformers[tf-cpu]
```
or 🤗 Transformers and Flax in one line with:
```bash
pip install transformers[flax]
```
To check 🤗 Transformers is properly installed, run the following command:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```
It should download a pretrained model then print something like
```bash
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```
(Note that TensorFlow will print additional stuff before that last statement.)
## Installing from source
To install from source, clone the repository and install with the following commands:
```bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
```
Again, you can run
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
```
to check 🤗 Transformers is properly installed.
## With conda
Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
🤗 Transformers can be installed using conda as follows:
```bash
conda install -c huggingface transformers
```
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
## Caching models
This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
Face cache home followed by ``/transformers/``. This is (by order of priority):
* shell environment variable ``HF_HOME``
* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
* default: ``~/.cache/huggingface/``
So if you don't have any specific environment variable set, the cache directory will be at
``~/.cache/huggingface/transformers/``.
**Note:** If you have set a shell environment variable for one of the predecessors of this library
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
environment variable for ``TRANSFORMERS_CACHE``.
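For instance (a minimal sketch; the model name and path below are just illustrations), you can also bypass the cache location for a single call by passing `cache_dir` explicitly:
```python
from transformers import BertModel
# download (or reuse) the weights in an explicit directory instead of the default cache
model = BertModel.from_pretrained("bert-base-uncased", cache_dir="/data/hf-cache")
```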
### Note on model downloads (Continuous Integration or large-scale deployments)
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
faster, and cheaper. Feel free to contact us privately if you need any help.
## Do you want to run a Transformer model on a mobile device?
You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!


@ -1,71 +0,0 @@
Installation
================================================
Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
With pip
^^^^^^^^
PyTorch Transformers can be installed using pip as follows:
.. code-block:: bash
pip install transformers
From source
^^^^^^^^^^^
To install from source, clone the repository and install with:
.. code-block:: bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install [--editable] .
Tests
^^^^^
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
Run all the tests from the root of the cloned repository with the commands:
.. code-block:: bash
python -m pytest -sv ./transformers/tests/
python -m pytest -sv ./examples/
OpenAI GPT original tokenization workflow
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` :
.. code-block:: bash
pip install spacy ftfy==4.4.3
python -m spacy download en
If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
Note on model downloads (Continuous Integration or large-scale deployments)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
Do you want to run a Transformer model on a mobile device?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You should check out our `swift-coreml-transformers <https://github.com/huggingface/swift-coreml-transformers>`_ repo.
It contains an example of a conversion script from a Pytorch trained Transformer model (here, ``GPT-2``) to a CoreML model that runs on iOS devices.
It also contains an implementation of BERT for Question answering.
At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!


@ -0,0 +1,168 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Generation
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by :meth:`~transformers.PreTrainedModel.generate`,
:meth:`~transformers.PreTrainedModel.greedy_search`, :meth:`~transformers.PreTrainedModel.sample`,
:meth:`~transformers.PreTrainedModel.beam_search`, :meth:`~transformers.PreTrainedModel.beam_sample`, and
:meth:`~transformers.PreTrainedModel.group_beam_search`.
Most of those are only useful if you are studying the code of the generate methods in the library.
Generate Outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The output of :meth:`~transformers.PreTrainedModel.generate` is an instance of a subclass of
:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
by :meth:`~transformers.PreTrainedModel.generate`, but that can also be used as a tuple or a dictionary.
Here's an example:
.. code-block::
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`. As we can
see in the documentation of that class below, this means it has the following attributes:
- ``sequences``: the generated sequences of tokens
- ``scores`` (optional): the prediction scores of the language modelling head, for each generation step
- ``hidden_states`` (optional): the hidden states of the model, for each generation step
- ``attentions`` (optional): the attention weights of the model, for each generation step
Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
language modeling head, and ``generation_output.attentions`` is ``None``.
When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
Here, for instance, it has two elements, ``sequences`` then ``scores``, so
.. code-block::
generation_output[:2]
will return the tuple ``(generation_output.sequences, generation_output.scores)``.
When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.
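For instance, continuing the example above (a small sketch to make the indexing concrete):
.. code-block:: python
sequences = generation_output.sequences
# dictionary-style and tuple-style access return the same tensor
assert generation_output["sequences"] is sequences
assert generation_output[0] is sequences
# attributes that were not returned are simply None
assert generation_output.attentions is None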
We document here all output types.
GreedySearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
:members:
SampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
:members:
BeamSearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
:members:
BeamSampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
:members:
LogitsProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
generation.
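As a minimal sketch (the token ids and vocabulary size below are made up), a list of processors can be applied to a
batch of next-token scores directly:
.. code-block:: python
import torch
from transformers import LogitsProcessorList, MinLengthLogitsProcessor
# forbid the EOS token while the current sequence is shorter than min_length
processors = LogitsProcessorList([MinLengthLogitsProcessor(min_length=10, eos_token_id=50256)])
input_ids = torch.tensor([[464, 3290]])  # hypothetical prompt token ids
scores = torch.randn(1, 50257)  # hypothetical next-token logits
scores = processors(input_ids, scores)  # the EOS logit is now -inf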
.. autoclass:: transformers.LogitsProcessor
:members: __call__
.. autoclass:: transformers.LogitsProcessorList
:members: __call__
.. autoclass:: transformers.LogitsWarper
:members: __call__
.. autoclass:: transformers.MinLengthLogitsProcessor
:members: __call__
.. autoclass:: transformers.TemperatureLogitsWarper
:members: __call__
.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
:members: __call__
.. autoclass:: transformers.TopPLogitsWarper
:members: __call__
.. autoclass:: transformers.TopKLogitsWarper
:members: __call__
.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
:members: __call__
.. autoclass:: transformers.NoBadWordsLogitsProcessor
:members: __call__
.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
:members: __call__
.. autoclass:: transformers.HammingDiversityLogitsProcessor
:members: __call__
BeamSearch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BeamScorer
:members: process, finalize
.. autoclass:: transformers.BeamSearchScorer
:members: process, finalize
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.top_k_top_p_filtering
.. autofunction:: transformers.tf_top_k_top_p_filtering


@ -0,0 +1,98 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Custom Layers and Utilities
-----------------------------------------------------------------------------------------------------------------------
This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
Most of those are only useful if you are studying the code of the models in the library.
PyTorch custom modules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_utils.Conv1D
.. autoclass:: transformers.modeling_utils.PoolerStartLogits
:members: forward
.. autoclass:: transformers.modeling_utils.PoolerEndLogits
:members: forward
.. autoclass:: transformers.modeling_utils.PoolerAnswerClass
:members: forward
.. autoclass:: transformers.modeling_utils.SquadHeadOutput
.. autoclass:: transformers.modeling_utils.SQuADHead
:members: forward
.. autoclass:: transformers.modeling_utils.SequenceSummary
:members: forward
PyTorch Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.apply_chunking_to_forward
.. autofunction:: transformers.modeling_utils.find_pruneable_heads_and_indices
.. autofunction:: transformers.modeling_utils.prune_layer
.. autofunction:: transformers.modeling_utils.prune_conv1d_layer
.. autofunction:: transformers.modeling_utils.prune_linear_layer
TensorFlow custom layers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFConv1D
.. autoclass:: transformers.modeling_tf_utils.TFSharedEmbeddings
:members: call
.. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
:members: call
TensorFlow loss functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFMaskedLanguageModelingLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFMultipleChoiceLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFQuestionAnsweringLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFSequenceClassificationLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFTokenClassificationLoss
:members:
TensorFlow Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.modeling_tf_utils.get_initializer
.. autofunction:: transformers.modeling_tf_utils.keras_serializable
.. autofunction:: transformers.modeling_tf_utils.shape_list


@ -0,0 +1,52 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for pipelines
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions the library provides for pipelines.
Most of those are only useful if you are studying the code of the pipelines in the library.
Argument handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.pipelines.ArgumentHandler
.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
Data format
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.pipelines.PipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
:members:
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.pipelines.get_framework
.. autoclass:: transformers.pipelines.PipelineException


@ -0,0 +1,51 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Tokenizers
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by the tokenizers, mainly the class
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
Most of those are only useful if you are studying the code of the tokenizers in the library.
PreTrainedTokenizerBase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
:special-members: __call__
:members:
SpecialTokensMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
:members:
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
.. autoclass:: transformers.tokenization_utils_base.TensorType
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
.. autoclass:: transformers.tokenization_utils_base.CharSpan
.. autoclass:: transformers.tokenization_utils_base.TokenSpan
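For example (a small sketch), the string shortcuts accepted by the tokenizers map to these enums, which can also be
passed directly:
.. code-block:: python
from transformers import BertTokenizer
from transformers.tokenization_utils_base import PaddingStrategy, TensorType
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# equivalent to padding="max_length", return_tensors="pt"
encoded = tokenizer("Hello world", padding=PaddingStrategy.MAX_LENGTH, max_length=8, return_tensors=TensorType.PYTORCH)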


@ -0,0 +1,48 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Trainer
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by :class:`~transformers.Trainer`.
Most of those are only useful if you are studying the code of the Trainer in the library.
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.EvalPrediction
.. autoclass:: transformers.EvaluationStrategy
.. autofunction:: transformers.set_seed
.. autofunction:: transformers.torch_distributed_zero_first
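For example (a minimal sketch), fixing all the relevant random seeds before training is a one-liner:
.. code-block:: python
from transformers import set_seed
set_seed(42)  # seeds the random, numpy and torch generators in one call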
Callbacks internals
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.trainer_callback.CallbackHandler
Distributed Evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
:members:
Trainer Argument Parser
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.HfArgumentParser


@ -0,0 +1,89 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Callbacks
-----------------------------------------------------------------------------------------------------------------------
Callbacks are objects that can customize the behavior of the training loop in the PyTorch
:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow). They can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and make decisions (like early
stopping).
Callbacks are "read only" pieces of code: apart from the :class:`~transformers.TrainerControl` object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).
By default a :class:`~transformers.Trainer` will use the following callbacks:
- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
it's the second one).
- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
or tensorboardX).
- :class:`~transformers.integrations.WandbCallback` if `wandb <https://www.wandb.com/>`__ is installed.
- :class:`~transformers.integrations.CometCallback` if `comet_ml <https://www.comet.ml/site/>`__ is installed.
- :class:`~transformers.integrations.MLflowCallback` if `mlflow <https://www.mlflow.org/>`__ is installed.
- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk <https://pypi.org/project/azureml-sdk/>`__ is
installed.
The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
:class:`~transformers.TrainerControl`.
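As a minimal sketch (this callback is illustrative, not part of the library), a custom callback only overrides the
events it cares about:
.. code-block:: python
from transformers import TrainerCallback
class PrintEpochCallback(TrainerCallback):
    """Illustrative callback that logs the epoch number at the end of every epoch."""
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Finished epoch {state.epoch}")
It would then be passed to the trainer, e.g. ``Trainer(..., callbacks=[PrintEpochCallback()])``.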
Available Callbacks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Here is the list of the available :class:`~transformers.TrainerCallback` in the library:
.. autoclass:: transformers.integrations.CometCallback
:members: setup
.. autoclass:: transformers.DefaultFlowCallback
.. autoclass:: transformers.PrinterCallback
.. autoclass:: transformers.ProgressCallback
.. autoclass:: transformers.EarlyStoppingCallback
.. autoclass:: transformers.integrations.TensorBoardCallback
.. autoclass:: transformers.integrations.WandbCallback
:members: setup
.. autoclass:: transformers.integrations.MLflowCallback
:members: setup
.. autoclass:: transformers.integrations.AzureMLCallback
TrainerCallback
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerCallback
:members:
TrainerState
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerState
:members:
TrainerControl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerControl
:members:


@ -1,10 +1,25 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Configuration
-----------------------------------------------------------------------------------------------------------------------
The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).
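For example (a minimal sketch; the model name and directory are arbitrary):
.. code-block:: python
from transformers import BertConfig
# load a pretrained configuration, tweak it, save it locally and reload it
config = BertConfig.from_pretrained("bert-base-uncased")
config.num_labels = 4
config.save_pretrained("./my-bert-config")  # writes config.json in that directory
reloaded = BertConfig.from_pretrained("./my-bert-config")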
PretrainedConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PretrainedConfig
:members:


@ -0,0 +1,70 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Logging
-----------------------------------------------------------------------------------------------------------------------
🤗 Transformers has a centralized logging system, so that you can set up the verbosity of the library easily.
Currently the default verbosity of the library is ``WARNING``.
To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity
to the INFO level.
.. code-block:: python
import transformers
transformers.logging.set_verbosity_info()
You can also use the environment variable ``TRANSFORMERS_VERBOSITY`` to override the default verbosity. You can set it
to one of the following: ``debug``, ``info``, ``warning``, ``error``, ``critical``. For example:
.. code-block:: bash
TRANSFORMERS_VERBOSITY=error ./myprogram.py
All the methods of this logging module are documented below; the main ones are
:func:`transformers.logging.get_verbosity` to get the current level of verbosity in the logger and
:func:`transformers.logging.set_verbosity` to set the verbosity to the level of your choice. In order (from the least
verbose to the most verbose), those levels (with their corresponding int values in parentheses) are:
- :obj:`transformers.logging.CRITICAL` or :obj:`transformers.logging.FATAL` (int value, 50): only reports the most
critical errors.
- :obj:`transformers.logging.ERROR` (int value, 40): only reports errors.
- :obj:`transformers.logging.WARNING` or :obj:`transformers.logging.WARN` (int value, 30): only reports errors and
warnings. This is the default level used by the library.
- :obj:`transformers.logging.INFO` (int value, 20): reports errors, warnings and basic information.
- :obj:`transformers.logging.DEBUG` (int value, 10): reports all information.
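For example (a minimal sketch), the generic setter and getter work with these levels directly:
.. code-block:: python
import transformers
transformers.logging.set_verbosity(transformers.logging.INFO)
assert transformers.logging.get_verbosity() == 20  # INFO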
Base setters
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.logging.set_verbosity_error
.. autofunction:: transformers.logging.set_verbosity_warning
.. autofunction:: transformers.logging.set_verbosity_info
.. autofunction:: transformers.logging.set_verbosity_debug
Other functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.logging.get_verbosity
.. autofunction:: transformers.logging.set_verbosity
.. autofunction:: transformers.logging.get_logger
.. autofunction:: transformers.logging.enable_explicit_format
.. autofunction:: transformers.logging.reset_format


@ -1,21 +1,75 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Models
-----------------------------------------------------------------------------------------------------------------------
The base classes :class:`~transformers.PreTrainedModel`, :class:`~transformers.TFPreTrainedModel`, and
:class:`~transformers.FlaxPreTrainedModel` implement the common methods for loading/saving a model either from a local
file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
S3 repository).
:class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
are common among all the models to:
- resize the input token embeddings when new tokens are added to the vocabulary
- prune the attention heads of the model.
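For instance (a minimal sketch; the model and token are arbitrary), resizing the embeddings after adding a new token
works the same way for every model:
.. code-block:: python
from transformers import BertForMaskedLM, BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")
tokenizer.add_tokens(["[NEW_TOKEN]"])
# grow the input (and tied output) embeddings to match the new vocabulary size
model.resize_token_embeddings(len(tokenizer))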
The other methods that are common to each model are defined in :class:`~transformers.modeling_utils.ModuleUtilsMixin`
(for the PyTorch models) and :class:`~transformers.modeling_tf_utils.TFModuleUtilsMixin` (for the TensorFlow models),
or, for text generation, :class:`~transformers.generation_utils.GenerationMixin` (for the PyTorch models) and
:class:`~transformers.generation_tf_utils.TFGenerationMixin` (for the TensorFlow models).
PreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedModel
:members:
ModuleUtilsMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_utils.ModuleUtilsMixin
:members:
TFPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFPreTrainedModel
:members:
TFModelUtilsMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
:members:
FlaxPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxPreTrainedModel
:members:
Generation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.generation_utils.GenerationMixin
:members:
.. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
:members:


@ -1,45 +1,70 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Optimization
-----------------------------------------------------------------------------------------------------------------------
The ``.optimization`` module provides:
- an optimizer with weight decay fixed that can be used to fine-tune models,
- several schedules in the form of schedule objects that inherit from ``_LRSchedule``, and
- a gradient accumulation class to accumulate the gradients of multiple batches.
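A short sketch of how these pieces typically fit together (``model`` is any PyTorch module; the hyper-parameters are
arbitrary):
.. code-block:: python
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)
# in the training loop, after loss.backward():
optimizer.step()
scheduler.step()
optimizer.zero_grad()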
AdamW (PyTorch)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamW
:members:
AdaFactor (PyTorch)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Adafactor
AdamWeightDecay (TensorFlow)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AdamWeightDecay
.. autofunction:: transformers.create_optimizer
Schedules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Learning Rate Schedules (PyTorch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.SchedulerType
.. autofunction:: transformers.get_scheduler
.. autofunction:: transformers.get_constant_schedule
.. autofunction:: transformers.get_constant_schedule_with_warmup
.. image:: /imgs/warmup_constant_schedule.png
:target: /imgs/warmup_constant_schedule.png
:alt:
.. autofunction:: transformers.get_cosine_schedule_with_warmup
.. image:: /imgs/warmup_cosine_schedule.png
:target: /imgs/warmup_cosine_schedule.png
:alt:
.. autofunction:: transformers.get_cosine_with_hard_restarts_schedule_with_warmup
.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
:target: /imgs/warmup_cosine_hard_restarts_schedule.png
@@ -47,9 +72,26 @@ Learning Rate Schedules
.. autofunction:: transformers.get_linear_schedule_with_warmup
.. image:: /imgs/warmup_linear_schedule.png
:target: /imgs/warmup_linear_schedule.png
:alt:
.. autofunction:: transformers.get_polynomial_decay_schedule_with_warmup
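As a practical illustration, here is a minimal sketch pairing :class:`~transformers.AdamW` with one of these schedules (the ``model``, ``dataloader``, learning rate and step counts are assumptions for illustration):

.. code-block:: python

    from transformers import AdamW, get_linear_schedule_with_warmup

    optimizer = AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
    # Warm up linearly over 100 steps, then decay linearly to 0 over the remaining 900
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=1000)

    for batch in dataloader:
        loss = model(**batch).loss  # assumes the batch contains labels so the model returns a loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is stepped once per optimizer step
        optimizer.zero_grad()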
Warmup (TensorFlow)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.WarmUp
:members:
Gradient Strategies
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
GradientAccumulator (TensorFlow)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.GradientAccumulator


@@ -0,0 +1,301 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Model outputs
-----------------------------------------------------------------------------------------------------------------------
PyTorch models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those
are data structures containing all the information returned by the model, but that can also be used as tuples or
dictionaries.
Let's see how this looks in an example:
.. code-block::
    from transformers import BertTokenizer, BertForSequenceClassification
    import torch

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
    outputs = model(**inputs, labels=labels)
The ``outputs`` object is a :class:`~transformers.modeling_outputs.SequenceClassifierOutput`. As we can see in the
documentation of that class below, it has an optional ``loss``, a ``logits``, an optional ``hidden_states`` and an
optional ``attentions`` attribute. Here we have the ``loss`` since we passed along ``labels``, but we don't have
``hidden_states`` and ``attentions`` because we didn't pass ``output_hidden_states=True`` or
``output_attentions=True``.
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get ``None``. Here for instance ``outputs.loss`` is the loss computed by the model, and ``outputs.attentions`` is
``None``.
When considering our ``outputs`` object as a tuple, it only considers the attributes that don't have ``None`` values.
Here for instance, it has two elements, ``loss`` then ``logits``, so
.. code-block::
    outputs[:2]
will return the tuple ``(outputs.loss, outputs.logits)``.
When considering our ``outputs`` object as a dictionary, it only considers the attributes that don't have ``None``
values. Here for instance, it has two keys that are ``loss`` and ``logits``.
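To summarize, here is a minimal sketch of the three access patterns on the ``outputs`` object from the example above:

.. code-block:: python

    loss = outputs.loss          # attribute access
    logits = outputs["logits"]   # dictionary access
    logits = outputs[1]          # tuple access; index 0 is ``loss`` here since labels were passed
    assert outputs.attentions is None  # not requested, so the attribute is None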
We document here the generic model outputs that are used by more than one model type. Specific output types are
documented on their corresponding model page.
ModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ModelOutput
:members:
BaseModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutput
:members:
BaseModelOutputWithPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPooling
:members:
BaseModelOutputWithCrossAttentions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithCrossAttentions
:members:
BaseModelOutputWithPoolingAndCrossAttentions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
:members:
BaseModelOutputWithPast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
:members:
BaseModelOutputWithPastAndCrossAttentions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
:members:
Seq2SeqModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqModelOutput
:members:
CausalLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.CausalLMOutput
:members:
CausalLMOutputWithCrossAttentions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithCrossAttentions
:members:
CausalLMOutputWithPast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPast
:members:
MaskedLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.MaskedLMOutput
:members:
Seq2SeqLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqLMOutput
:members:
NextSentencePredictorOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.NextSentencePredictorOutput
:members:
SequenceClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.SequenceClassifierOutput
:members:
Seq2SeqSequenceClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput
:members:
MultipleChoiceModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.MultipleChoiceModelOutput
:members:
TokenClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.TokenClassifierOutput
:members:
QuestionAnsweringModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.QuestionAnsweringModelOutput
:members:
Seq2SeqQuestionAnsweringModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
:members:
TFBaseModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutput
:members:
TFBaseModelOutputWithPooling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPooling
:members:
TFBaseModelOutputWithPast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPast
:members:
TFSeq2SeqModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqModelOutput
:members:
TFCausalLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutput
:members:
TFCausalLMOutputWithPast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutputWithPast
:members:
TFMaskedLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFMaskedLMOutput
:members:
TFSeq2SeqLMOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqLMOutput
:members:
TFNextSentencePredictorOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFNextSentencePredictorOutput
:members:
TFSequenceClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutput
:members:
TFSeq2SeqSequenceClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
:members:
TFMultipleChoiceModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFMultipleChoiceModelOutput
:members:
TFTokenClassifierOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFTokenClassifierOutput
:members:
TFQuestionAnsweringModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFQuestionAnsweringModelOutput
:members:
TFSeq2SeqQuestionAnsweringModelOutput
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
:members:


@@ -0,0 +1,148 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Pipelines
-----------------------------------------------------------------------------------------------------------------------
The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
:doc:`task summary <../task_summary>` for examples of use.
There are two categories of pipeline abstractions to be aware of:
- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
- The other task-specific pipelines:
- :class:`~transformers.ConversationalPipeline`
- :class:`~transformers.FeatureExtractionPipeline`
- :class:`~transformers.FillMaskPipeline`
- :class:`~transformers.QuestionAnsweringPipeline`
- :class:`~transformers.SummarizationPipeline`
- :class:`~transformers.TextClassificationPipeline`
- :class:`~transformers.TextGenerationPipeline`
- :class:`~transformers.TokenClassificationPipeline`
- :class:`~transformers.TranslationPipeline`
- :class:`~transformers.ZeroShotClassificationPipeline`
- :class:`~transformers.Text2TextGenerationPipeline`
- :class:`~transformers.TableQuestionAnsweringPipeline`
The pipeline abstraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
pipeline but requires an additional argument which is the `task`.
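For instance, here is a minimal sketch of the abstraction in action (when no model is specified, a default checkpoint for the task is downloaded):

.. code-block:: python

    from transformers import pipeline

    classifier = pipeline("sentiment-analysis")
    result = classifier("This library makes inference very easy!")
    print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.99...}]; the exact score depends on the model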
.. autofunction:: transformers.pipeline
The task specific pipelines
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ConversationalPipeline
=======================================================================================================================
.. autoclass:: transformers.Conversation
.. autoclass:: transformers.ConversationalPipeline
:special-members: __call__
:members:
FeatureExtractionPipeline
=======================================================================================================================
.. autoclass:: transformers.FeatureExtractionPipeline
:special-members: __call__
:members:
FillMaskPipeline
=======================================================================================================================
.. autoclass:: transformers.FillMaskPipeline
:special-members: __call__
:members:
NerPipeline
=======================================================================================================================
.. autoclass:: transformers.NerPipeline
See :class:`~transformers.TokenClassificationPipeline` for all details.
QuestionAnsweringPipeline
=======================================================================================================================
.. autoclass:: transformers.QuestionAnsweringPipeline
:special-members: __call__
:members:
SummarizationPipeline
=======================================================================================================================
.. autoclass:: transformers.SummarizationPipeline
:special-members: __call__
:members:
TableQuestionAnsweringPipeline
=======================================================================================================================
.. autoclass:: transformers.TableQuestionAnsweringPipeline
:special-members: __call__
TextClassificationPipeline
=======================================================================================================================
.. autoclass:: transformers.TextClassificationPipeline
:special-members: __call__
:members:
TextGenerationPipeline
=======================================================================================================================
.. autoclass:: transformers.TextGenerationPipeline
:special-members: __call__
:members:
Text2TextGenerationPipeline
=======================================================================================================================
.. autoclass:: transformers.Text2TextGenerationPipeline
:special-members: __call__
:members:
TokenClassificationPipeline
=======================================================================================================================
.. autoclass:: transformers.TokenClassificationPipeline
:special-members: __call__
:members:
TranslationPipeline
=======================================================================================================================
.. autoclass:: transformers.TranslationPipeline
:special-members: __call__
:members:
ZeroShotClassificationPipeline
=======================================================================================================================
.. autoclass:: transformers.ZeroShotClassificationPipeline
:special-members: __call__
:members:
Parent class: :obj:`Pipeline`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Pipeline
:members:


@@ -1,58 +1,172 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Processors
-----------------------------------------------------------------------------------------------------------------------
This library includes processors for several traditional tasks. These processors can be used to process a dataset into
examples that can be fed to a model.
Processors
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
All processors follow the same architecture which is that of the
:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list of
:class:`~transformers.data.processors.utils.InputExample`. These
:class:`~transformers.data.processors.utils.InputExample` can be converted to
:class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.
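As a quick illustration, here is a minimal sketch of building an :class:`~transformers.data.processors.utils.InputExample` by hand (the field values are purely illustrative):

.. code-block:: python

    from transformers.data.processors.utils import InputExample

    example = InputExample(
        guid="train-1",
        text_a="A man is eating food.",
        text_b="A person is eating.",
        label="entailment",
    )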
.. autoclass:: transformers.data.processors.utils.DataProcessor
:members:
.. autoclass:: transformers.data.processors.utils.InputExample
:members:
.. autoclass:: transformers.data.processors.utils.InputFeatures
:members:
GLUE
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates the
performance of models across a diverse set of existing NLU tasks. It was released together with the paper `GLUE: A
multi-task benchmark and analysis platform for natural language understanding
<https://openreview.net/pdf?id=rJ4km2R5t7>`__
This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
QQP, QNLI, RTE and WNLI.
Those processors are:
- :class:`~transformers.data.processors.utils.MrpcProcessor`
- :class:`~transformers.data.processors.utils.MnliProcessor`
- :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
- :class:`~transformers.data.processors.utils.Sst2Processor`
- :class:`~transformers.data.processors.utils.StsbProcessor`
- :class:`~transformers.data.processors.utils.QqpProcessor`
- :class:`~transformers.data.processors.utils.QnliProcessor`
- :class:`~transformers.data.processors.utils.RteProcessor`
- :class:`~transformers.data.processors.utils.WnliProcessor`
Additionally, the following method can be used to load values from a data file and convert them to a list of
:class:`~transformers.data.processors.utils.InputExample`.
.. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
An example using these processors is given in the `run_glue.py
<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.
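A condensed sketch of the same flow, assuming the ``tensorflow_datasets`` package is installed (the tokenizer checkpoint and sequence length are illustrative choices):

.. code-block:: python

    import tensorflow_datasets as tfds
    from transformers import BertTokenizer, glue_convert_examples_to_features

    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
    data = tfds.load("glue/mrpc")
    # Convert the raw examples into model-ready features for the MRPC task
    train_dataset = glue_convert_examples_to_features(data["train"], tokenizer, max_length=128, task="mrpc")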
XNLI
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates the
quality of cross-lingual text representations. XNLI is crowd-sourced dataset based on `MultiNLI
<http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment annotations for 15
different languages (including both high-resource language such as English and low-resource languages such as Swahili).
It was released together with the paper `XNLI: Evaluating Cross-lingual Sentence Representations
<https://arxiv.org/abs/1809.05053>`__
This library hosts the processor to load the XNLI data:
- :class:`~transformers.data.processors.utils.XnliProcessor`
Please note that since the gold labels are available on the test set, evaluation is performed on the test set.
An example using these processors is given in the `run_xnli.py
<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.
SQuAD
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer/>`__ is a benchmark that
evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
(v1.1) was released together with the paper `SQuAD: 100,000+ Questions for Machine Comprehension of Text
<https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside the paper `Know What You Don't
Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
This library hosts a processor for each of the two versions:
Processors
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Those processors are:
- :class:`~transformers.data.processors.utils.SquadV1Processor`
- :class:`~transformers.data.processors.utils.SquadV2Processor`
They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor`.
.. autoclass:: transformers.data.processors.squad.SquadProcessor
:members:
Additionally, the following method can be used to convert SQuAD examples into
:class:`~transformers.data.processors.utils.SquadFeatures` that can be used as model inputs.
.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features
These processors, as well as the aforementioned method, can be used with files containing the data as well as with the
`tensorflow_datasets` package. Examples are given below.
Example usage
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example using the processors as well as the conversion method using data files:
.. code-block::
    from transformers import SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features

    # ``tokenizer`` and the data-dir / length / stride variables are assumed to be defined earlier

    # Loading a V2 processor
    processor = SquadV2Processor()
    examples = processor.get_dev_examples(squad_v2_data_dir)

    # Loading a V1 processor
    processor = SquadV1Processor()
    examples = processor.get_dev_examples(squad_v1_data_dir)

    features = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
    )
Using `tensorflow_datasets` is as easy as using a data file:
.. code-block::
    import tensorflow_datasets as tfds

    # tensorflow_datasets only handles SQuAD V1
    tfds_examples = tfds.load("squad")
    examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
    features = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
    )
Another example using these processors is given in the :prefix_link:`run_squad.py
<examples/question-answering/run_squad.py>` script.


@@ -1,16 +1,72 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Tokenizer
-----------------------------------------------------------------------------------------------------------------------
A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
Rust library `tokenizers <https://github.com/huggingface/tokenizers>`__. The "Fast" implementations allow:
1. a significant speed-up in particular when doing batched tokenization and
2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
index of the token comprising a given character or the span of characters corresponding to a given token). Currently
no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa
and XLNet models).
The base classes :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
implement the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and
"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
(downloaded from HuggingFace's AWS S3 repository). They both rely on
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that contains the common methods, and
:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` thus implement the main
methods for using all the tokenizers:
- Tokenizing (splitting strings in sub-word token strings), converting tokens strings to ids and back, and
encoding/decoding (i.e., tokenizing and converting to integers).
- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
tokenizer for easy access and making sure they are not split during tokenization.
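A minimal sketch of these methods on a concrete tokenizer (the checkpoint name and added token are illustrative):

.. code-block:: python

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    tokens = tokenizer.tokenize("Hello world!")               # ['hello', 'world', '!']
    ids = tokenizer.convert_tokens_to_ids(tokens)             # token strings -> vocabulary ids
    text = tokenizer.decode(tokenizer.encode("Hello world!")) # round-trip, with special tokens added
    tokenizer.add_tokens(["new_tok"])                         # extend the vocabulary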
:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
HuggingFace `tokenizers library <https://github.com/huggingface/tokenizers>`__), this class provides in addition
several advanced alignment methods which can be used to map between the original string (character and words) and the
token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
to a given token).
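For instance, here is a minimal sketch of those alignment methods, assuming a "Fast" tokenizer:

.. code-block:: python

    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
    encoding = tokenizer("Hello world!")

    print(encoding["input_ids"])      # standard dictionary access to the model inputs
    print(encoding.tokens())          # the token strings
    print(encoding.words())           # index of the word each token comes from
    print(encoding.char_to_token(1))  # index of the token containing character 1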
PreTrainedTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedTokenizer
:special-members: __call__
:members:
PreTrainedTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PreTrainedTokenizerFast
:special-members: __call__
:members:
BatchEncoding
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchEncoding
:members:


@@ -0,0 +1,533 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Trainer
-----------------------------------------------------------------------------------------------------------------------
The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
customization during training.
The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
previous features. To inject custom behavior you can subclass them and override the following methods:
- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
- **log** -- Logs information on the various objects watching training.
- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
init.
- **compute_loss** - Computes the loss on a batch of training inputs.
- **training_step** -- Performs a training step.
- **prediction_step** -- Performs an evaluation/test step.
- **run_model** (TensorFlow only) -- Basic pass through the model.
- **evaluate** -- Runs an evaluation loop and returns metrics.
- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function:
.. code-block:: python
    from transformers import Trainer

    class MyTrainer(Trainer):
        def compute_loss(self, model, inputs):
            labels = inputs.pop("labels")
            outputs = model(**inputs)
            logits = outputs[0]
            # my_custom_loss is any user-defined function returning a scalar loss
            return my_custom_loss(logits, labels)
Another way to customize the training loop behavior for the PyTorch :class:`~transformers.Trainer` is to use
:doc:`callbacks <callback>` that can inspect the training loop state (for progress reporting, logging on TensorBoard or
other ML platforms...) and take decisions (like early stopping).
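For instance, here is a minimal sketch wiring early stopping into the loop via :class:`~transformers.EarlyStoppingCallback` (the model and datasets are assumed to be defined elsewhere):

.. code-block:: python

    from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

    args = TrainingArguments(
        output_dir="output_dir",
        evaluation_strategy="epoch",        # the callback acts on regular evaluations
        load_best_model_at_end=True,        # required by the callback
        metric_for_best_model="eval_loss",
    )
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )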
Trainer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Trainer
:members:
Seq2SeqTrainer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Seq2SeqTrainer
:members: evaluate, predict
TFTrainer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFTrainer
:members:
TrainingArguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainingArguments
:members:
Seq2SeqTrainingArguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.Seq2SeqTrainingArguments
:members:
TFTrainingArguments
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFTrainingArguments
:members:
Trainer Integrations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :class:`~transformers.Trainer` has been extended to support libraries that may dramatically improve your training
time and fit much bigger models.
Currently it supports third party solutions, `DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ and `FairScale
<https://github.com/facebookresearch/fairscale/>`__, which implement parts of the paper `ZeRO: Memory Optimizations
Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He
<https://arxiv.org/abs/1910.02054>`__.
This support is new and experimental as of this writing. You will need at least two GPUs to benefit from these
features.
FairScale
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
By integrating `FairScale <https://github.com/facebookresearch/fairscale/>`__ the :class:`~transformers.Trainer`
provides support for the following features from `the ZeRO paper <https://arxiv.org/abs/1910.02054>`__:
1. Optimizer State Sharding
2. Gradient Sharding
To deploy this feature:
1. Install the library via PyPI:
.. code-block:: bash
pip install fairscale
or find more details on the `FairScale GitHub page
<https://github.com/facebookresearch/fairscale/#installation>`__.
2. Add ``--sharded_ddp`` to the command line arguments, and make sure you have added the distributed launcher ``-m
torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
For example, here is how you could use it for ``finetune_trainer.py`` with 2 GPUs:
.. code-block:: bash
cd examples/seq2seq
python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \
--do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation \
--fp16 --sharded_ddp
Notes:
- This feature requires distributed training (so multiple GPUs).
- It is not implemented for TPUs.
- It works with ``--fp16`` too, to make things even faster.
- One of the main benefits of enabling ``--sharded_ddp`` is that it uses a lot less GPU memory, so you should be able
to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
significantly shorter training time.
DeepSpeed
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
`DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ implements everything described in the `ZeRO paper
<https://arxiv.org/abs/1910.02054>`__, except ZeRO's stage 3, "Parameter Partitioning (Pos+g+p)". Currently it provides
full support for:
1. Optimizer State Partitioning (ZeRO stage 1)
2. Gradient Partitioning (ZeRO stage 2)
To deploy this feature:
1. Install the library via PyPI:
.. code-block:: bash
pip install deepspeed
or find more details on the `DeepSpeed GitHub page <https://github.com/microsoft/deepspeed#installation>`__.
2. Adjust the :class:`~transformers.Trainer` command line arguments as follows:
1. replace ``python -m torch.distributed.launch`` with ``deepspeed``.
2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file
as documented `here <https://www.deepspeed.ai/docs/config-json/>`__. The file naming is up to you.
Therefore, if your original command line looked like the following:
.. code-block:: bash
python -m torch.distributed.launch --nproc_per_node=2 your_program.py <normal cl args>
Now it should be:
.. code-block:: bash
deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
Unlike ``torch.distributed.launch``, where you have to specify how many GPUs to use with ``--nproc_per_node``, with
the ``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used.
The full details on how to configure various nodes and GPUs can be found `here
<https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.
Here is an example of running ``finetune_trainer.py`` under DeepSpeed deploying all available GPUs:
.. code-block:: bash
cd examples/seq2seq
deepspeed ./finetune_trainer.py --deepspeed ds_config.json \
--model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
--output_dir output_dir --overwrite_output_dir \
--do_train --n_train 500 --num_train_epochs 1 \
--per_device_train_batch_size 1 --freeze_embeds \
--src_lang en_XX --tgt_lang ro_RO --task translation
Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` -
i.e. two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments
to deal with, we combined the two into a single argument.
Before you can deploy DeepSpeed, let's discuss its configuration.
**Configuration:**
For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer
to the `following documentation <https://www.deepspeed.ai/docs/config-json/>`__.
While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
several ways:
1. Supply most of the configuration inside the file, and just use a few required command line arguments. This is the
recommended way as it puts most of the configuration params in one place.
2. Supply just the ZeRO configuration params inside the file, and configure the rest using the normal
:class:`~transformers.Trainer` command line arguments.
3. Any variation of the first two ways.
To get an idea of what a DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features,
enables FP16, and uses the AdamW optimizer and the WarmupLR scheduler:
.. code-block:: json
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"cpu_offload": true
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": 3e-5,
"betas": [ 0.8, 0.999 ],
"eps": 1e-8,
"weight_decay": 3e-7
}
},
"zero_allow_untested_optimizer": true,
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 3e-5,
"warmup_num_steps": 500
}
}
}
If you already have a command line that you have been using with :class:`transformers.Trainer` args, you can continue
using those and the :class:`~transformers.Trainer` will automatically convert them into the corresponding DeepSpeed
configuration at run time. For example, you could use the following configuration file:
.. code-block:: json
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"cpu_offload": true
}
}
and the following command line arguments:
.. code-block:: bash
--learning_rate 3e-5 --warmup_steps 500 --adam_beta1 0.8 --adam_beta2 0.999 --adam_epsilon 1e-8 \
--weight_decay 3e-7 --lr_scheduler_type constant_with_warmup --fp16 --fp16_backend amp
to achieve the same configuration as provided by the longer json file in the first example.
When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer`
to the console, so you can see exactly what final configuration was passed to it.
**Shared Configuration:**
Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function
correctly. Therefore, to prevent conflicting definitions, which could lead to hard-to-detect errors, we chose to
configure those via the :class:`~transformers.Trainer` command line arguments.
Therefore, the following DeepSpeed configuration params shouldn't be used with the :class:`~transformers.Trainer`:
* ``train_batch_size``
* ``train_micro_batch_size_per_gpu``
* ``gradient_accumulation_steps``
as these will be automatically derived from the run time environment and the following 2 command line arguments:
.. code-block:: bash
--per_device_train_batch_size 8 --gradient_accumulation_steps 2
which are always required to be supplied. (The effective DeepSpeed ``train_batch_size`` is derived as the number of
GPUs times ``--per_device_train_batch_size`` times ``--gradient_accumulation_steps``.)
Of course, you will need to adjust the values in this example to your situation.
**ZeRO:**
The ``zero_optimization`` section of the configuration file is the most important part (`docs
<https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training>`__), since that is where you define
which ZeRO stages you want to enable and how to configure them.
.. code-block:: json
{
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"cpu_offload": true
}
}
Notes:
- enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``)
- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB
footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting
OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB.
This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
no equivalent command line arguments.
**Optimizer:**
DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
recommended. DeepSpeed can, however, also import other optimizers from ``torch``. The full documentation is `here
<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line
arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``.
Here is an example of the pre-configured ``optimizer`` entry for AdamW:
.. code-block:: json
{
"zero_allow_untested_optimizer": true,
"optimizer": {
"type": "AdamW",
"params": {
"lr": 0.001,
"betas": [0.8, 0.999],
"eps": 1e-8,
"weight_decay": 3e-7
}
}
}
Since AdamW isn't on the list of optimizers tested with DeepSpeed/ZeRO, we have to add the
``zero_allow_untested_optimizer`` flag.
If you want to use one of the officially supported optimizers, configure it explicitly in the configuration file, and
make sure to adjust the values, e.g., if you use Adam you will want ``weight_decay`` around ``0.01``.
**Scheduler:**
DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
<https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.
If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
the value of ``--lr_scheduler_type`` to configure it. Currently the :class:`~transformers.Trainer` supports only 2 LR
schedulers that are also supported by DeepSpeed:
* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup``
* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``,
therefore, if you don't configure the scheduler, this is the one that will be configured by default.
In either case, the values of ``--learning_rate`` and ``--warmup_steps`` will be used for the configuration.
In other words, if you don't use the configuration file to set the ``scheduler`` entry, provide either:
.. code-block:: bash
--lr_scheduler_type constant_with_warmup --learning_rate 3e-5 --warmup_steps 500
or
.. code-block:: bash
--lr_scheduler_type linear --learning_rate 3e-5 --warmup_steps 500
with the desired values. If you don't pass these arguments, reasonable default values will be used instead.
In the case of WarmupDecayLR, ``total_num_steps`` is set either via the ``--max_steps`` command line argument or, if
it is not provided, derived automatically at run time based on the environment, the size of the dataset and other
command line arguments.
Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``constant_with_warmup`` in the
:class:`~transformers.Trainer` API):
.. code-block:: json
{
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 0.001,
"warmup_num_steps": 1000
}
}
}
**Automatic Mixed Precision:**
You can work with FP16 in one of the following ways:
1. PyTorch native amp, as documented `here <https://www.deepspeed.ai/docs/config-json/#fp16-training-options>`__.
2. NVIDIA's apex, as documented `here
<https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
If you want to use an equivalent of the PyTorch native amp, you can either configure the ``fp16`` entry in the
configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``.
Here is an example of the ``fp16`` configuration:
.. code-block:: json
{
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
If you want to use NVIDIA's apex instead, you can either configure the ``amp`` entry in the configuration file, or
use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level O1``.
Here is an example of the ``amp`` configuration:
.. code-block:: json
{
"amp": {
"enabled": true,
"opt_level": "O1"
}
}
**Gradient Clipping:**
If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer`
will use the value of the ``--max_grad_norm`` command line argument to set it.
Here is an example of the ``gradient_clipping`` configuration:
.. code-block:: json
{
"gradient_clipping": 1.0,
}
**Notes:**
* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`.
* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source
<https://github.com/microsoft/deepspeed#installation>`__ to best match your hardware, and also if you need to enable
certain features, like 1-bit Adam, which aren't available in the PyPI distribution.
* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with HuggingFace ``transformers`` - you can
use any model with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration
instructions <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
**Main DeepSpeed Resources:**
- `github <https://github.com/microsoft/deepspeed>`__
- `Usage docs <https://www.deepspeed.ai/getting-started/>`__
- `API docs <https://deepspeed.readthedocs.io/en/latest/index.html>`__
Finally, please remember that the HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed; therefore, if
you have any problems or questions regarding DeepSpeed usage, please file an issue with `DeepSpeed GitHub
<https://github.com/microsoft/DeepSpeed/issues>`__.


@@ -1,17 +1,211 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Migrating from previous packages
## Migrating from transformers `v3.x` to `v4.x`
A couple of changes were introduced when switching from version 3 to version 4. Below is a summary of the expected
changes:
#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set.
This introduces two breaking changes:
- The handling of overflowing tokens between the python and rust tokenizers is different.
- The rust tokenizers do not accept integers in the encoding methods.
##### How to obtain the same behavior as v3.x in v4.x
- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
In version `v3.x`:
```py
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
```
to obtain the same in version `v4.x`:
```py
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
```
#### 2. SentencePiece is removed from the required dependencies
The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
This includes the **slow** versions of:
- `XLNetTokenizer`
- `AlbertTokenizer`
- `CamembertTokenizer`
- `MBartTokenizer`
- `PegasusTokenizer`
- `T5Tokenizer`
- `ReformerTokenizer`
- `XLMRobertaTokenizer`
##### How to obtain the same behavior as v3.x in v4.x
In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
In version `v3.x`:
```bash
pip install transformers
```
to obtain the same in version `v4.x`:
```bash
pip install transformers[sentencepiece]
```
or
```bash
pip install transformers sentencepiece
```
#### 3. The architecture of the repo has been updated so that each model resides in its folder
The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
##### How to obtain the same behavior as v3.x in v4.x
In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers.
In version `v3.x`:
```py
from transformers.modeling_bert import BertLayer
```
to obtain the same in version `v4.x`:
```py
from transformers.models.bert.modeling_bert import BertLayer
```
#### 4. Switching the `return_dict` argument to `True` by default
The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documenting: keys can be used to retrieve values, while it still behaves like a tuple, so users may retrieve objects by index or by slice.
This is a breaking change: unlike the tuples, the returned object cannot be unpacked, so `value0, value1 = outputs` will no longer work.
##### How to obtain the same behavior as v3.x in v4.x
In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
In version `v3.x`:
```py
model = BertModel.from_pretrained("bert-base-cased")
outputs = model(**inputs)
```
to obtain the same in version `v4.x`:
```py
model = BertModel.from_pretrained("bert-base-cased")
outputs = model(**inputs, return_dict=False)
```
or
```py
model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
outputs = model(**inputs)
```
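If you keep the new default instead, the returned object is usable both ways. A minimal sketch (the checkpoint name is only illustrative; any model behaves the same):
```py
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertModel.from_pretrained("bert-base-cased")
inputs = tokenizer("Hello world", return_tensors="pt")

outputs = model(**inputs)  # return_dict=True is the v4.x default
# The same value is reachable by attribute, by key, or by index:
assert outputs.last_hidden_state is outputs["last_hidden_state"]
assert outputs[0] is outputs.last_hidden_state
```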
#### 5. Removed some deprecated attributes
Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
Here is a list of these attributes/methods/arguments and what their replacements should be (a short before/after sketch follows these lists):
In several models, the labels become consistent with the other models:
- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
In several models, the caching mechanism becomes consistent with the other models:
- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
- `past` becomes `past_key_values` in all CTRL models.
- `past` becomes `past_key_values` in all GPT-2 models.
Regarding the tokenizer classes:
- The tokenizer attribute `max_len` becomes `model_max_length`.
- The tokenizer attribute `return_lengths` becomes `return_length`.
- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
Regarding the `Trainer` class:
- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
- The `Trainer` attribute `data_collator` should be a callable.
- The `Trainer` method `_log` is deprecated in favor of `log`.
- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
Regarding the `TFTrainer` class:
- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
- The `TFTrainer` method `_log` is deprecated in favor of `log`.
- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
Regarding the `TrainingArguments` class:
- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
Regarding the Transfo-XL model:
- The Transfo-XL configuration attribute `tie_weight` becomes `tie_words_embeddings`.
- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
Regarding pipelines:
- The `FillMaskPipeline` argument `topk` becomes `top_k`.
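To make a few of these renames concrete, here is a hedged before/after sketch (the checkpoint name and argument values are illustrative only; `top_k` is passed at pipeline creation here):
```py
from transformers import BertForMaskedLM, BertTokenizer, pipeline

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = BertForMaskedLM.from_pretrained("bert-base-cased")

# Tokenizer renames: is_pretokenized -> is_split_into_words, return_lengths -> return_length
enc = tokenizer(["Hello", "world"], is_split_into_words=True, return_length=True, return_tensors="pt")
print(tokenizer.model_max_length)  # replaces the removed tokenizer.max_len

# Model label renames: masked_lm_labels / lm_labels -> labels
outputs = model(input_ids=enc["input_ids"], labels=enc["input_ids"])

# Pipeline renames: topk -> top_k
fill_mask = pipeline("fill-mask", model="bert-base-cased", top_k=3)
```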
## Migrating from pytorch-transformers to 🤗 Transformers
Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
### Positional order of some models' keyword inputs (`attention_mask`, `token_type_ids`...) changed
To be able to use Torchscript (see #1010, #1204 and #1195), the specific order of some models' **keyword inputs** (`attention_mask`, `token_type_ids`...) has been changed.
If you used to call the models with keyword names for keyword arguments, e.g. `model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
If you used to call the models with positional inputs for keyword arguments, e.g. `model(input_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
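In practice, the robust pattern in both libraries is to pass everything except `input_ids` by keyword. A one-line sketch (assuming `model` and the tensors are already defined):
```py
# Safe regardless of how the positional order changed between releases:
outputs = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
```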
## Migrating from pytorch-pretrained-bert
Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to 🤗 Transformers.
### Models always output `tuples`
The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for a `BertForSequenceClassification` classification model:
```python
# Let's load our model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')

# If you used to have this line in pytorch-pretrained-bert:
loss = model(input_ids, labels=labels)

# Now just use this line in 🤗 Transformers to extract the loss from the output tuple:
outputs = model(input_ids, labels=labels)
loss = outputs[0]

# In 🤗 Transformers you can also have access to the logits:
loss, logits = outputs[:2]

# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
outputs = model(input_ids, labels=labels)
loss, logits, attentions = outputs
```
Here is a conversion example from `BertAdam` with a linear warmup and decay schedule:
```python
# Parameters:
lr = 1e-3
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

### Previously BertAdam optimizer was instantiated like this:
optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, num_training_steps=num_training_steps)
### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    optimizer.step()

### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
### and used like this:
for batch in train_data:
    loss = model(batch)
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
    optimizer.step()
    scheduler.step()
```
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
ALBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ALBERT model was proposed in `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
<https://arxiv.org/abs/1909.11942>`__ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma,
Radu Soricut. It presents two parameter-reduction techniques to lower memory consumption and increase the training
speed of BERT:
- Splitting the embedding matrix into two smaller matrices.
- Using repeating layers split among groups.
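Both techniques surface directly in :class:`~transformers.AlbertConfig`; below is a minimal sketch (the hyperparameter values are illustrative, not those of an official checkpoint):
.. code-block:: python

    from transformers import AlbertConfig, AlbertModel

    # Factorized embeddings: embedding_size (E) is much smaller than hidden_size (H),
    # and all 12 layers share weights within a single hidden group.
    config = AlbertConfig(embedding_size=128, hidden_size=768,
                          num_hidden_layers=12, num_hidden_groups=1)
    model = AlbertModel(config)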
The abstract from the paper is the following:
*Increasing model size when pretraining natural language representations often results in improved performance on
downstream tasks. However, at some point further model increases become harder due to GPU/TPU memory limitations,
longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream tasks
with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and
SQuAD benchmarks while having fewer parameters compared to BERT-large.*
Tips:
- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
than the left.
- ALBERT uses repeating layers, which results in a small memory footprint; however, the computational cost remains
similar to a BERT-like architecture with the same number of hidden layers, as it has to iterate through the same
number of (repeating) layers.
The original code can be found `here <https://github.com/google-research/ALBERT>`__.
AlbertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertConfig
:members:
AlbertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
AlbertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertTokenizerFast
:members:
Albert specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.albert.modeling_albert.AlbertForPreTrainingOutput
:members:
.. autoclass:: transformers.models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput
:members:
AlbertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertModel
:members: forward
AlbertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForPreTraining
:members: forward
AlbertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMaskedLM
:members: forward
AlbertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForSequenceClassification
:members: forward
AlbertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForMultipleChoice
:members:
AlbertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForTokenClassification
:members: forward
AlbertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AlbertForQuestionAnswering
:members: forward
TFAlbertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertModel
:members: call
TFAlbertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForPreTraining
:members: call
TFAlbertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMaskedLM
:members: call
TFAlbertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForSequenceClassification
:members: call
TFAlbertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForMultipleChoice
:members: call
TFAlbertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForTokenClassification
:members: call
TFAlbertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAlbertForQuestionAnswering
:members: call
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Auto Classes
-----------------------------------------------------------------------------------------------------------------------
In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you
are supplying to the :obj:`from_pretrained()` method. AutoClasses are here to do this job for you so that you
automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary.
Instantiating one of :class:`~transformers.AutoConfig`, :class:`~transformers.AutoModel`, and
:class:`~transformers.AutoTokenizer` will directly create a class of the relevant architecture. For instance
.. code-block:: python
model = AutoModel.from_pretrained('bert-base-cased')
will create a model that is an instance of :class:`~transformers.BertModel`.
There is one class of :obj:`AutoModel` for each task, and for each backend (PyTorch or TensorFlow).
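As a slightly fuller sketch, the matching config, tokenizer and model can all be retrieved from the same checkpoint name:
.. code-block:: python

    from transformers import AutoConfig, AutoModel, AutoTokenizer

    # Each Auto class resolves 'bert-base-cased' to the BERT architecture:
    config = AutoConfig.from_pretrained('bert-base-cased')
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
    model = AutoModel.from_pretrained('bert-base-cased')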
AutoConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoConfig
:members:
AutoTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoTokenizer
:members:
AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModel
:members:
AutoModelForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForPreTraining
:members:
AutoModelForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForCausalLM
:members:
AutoModelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForMaskedLM
:members:
AutoModelForSeq2SeqLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForSeq2SeqLM
:members:
AutoModelForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForSequenceClassification
:members:
AutoModelForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForMultipleChoice
:members:
AutoModelForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForNextSentencePrediction
:members:
AutoModelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForTokenClassification
:members:
AutoModelForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForQuestionAnswering
:members:
AutoModelForTableQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.AutoModelForTableQuestionAnswering
:members:
TFAutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModel
:members:
TFAutoModelForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForPreTraining
:members:
TFAutoModelForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForCausalLM
:members:
TFAutoModelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForMaskedLM
:members:
TFAutoModelForSeq2SeqLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForSeq2SeqLM
:members:
TFAutoModelForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForSequenceClassification
:members:
TFAutoModelForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForMultipleChoice
:members:
TFAutoModelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForTokenClassification
:members:
TFAutoModelForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFAutoModelForQuestionAnswering
:members:
FlaxAutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxAutoModel
:members:
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BART
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** If you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@patrickvonplaten
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Bart model was proposed in `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation,
Translation, and Comprehension <https://arxiv.org/abs/1910.13461>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan
Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
According to the abstract,
- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a
left-to-right decoder (like GPT).
- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme,
where spans of text are replaced with a single mask token.
- BART is particularly effective when fine-tuned for text generation but also works well for comprehension tasks. It
matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new
state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains
of up to 6 ROUGE.
The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`__.
Examples
_______________________________________________________________________________________________________________________
- Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in
:prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
- An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
object can be found in this `forum discussion
<https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904>`__.
- `Distilled checkpoints <https://huggingface.co/models?search=distilbart>`__ are described in this `paper
<https://arxiv.org/abs/2010.13002>`__.
Implementation Notes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use :class:`~transformers.BartTokenizer` or
:meth:`~transformers.BartTokenizer.encode` to get the proper splitting.
- The forward pass of :class:`~transformers.BartModel` will create the ``decoder_input_ids`` if they are not passed.
This is different from some other modeling APIs. A typical use case of this feature is mask filling.
- Model predictions are intended to be identical to the original implementation when
:obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
:func:`fairseq.encode` starts with a space.
- :meth:`~transformers.BartForConditionalGeneration.generate` should be used for conditional generation tasks like
summarization; see the example in that method's docstring and the sketch below.
- Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform
mask-filling tasks.
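As a hedged illustration of the notes above, a summarization sketch with the :obj:`facebook/bart-large-cnn` checkpoint (the input text and generation hyperparameters are illustrative):
.. code-block:: python

    from transformers import BartForConditionalGeneration, BartTokenizer

    tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

    ARTICLE = "PG&E stated it scheduled the blackouts in response to forecasts for high winds."
    inputs = tokenizer([ARTICLE], max_length=1024, truncation=True, return_tensors="pt")

    # generate() creates the decoder_input_ids and runs beam search internally
    summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=60, early_stopping=True)
    print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))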
Mask Filling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be used to fill multi-token masks.
.. code-block::
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
tok = BartTokenizer.from_pretrained("facebook/bart-large")
example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
batch = tok(example_english_phrase, return_tensors='pt')
generated_ids = model.generate(batch['input_ids'])
assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
BartConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartConfig
:members:
BartTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartTokenizer
:members:
BartTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartTokenizerFast
:members:
BartModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartModel
:members: forward
BartForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForConditionalGeneration
:members: forward
BartForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForSequenceClassification
:members: forward
BartForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BartForQuestionAnswering
:members: forward
TFBartModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBartModel
:members: call
TFBartForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBartForConditionalGeneration
:members: call
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BARThez
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model
<https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
2020.
The abstract of the paper:
*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
understanding tasks. While there are some notable exceptions, most of the available models and research have been
conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
The Authors' code can be found `here <https://github.com/moussaKam/BARThez>`__.
Examples
_______________________________________________________________________________________________________________________
- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
:prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
BarthezTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BarthezTokenizer
:members:
BarthezTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BarthezTokenizerFast
:members:
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BERT model was proposed in `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
<https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a
bidirectional transformer pretrained using a combination of masked language modeling objective and next sentence
prediction on a large corpus comprising the Toronto Book Corpus and Wikipedia.
The abstract from the paper is the following:
*We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations
from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional
representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result,
the pre-trained BERT model can be fine-tuned with just one additional output layer to create state-of-the-art models
for a wide range of tasks, such as question answering and language inference, without substantial task-specific
architecture modifications.*
*BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural
language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI
accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute
improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*
Tips:
- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
the left.
- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is
efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation.
The original code can be found `here <https://github.com/google-research/bert>`__.
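As a quick, hedged sketch of the MLM objective in practice (the checkpoint and the expected completion are illustrative):
.. code-block:: python

    import torch
    from transformers import BertForMaskedLM, BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertForMaskedLM.from_pretrained("bert-base-uncased")

    inputs = tokenizer(f"The capital of France is {tokenizer.mask_token}.", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # Take the highest-scoring token at the [MASK] position
    mask_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    print(tokenizer.decode(logits[0, mask_index].argmax(-1)))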
BertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertConfig
:members:
BertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
BertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertTokenizerFast
:members:
Bert specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.bert.modeling_bert.BertForPreTrainingOutput
:members:
.. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
:members:
BertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertModel
:members: forward
BertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForPreTraining
:members: forward
BertLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertLMHeadModel
:members: forward
BertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMaskedLM
:members: forward
BertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForNextSentencePrediction
:members: forward
BertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForSequenceClassification
:members: forward
BertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForMultipleChoice
:members: forward
BertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForTokenClassification
:members: forward
BertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertForQuestionAnswering
:members: forward
TFBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertModel
:members: call
TFBertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForPreTraining
:members: call
TFBertLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertLMHeadModel
:members: call
TFBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMaskedLM
:members: call
TFBertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForNextSentencePrediction
:members: call
TFBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForSequenceClassification
:members: call
TFBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForMultipleChoice
:members: call
TFBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForTokenClassification
:members: call
TFBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBertForQuestionAnswering
:members: call
FlaxBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertModel
:members: __call__
FlaxBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaxBertForMaskedLM
:members: __call__
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
BertGeneration
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BertGeneration model is a BERT model that can be leveraged for sequence-to-sequence tasks using
:class:`~transformers.EncoderDecoderModel` as proposed in `Leveraging Pre-trained Checkpoints for Sequence Generation
Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
The abstract from the paper is the following:
*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
developed a Transformer-based sequence-to-sequence model that is compatible with publicly available pre-trained BERT,
GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the utility of initializing our model, both
encoder and decoder, with these checkpoints. Our models result in new state-of-the-art results on Machine Translation,
Text Summarization, Sentence Splitting, and Sentence Fusion.*
Usage:
- The model can be used in combination with the :class:`~transformers.EncoderDecoderModel` to leverage two pretrained
BERT checkpoints for subsequent fine-tuning.
.. code-block::
# leverage checkpoints for Bert2Bert model...
# use BERT's cls token as BOS token and sep token as EOS token
encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
# add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
# create tokenizer...
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
# train...
loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
loss.backward()
- Pretrained :class:`~transformers.EncoderDecoderModel` checkpoints are also directly available in the model hub, e.g.:
.. code-block::
# instantiate sentence fusion model
sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
outputs = sentence_fuser.generate(input_ids)
print(tokenizer.decode(outputs[0]))
Tips:
- :class:`~transformers.BertGenerationEncoder` and :class:`~transformers.BertGenerationDecoder` should be used in
combination with :class:`~transformers.EncoderDecoderModel`.
- For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input.
Therefore, no EOS token should be added to the end of the input.
The original code can be found `here <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`__.
BertGenerationConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertGenerationConfig
:members:
BertGenerationTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertGenerationTokenizer
:members: save_vocabulary
BertGenerationEncoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertGenerationEncoder
:members: forward
BertGenerationDecoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertGenerationDecoder
:members: forward
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Bertweet
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The BERTweet model was proposed in `BERTweet: A pre-trained language model for English Tweets
<https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf>`__ by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen.
The abstract from the paper is the following:
*We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having
the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et
al., 2019). Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al.,
2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks:
Part-of-speech tagging, Named-entity recognition and text classification.*
Example of use:
.. code-block::
import torch
from transformers import AutoModel, AutoTokenizer
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
# For transformers v4.x+:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
# For transformers v3.x:
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
# INPUT TWEET IS ALREADY NORMALIZED!
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
input_ids = torch.tensor([tokenizer.encode(line)])
with torch.no_grad():
features = bertweet(input_ids) # Models outputs are now tuples
## With TensorFlow 2.0+:
# from transformers import TFAutoModel
# bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
BertweetTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BertweetTokenizer
:members:
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Blenderbot
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** If you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ .
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
<https://arxiv.org/pdf/2004.13637.pdf>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu,
Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
The abstract of the paper is the following:
*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
persona. We show that large scale models can learn these skills when given appropriate training data and choice of
generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
failure cases of our models.*
The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__ .
Implementation Notes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Blenderbot uses a standard `seq2seq model transformer <https://arxiv.org/pdf/1706.03762.pdf>`__ based architecture.
- Available checkpoints can be found in the `model hub <https://huggingface.co/models?search=blenderbot>`__.
- This is the `default` Blenderbot model class. However, some smaller checkpoints, such as
``facebook/blenderbot_small_90M``, have a different architecture and consequently should be used with
`BlenderbotSmall <https://huggingface.co/transformers/master/model_doc/blenderbot_small.html>`__.
Usage
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Here is an example of model usage:
.. code-block::
>>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
>>> mname = 'facebook/blenderbot-400M-distill'
>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
>>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
>>> UTTERANCE = "My friends are cool but they eat too many carbs."
>>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
>>> reply_ids = model.generate(**inputs)
>>> print(tokenizer.batch_decode(reply_ids))
["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
BlenderbotConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotConfig
:members:
BlenderbotTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotTokenizer
:members: build_inputs_with_special_tokens
BlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See :obj:`transformers.BartModel` for arguments to `forward` and `generate`
.. autoclass:: transformers.BlenderbotModel
:members: forward
BlenderbotForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` and `generate`
.. autoclass:: transformers.BlenderbotForConditionalGeneration
:members: forward
TFBlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBlenderbotModel
:members: call
TFBlenderbotForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBlenderbotForConditionalGeneration
:members: call
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Blenderbot Small
-----------------------------------------------------------------------------------------------------------------------
Note that :class:`~transformers.BlenderbotSmallModel` and
:class:`~transformers.BlenderbotSmallForConditionalGeneration` are only used in combination with the checkpoint
`facebook/blenderbot-90M <https://huggingface.co/facebook/blenderbot-90M>`__. Larger Blenderbot checkpoints should
instead be used with :class:`~transformers.BlenderbotModel` and
:class:`~transformers.BlenderbotForConditionalGeneration`.
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
<https://arxiv.org/pdf/2004.13637.pdf>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu,
Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
The abstract of the paper is the following:
*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
persona. We show that large scale models can learn these skills when given appropriate training data and choice of
generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
failure cases of our models.*
The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__.
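Usage follows the same pattern as the larger Blenderbot checkpoints, only with the ``BlenderbotSmall`` classes. Here is a minimal sketch, assuming the ``facebook/blenderbot-90M`` checkpoint mentioned above (the generated reply will vary with the checkpoint and generation settings):

.. code-block::

    >>> from transformers import BlenderbotSmallTokenizer, BlenderbotSmallForConditionalGeneration
    >>> mname = 'facebook/blenderbot-90M'
    >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname)
    >>> tokenizer = BlenderbotSmallTokenizer.from_pretrained(mname)
    >>> UTTERANCE = "My friends are cool but they eat too many carbs."
    >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
    >>> reply_ids = model.generate(**inputs)
    >>> print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))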
BlenderbotSmallConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallConfig
:members:
BlenderbotSmallTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
BlenderbotSmallModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallModel
:members: forward
BlenderbotSmallForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BlenderbotSmallForConditionalGeneration
:members: forward
TFBlenderbotSmallModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBlenderbotSmallModel
:members: call
TFBlenderbotSmallForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFBlenderbotSmallForConditionalGeneration
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
CamemBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__ by
Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019 and was trained
on 138GB of French text.
The abstract from the paper is the following:
*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
models have either been trained on English data or on the concatenation of data in multiple languages. This makes
practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
downstream applications for French NLP.*
Tips:
- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
as well as the information relative to the inputs and outputs.
The original code can be found `here <https://camembert-model.fr/>`__.
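Since the implementation mirrors RoBERTa, a masked-word prediction sketch illustrates typical usage; the ``camembert-base`` checkpoint identifier is an assumption here (other CamemBERT checkpoints from the model hub work the same way):

.. code-block::

    >>> from transformers import CamembertTokenizer, CamembertForMaskedLM
    >>> tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
    >>> model = CamembertForMaskedLM.from_pretrained("camembert-base")
    >>> # CamemBERT uses <mask> as its mask token, like RoBERTa
    >>> inputs = tokenizer("Le camembert est <mask> !", return_tensors="pt")
    >>> logits = model(**inputs).logits
    >>> # locate the masked position and decode the most likely token for it
    >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id).nonzero()[0, 1]
    >>> print(tokenizer.decode([logits[0, mask_index].argmax().item()]))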
CamembertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertConfig
:members:
CamembertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
CamembertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertTokenizerFast
:members:
CamembertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertModel
:members:
CamembertForCausalLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForCausalLM
:members:
CamembertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMaskedLM
:members:
CamembertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForSequenceClassification
:members:
CamembertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForMultipleChoice
:members:
CamembertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForTokenClassification
:members:
CamembertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CamembertForQuestionAnswering
:members:
TFCamembertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertModel
:members:
TFCamembertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForMaskedLM
:members:
TFCamembertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForSequenceClassification
:members:
TFCamembertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForMultipleChoice
:members:
TFCamembertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForTokenClassification
:members:
TFCamembertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCamembertForQuestionAnswering
:members:

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
CTRL
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation
<https://arxiv.org/abs/1909.05858>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and
Richard Socher. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large corpus
of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
The abstract from the paper is the following:
*Large-scale language models show promising text generation capabilities, but users cannot easily control particular
aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning while
providing more explicit control over text generation. These codes also allow CTRL to predict which parts of the
training data are most likely given a sequence. This provides a potential method for analyzing large amounts of data
via model-based source attribution.*
Tips:
- CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__ for
more information.
- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
the left.
- CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as it can be
observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
`reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
this argument.
The original code can be found `here <https://github.com/salesforce/ctrl>`__.
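To make the control-code mechanism concrete, here is a minimal generation sketch; the ``ctrl`` checkpoint identifier and the choice of the ``Links`` control code are illustrative assumptions:

.. code-block::

    >>> from transformers import CTRLTokenizer, CTRLLMHeadModel
    >>> tokenizer = CTRLTokenizer.from_pretrained("ctrl")
    >>> model = CTRLLMHeadModel.from_pretrained("ctrl")
    >>> # the prompt starts with a control code ("Links") that steers the style of the generation
    >>> inputs = tokenizer("Links Hello, my dog is cute", return_tensors="pt")
    >>> generated = model.generate(**inputs, max_length=30)
    >>> print(tokenizer.decode(generated[0]))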
CTRLConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLConfig
:members:
CTRLTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLTokenizer
:members: save_vocabulary
CTRLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLModel
:members: forward
CTRLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLLMHeadModel
:members: forward
CTRLForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.CTRLForSequenceClassification
:members: forward
TFCTRLModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLModel
:members: call
TFCTRLLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLLMHeadModel
:members: call
TFCTRLForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFCTRLForSequenceClassification
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
DeBERTa
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
<https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is based on Google's
BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
RoBERTa.
The abstract from the paper is the following:
*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
disentangled attention mechanism, where each word is represented using two vectors that encode its content and
position, respectively, and the attention weights among words are computed using disentangled matrices on their
contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
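A minimal usage sketch for extracting contextual representations; the ``microsoft/deberta-base`` checkpoint identifier is an assumption (it is the base checkpoint published on the model hub):

.. code-block::

    >>> from transformers import DebertaTokenizer, DebertaModel
    >>> tokenizer = DebertaTokenizer.from_pretrained("microsoft/deberta-base")
    >>> model = DebertaModel.from_pretrained("microsoft/deberta-base")
    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    >>> outputs = model(**inputs)
    >>> # hidden states of shape (batch, sequence, hidden), computed with disentangled attention
    >>> last_hidden_states = outputs.last_hidden_state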
DebertaConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaConfig
:members:
DebertaTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
DebertaModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaModel
:members:
DebertaPreTrainedModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaPreTrainedModel
:members:
DebertaForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DebertaForSequenceClassification
:members:

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
DialoGPT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
DialoGPT was proposed in `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation
<https://arxiv.org/abs/1911.00536>`_ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao,
Jianfeng Gao, Jingjing Liu, Bill Dolan. It's a GPT2 Model trained on 147M conversation-like exchanges extracted from
Reddit.
The abstract from the paper is the following:
*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained
transformer). Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning
from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human
both in terms of automatic and human evaluation in single-turn dialogue settings. We show that conversational systems
that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline
systems. The pre-trained model and training pipeline are publicly released to facilitate research into neural response
generation and the development of more intelligent open-domain dialogue systems.*
Tips:
- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
than the left.
- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful
at response generation in open-domain dialogue systems.
- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card
<https://huggingface.co/microsoft/DialoGPT-medium>`_.
Training:
In order to train or fine-tune DialoGPT, one can use causal language modeling training. To cite the official paper: *We
follow the OpenAI GPT-2 to model a multiturn dialogue session as a long text and frame the generation task as language
modeling. We first concatenate all dialog turns within a dialogue session into a long text x_1,..., x_N (N is the
sequence length), ended by the end-of-text token.* For more information please refer to the original paper.
DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring
<https://huggingface.co/transformers/model_doc/gpt2.html>`_.
The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
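Putting the modeling recipe above into code, here is a minimal single-turn chat sketch with the ``microsoft/DialoGPT-medium`` checkpoint from the model card (the reply will vary):

.. code-block::

    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
    >>> model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
    >>> # each dialog turn is ended by the end-of-text token, matching the training setup
    >>> history = tokenizer.encode("Does money buy happiness?" + tokenizer.eos_token, return_tensors="pt")
    >>> reply_ids = model.generate(history, max_length=100, pad_token_id=tokenizer.eos_token_id)
    >>> # decode only the newly generated tokens, i.e. the bot's reply
    >>> print(tokenizer.decode(reply_ids[0, history.shape[-1]:], skip_special_tokens=True))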

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
DistilBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The DistilBERT model was proposed in the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__, and the paper `DistilBERT, a
distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__. DistilBERT is a
small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% fewer parameters than
`bert-base-uncased` and runs 60% faster, while preserving over 95% of BERT's performance as measured on the GLUE
language understanding benchmark.
The abstract from the paper is the following:
*As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP),
operating these large models in on-the-edge and/or under constrained computational training or inference budgets
remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
study.*
Tips:
- DistilBERT doesn't have :obj:`token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[SEP]`).
- DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
necessary though, just let us know if you need this option.
The original code can be found `here
<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__.
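The first tip can be checked directly: encoding a pair of segments yields no :obj:`token_type_ids`. A minimal sketch, assuming the ``distilbert-base-uncased`` checkpoint:

.. code-block::

    >>> from transformers import DistilBertTokenizer, DistilBertModel
    >>> tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    >>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
    >>> # the two segments are simply joined with the [SEP] token
    >>> inputs = tokenizer("Is this a question?", "Yes, it is.", return_tensors="pt")
    >>> sorted(inputs.keys())  # no token_type_ids
    ['attention_mask', 'input_ids']
    >>> outputs = model(**inputs)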
DistilBertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertConfig
:members:
DistilBertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertTokenizer
:members:
DistilBertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertTokenizerFast
:members:
DistilBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertModel
:members: forward
DistilBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForMaskedLM
:members: forward
DistilBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForSequenceClassification
:members: forward
DistilBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForMultipleChoice
:members: forward
DistilBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForTokenClassification
:members: forward
DistilBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DistilBertForQuestionAnswering
:members: forward
TFDistilBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertModel
:members: call
TFDistilBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForMaskedLM
:members: call
TFDistilBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForSequenceClassification
:members: call
TFDistilBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForMultipleChoice
:members: call
TFDistilBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForTokenClassification
:members: call
TFDistilBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDistilBertForQuestionAnswering
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
DPR
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.
The abstract from the paper is the following:
*Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional
sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can
be practically implemented using dense representations alone, where embeddings are learned from a small number of
questions and passages by a simple dual-encoder framework. When evaluated on a wide range of open-domain QA datasets,
our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% absolute in terms of top-20 passage
retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA
benchmarks.*
The original code can be found `here <https://github.com/facebookresearch/DPR>`__.
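A minimal sketch of the dual-encoder side, embedding a question into the dense retrieval space; the ``facebook/dpr-question_encoder-single-nq-base`` identifier is assumed to be the NQ-trained question encoder published on the model hub:

.. code-block::

    >>> from transformers import DPRQuestionEncoder, DPRQuestionEncoderTokenizer
    >>> mname = "facebook/dpr-question_encoder-single-nq-base"
    >>> tokenizer = DPRQuestionEncoderTokenizer.from_pretrained(mname)
    >>> model = DPRQuestionEncoder.from_pretrained(mname)
    >>> inputs = tokenizer("What is the capital of France?", return_tensors="pt")
    >>> # the pooled output is the dense vector compared against passage embeddings at retrieval time
    >>> embedding = model(**inputs).pooler_output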
DPRConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRConfig
:members:
DPRContextEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRContextEncoderTokenizer
:members:
DPRContextEncoderTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRContextEncoderTokenizerFast
:members:
DPRQuestionEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRQuestionEncoderTokenizer
:members:
DPRQuestionEncoderTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRQuestionEncoderTokenizerFast
:members:
DPRReaderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRReaderTokenizer
:members:
DPRReaderTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRReaderTokenizerFast
:members:
DPR specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.dpr.modeling_dpr.DPRContextEncoderOutput
:members:
.. autoclass:: transformers.models.dpr.modeling_dpr.DPRQuestionEncoderOutput
:members:
.. autoclass:: transformers.models.dpr.modeling_dpr.DPRReaderOutput
:members:
DPRContextEncoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRContextEncoder
:members: forward
DPRQuestionEncoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRQuestionEncoder
:members: forward
DPRReader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.DPRReader
:members: forward
TFDPRContextEncoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDPRContextEncoder
:members: call
TFDPRQuestionEncoder
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDPRQuestionEncoder
:members: call
TFDPRReader
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFDPRReader
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
ELECTRA
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The ELECTRA model was proposed in the paper `ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__. ELECTRA is a new pretraining approach which trains two
transformer models: the generator and the discriminator. The generator's role is to replace tokens in a sequence, and
is therefore trained as a masked language model. The discriminator, which is the model we're interested in, tries to
identify which tokens were replaced by the generator in the sequence.
The abstract from the paper is the following:
*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
and then train a model to reconstruct the original tokens. While they produce good results when transferred to
downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
rather than just the small subset that was masked out. As a result, the contextual representations learned by our
approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale,
where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when
using the same amount of compute.*
Tips:
- ELECTRA is the pretraining approach, therefore almost no changes are made to the underlying model (BERT). The
only change is the separation of the embedding size and the hidden size: the embedding size is generally smaller,
while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from their
embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no projection
layer is used.
- The ELECTRA checkpoints saved using `Google Research's implementation <https://github.com/google-research/electra>`__
contain both the generator and discriminator. The conversion script requires the user to name which model to export
into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all
available ELECTRA models, however. This means that the discriminator may be loaded in the
:class:`~transformers.ElectraForMaskedLM` model, and the generator may be loaded in the
:class:`~transformers.ElectraForPreTraining` model (the classification head will be randomly initialized as it
doesn't exist in the generator).
The original code can be found `here <https://github.com/google-research/electra>`__.
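The discriminator's replaced-token-detection head can be probed directly. A minimal sketch, assuming the ``google/electra-small-discriminator`` checkpoint:

.. code-block::

    >>> import torch
    >>> from transformers import ElectraTokenizer, ElectraForPreTraining
    >>> tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
    >>> model = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")
    >>> inputs = tokenizer("The quick brown fox jumps over the lazy dog", return_tensors="pt")
    >>> # one logit per token: a positive score means "predicted to be a replaced token"
    >>> logits = model(**inputs).logits
    >>> predictions = torch.round(torch.sigmoid(logits))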
ElectraConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraConfig
:members:
ElectraTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraTokenizer
:members:
ElectraTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraTokenizerFast
:members:
Electra specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.electra.modeling_electra.ElectraForPreTrainingOutput
:members:
.. autoclass:: transformers.models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
:members:
ElectraModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraModel
:members: forward
ElectraForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForPreTraining
:members: forward
ElectraForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForMaskedLM
:members: forward
ElectraForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForSequenceClassification
:members: forward
ElectraForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForMultipleChoice
:members: forward
ElectraForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForTokenClassification
:members: forward
ElectraForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.ElectraForQuestionAnswering
:members: forward
TFElectraModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraModel
:members: call
TFElectraForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForPreTraining
:members: call
TFElectraForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForMaskedLM
:members: call
TFElectraForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForSequenceClassification
:members: call
TFElectraForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForMultipleChoice
:members: call
TFElectraForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForTokenClassification
:members: call
TFElectraForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFElectraForQuestionAnswering
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Encoder Decoder Models
-----------------------------------------------------------------------------------------------------------------------
The :class:`~transformers.EncoderDecoderModel` can be used to initialize a sequence-to-sequence model with any
pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.
The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks
was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by
Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
After such an :class:`~transformers.EncoderDecoderModel` has been trained/fine-tuned, it can be saved/loaded just like
any other model (see the examples for more information).
An application of this architecture could be to leverage two pretrained :class:`~transformers.BertModel` instances as the encoder
and decoder for a summarization model as was shown in: `Text Summarization with Pretrained Encoders
<https://arxiv.org/abs/1908.08345>`__ by Yang Liu and Mirella Lapata.
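A minimal sketch of warm-starting such a model from two BERT checkpoints (the ``bert-base-uncased`` identifier is used for illustration); note that the decoder's cross-attention weights do not exist in the checkpoint and are freshly initialized, so the model needs fine-tuning before use:

.. code-block::

    >>> from transformers import EncoderDecoderModel
    >>> # encoder: pretrained BERT; decoder: pretrained BERT with cross-attention layers added
    >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
    >>> model.save_pretrained("bert2bert")
    >>> # after training/fine-tuning, it saves and reloads like any other model
    >>> model = EncoderDecoderModel.from_pretrained("bert2bert")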
EncoderDecoderConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.EncoderDecoderConfig
:members:
EncoderDecoderModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.EncoderDecoderModel
:members: forward, from_encoder_decoder_pretrained

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
FlauBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The FlauBERT model was proposed in the paper `FlauBERT: Unsupervised Language Model Pre-training for French
<https://arxiv.org/abs/1912.05372>`__ by Hang Le et al. It's a transformer model pretrained using a masked language
modeling (MLM) objective (like BERT).
The abstract from the paper is the following:
*Language models have become a key step to achieve state-of-the art results in many different Natural Language
Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient way
to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
contextualization at the sentence level. This has been widely demonstrated for English using contextualized
representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et al.,
2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large and
heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
community for further reproducible experiments in French NLP.*
The original code can be found `here <https://github.com/getalp/Flaubert>`__.
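A minimal sketch of extracting contextual representations for French text; the ``flaubert/flaubert_base_cased`` checkpoint identifier is an assumption (the FlauBERT checkpoints on the model hub follow this naming scheme):

.. code-block::

    >>> from transformers import FlaubertTokenizer, FlaubertModel
    >>> tokenizer = FlaubertTokenizer.from_pretrained("flaubert/flaubert_base_cased")
    >>> model = FlaubertModel.from_pretrained("flaubert/flaubert_base_cased")
    >>> inputs = tokenizer("Le chat mange une pomme.", return_tensors="pt")
    >>> # hidden states of shape (batch, sequence, hidden)
    >>> last_hidden_state = model(**inputs).last_hidden_state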
FlaubertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertConfig
:members:
FlaubertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertTokenizer
:members:
FlaubertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertModel
:members: forward
FlaubertWithLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertWithLMHeadModel
:members: forward
FlaubertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForSequenceClassification
:members: forward
FlaubertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForMultipleChoice
:members: forward
FlaubertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForTokenClassification
:members: forward
FlaubertForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForQuestionAnsweringSimple
:members: forward
FlaubertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FlaubertForQuestionAnswering
:members: forward
TFFlaubertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertModel
:members: call
TFFlaubertWithLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertWithLMHeadModel
:members: call
TFFlaubertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertForSequenceClassification
:members: call
TFFlaubertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertForMultipleChoice
:members: call
TFFlaubertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertForTokenClassification
:members: call
TFFlaubertForQuestionAnsweringSimple
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFlaubertForQuestionAnsweringSimple
:members: call

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
FSMT
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** If you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@stas00.
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
FSMT (FairSeq MachineTranslation) models were introduced in `Facebook FAIR's WMT19 News Translation Task Submission
<https://arxiv.org/abs/1907.06616>`__ by Nathan Ng, Kyra Yee, Alexei Baevski, Myle Ott, Michael Auli, Sergey Edunov.
The abstract of the paper is the following:
*This paper describes Facebook FAIR's submission to the WMT19 shared news translation task. We participate in two
language pairs and four language directions, English <-> German and English <-> Russian. Following our submission from
last year, our baseline systems are large BPE-based transformer models trained with the Fairseq sequence modeling
toolkit which rely on sampled back-translations. This year we experiment with different bitext data filtering schemes,
as well as with adding filtered back-translated data. We also ensemble and fine-tune our models on domain-specific
data, then decode using noisy channel model reranking. Our submissions are ranked first in all four directions of the
human evaluation campaign. On En->De, our system significantly outperforms other systems as well as human translations.
This system improves upon our WMT'18 submission by 4.5 BLEU points.*
The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/wmt19>`__.
Implementation Notes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- FSMT uses source and target vocabulary pairs that aren't combined into one, nor does it share embedding tokens
between them. Its tokenizer is very similar to :class:`~transformers.XLMTokenizer` and the main model is derived from
:class:`~transformers.BartModel`.
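A minimal translation sketch, assuming the ``facebook/wmt19-en-de`` checkpoint from the WMT19 submission:

.. code-block::

    >>> from transformers import FSMTTokenizer, FSMTForConditionalGeneration
    >>> mname = "facebook/wmt19-en-de"
    >>> tokenizer = FSMTTokenizer.from_pretrained(mname)
    >>> model = FSMTForConditionalGeneration.from_pretrained(mname)
    >>> inputs = tokenizer("Machine learning is great, isn't it?", return_tensors="pt")
    >>> outputs = model.generate(**inputs)
    >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))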
FSMTConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FSMTConfig
:members:
FSMTTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FSMTTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary
FSMTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FSMTModel
:members: forward
FSMTForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FSMTForConditionalGeneration
:members: forward

..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Funnel Transformer
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Funnel Transformer model was proposed in the paper `Funnel-Transformer: Filtering out Sequential Redundancy for
Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__. It is a bidirectional transformer model, like
BERT, but with a pooling operation after each block of layers, a bit like in traditional convolutional neural networks
(CNN) in computer vision.
The abstract from the paper is the following:
*With the success of language pretraining, it is highly desirable to develop more efficient architectures of good
scalability that can exploit the abundant unlabeled data at a lower cost. To improve the efficiency, we examine the
much-overlooked redundancy in maintaining a full-length token-level presentation, especially for tasks that only
require a single-vector presentation of the sequence. With this intuition, we propose Funnel-Transformer which
gradually compresses the sequence of hidden states to a shorter one and hence reduces the computation cost. More
importantly, by re-investing the saved FLOPs from length reduction in constructing a deeper or wider model, we further
improve the model capacity. In addition, to perform token-level predictions as required by common pretraining
objectives, Funnel-Transformer is able to recover a deep representation for each token from the reduced hidden sequence
via a decoder. Empirically, with comparable or fewer FLOPs, Funnel-Transformer outperforms the standard Transformer on
a wide variety of sequence-level prediction tasks, including text classification, language understanding, and reading
comprehension.*
Tips:
- Since Funnel Transformer uses pooling, the sequence length of the hidden states changes after each block of layers.
The base model therefore has a final sequence length that is a quarter of the original one. This model can be used
directly for tasks that just require a sentence summary (like sequence classification or multiple choice). For other
tasks, the full model is used; this full model has a decoder that upsamples the final hidden states to the same
sequence length as the input.
- The Funnel Transformer checkpoints are all available with a full version and a base version. The first ones should be
used for :class:`~transformers.FunnelModel`, :class:`~transformers.FunnelForPreTraining`,
:class:`~transformers.FunnelForMaskedLM`, :class:`~transformers.FunnelForTokenClassification` and
:class:`~transformers.FunnelForQuestionAnswering`. The second ones should be used for
:class:`~transformers.FunnelBaseModel`, :class:`~transformers.FunnelForSequenceClassification` and
:class:`~transformers.FunnelForMultipleChoice`.
The original code can be found `here <https://github.com/laiguokun/Funnel-Transformer>`__.
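To make the checkpoint tip above concrete, here is a minimal sketch contrasting the two flavors (it assumes the
``funnel-transformer/small`` and ``funnel-transformer/small-base`` checkpoints; the shape comments are indicative):

.. code-block::

    from transformers import FunnelTokenizer, FunnelModel, FunnelBaseModel

    tokenizer = FunnelTokenizer.from_pretrained("funnel-transformer/small")
    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")

    # Full model: the decoder upsamples the hidden states back to the input length.
    full = FunnelModel.from_pretrained("funnel-transformer/small")
    print(full(**inputs).last_hidden_state.shape)  # same sequence length as the input

    # Base model: no decoder, so the final sequence length is roughly a quarter of the input.
    base = FunnelBaseModel.from_pretrained("funnel-transformer/small-base")
    print(base(**inputs).last_hidden_state.shape)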
FunnelConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelConfig
:members:
FunnelTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
FunnelTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelTokenizerFast
:members:
Funnel specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.funnel.modeling_funnel.FunnelForPreTrainingOutput
:members:
.. autoclass:: transformers.models.funnel.modeling_tf_funnel.TFFunnelForPreTrainingOutput
:members:
FunnelBaseModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelBaseModel
:members: forward
FunnelModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelModel
:members: forward
FunnelForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForPreTraining
:members: forward
FunnelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForMaskedLM
:members: forward
FunnelForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForSequenceClassification
:members: forward
FunnelForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForMultipleChoice
:members: forward
FunnelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForTokenClassification
:members: forward
FunnelForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.FunnelForQuestionAnswering
:members: forward
TFFunnelBaseModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelBaseModel
:members: call
TFFunnelModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelModel
:members: call
TFFunnelForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForPreTraining
:members: call
TFFunnelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForMaskedLM
:members: call
TFFunnelForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForSequenceClassification
:members: call
TFFunnelForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForMultipleChoice
:members: call
TFFunnelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForTokenClassification
:members: call
TFFunnelForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFFunnelForQuestionAnswering
:members: call


@ -1,57 +1,146 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
OpenAI GPT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training
<https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional) transformer
pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.
The abstract from the paper is the following:
*Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
approach on a wide range of benchmarks for natural language understanding. Our general task-agnostic model outperforms
discriminatively trained models that use architectures specifically crafted for each task, significantly improving upon
the state of the art in 9 out of the 12 tasks studied.*
Tips:
- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
the left.
- GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
token in a sequence. Leveraging this feature allows GPT to generate syntactically coherent text, as can be
observed in the `run_generation.py` example script.
`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by Hugging Face
showcasing the generative capabilities of several models. GPT is one of them.
The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`__.
Note:
If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install ``ftfy``
and ``SpaCy``:
.. code-block:: bash
pip install spacy ftfy==4.4.3
python -m spacy download en
If you don't install ``ftfy`` and ``SpaCy``, the :class:`~transformers.OpenAIGPTTokenizer` will default to tokenize
using BERT's :obj:`BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
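As a minimal sketch of the CLM tip above, the snippet below generates a short continuation with
:class:`~transformers.OpenAIGPTLMHeadModel` (greedy decoding is used only to keep the example compact):

.. code-block::

    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")

    input_ids = tokenizer.encode("the history of nlp began", return_tensors="pt")
    # Greedy decoding; sampling can be enabled through the generate() arguments.
    output_ids = model.generate(input_ids, max_length=20, do_sample=False)
    print(tokenizer.decode(output_ids[0]))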
OpenAIGPTConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTConfig
:members:
OpenAIGPTTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTTokenizer
:members: save_vocabulary
OpenAIGPTTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTTokenizerFast
:members:
OpenAI specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
:members:
.. autoclass:: transformers.models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
:members:
OpenAIGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTModel
:members: forward
OpenAIGPTLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTLMHeadModel
:members: forward
OpenAIGPTDoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTDoubleHeadsModel
:members: forward
OpenAIGPTForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.OpenAIGPTForSequenceClassification
:members: forward
TFOpenAIGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTModel
:members: call
TFOpenAIGPTLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTLMHeadModel
:members: call
TFOpenAIGPTDoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
:members: call
TFOpenAIGPTForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFOpenAIGPTForSequenceClassification
:members: call


@ -1,57 +1,140 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
OpenAI GPT2
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
OpenAI GPT-2 model was proposed in `Language Models are Unsupervised Multitask Learners
<https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_ by Alec
Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional)
transformer pretrained using language modeling on a very large corpus of ~40 GB of text data.
The abstract from the paper is the following:
*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1] of 8 million
web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous words within some
text. The diversity of the dataset causes this simple goal to contain naturally occurring demonstrations of many tasks
across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than
10X the amount of data.*
Tips:
- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
the left.
- GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text, as can be
observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
`reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
this argument, and the sketch below for a minimal illustration.
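A minimal sketch of this caching pattern (in recent versions of the library the keyword is named
``past_key_values``):

.. code-block::

    import torch
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")

    input_ids = tokenizer.encode("Transformers are", return_tensors="pt")
    outputs = model(input_ids, use_cache=True)
    past = outputs.past_key_values  # cached key/value pairs for every layer

    # On the next step, only the newly sampled token needs to be fed to the model.
    next_token = torch.argmax(outputs.logits[:, -1:], dim=-1)
    outputs = model(next_token, past_key_values=past)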
`Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
different sizes: small, medium, large, xl and a distilled version of the small checkpoint: `distilgpt-2`.
The original code can be found `here <https://openai.com/blog/better-language-models/>`__.
GPT2Config
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Config
:members:
GPT2Tokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Tokenizer
:members: save_vocabulary
GPT2TokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2TokenizerFast
:members:
GPT2 specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput
:members:
.. autoclass:: transformers.models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
:members:
GPT2Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2Model
:members: forward, parallelize, deparallelize
GPT2LMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2LMHeadModel
:members: forward, parallelize, deparallelize
GPT2DoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2DoubleHeadsModel
:members: forward
GPT2ForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.GPT2ForSequenceClassification
:members: forward
TFGPT2Model
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2Model
:members: call
TFGPT2LMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2LMHeadModel
:members: call
TFGPT2DoubleHeadsModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2DoubleHeadsModel
:members: call
TFGPT2ForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFGPT2ForSequenceClassification
:members: call
TFSequenceClassifierOutputWithPast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutputWithPast
:members:


@ -0,0 +1,71 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
herBERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The herBERT model was proposed in `KLEJ: Comprehensive Benchmark for Polish Language Understanding
<https://www.aclweb.org/anthology/2020.acl-main.111.pdf>`__ by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, and
Ireneusz Gawlik. It is a BERT-based language model trained on Polish corpora, using only the MLM objective with
dynamic whole-word masking.
The abstract from the paper is the following:
*In recent years, a series of Transformer-based models unlocked major improvements in general natural language
understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which
allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of
languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language
understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing
datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new
sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and
promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and
applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language,
which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an
extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based
models.*
Examples of use:
.. code-block::
from transformers import HerbertTokenizer, RobertaModel
tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd to jasne.", return_tensors='pt')
outputs = model(encoded_input)
# HerBERT can also be loaded using AutoTokenizer and AutoModel:
from transformers import AutoModel, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
The original code can be found `here <https://github.com/allegro/HerBERT>`__.
HerbertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.HerbertTokenizer
:members:
HerbertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.HerbertTokenizerFast
:members:


@ -0,0 +1,132 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
LayoutLM
-----------------------------------------------------------------------------------------------------------------------
.. _Overview:
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
information extraction tasks, such as form understanding and receipt understanding. It obtains state-of-the-art results
on several downstream tasks:
- form understanding: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a collection of 199 annotated
forms comprising more than 30,000 words).
- receipt understanding: the `SROIE <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for
training and 347 receipts for testing).
- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
400,000 images belonging to one of 16 classes).
The abstract from the paper is the following:
*Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
the LayoutLM to jointly model interactions between text and layout information across scanned document images, which is
beneficial for a great number of real-world document image understanding tasks such as information extraction from
scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM.
To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for
document-level pretraining. It achieves new state-of-the-art results in several downstream tasks, including form
understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification
(from 93.07 to 94.42).*
Tips:
- In addition to `input_ids`, :meth:`~transformers.LayoutLMModel.forward` also expects the input :obj:`bbox`, which are
the bounding boxes (i.e. 2D-positions) of the input tokens. These can be obtained using an external OCR engine such
as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python wrapper
<https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1) format, where
(x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1) represents the
position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on a 0-1000
scale. To normalize, you can use the following function:
.. code-block::
def normalize_bbox(bbox, width, height):
return [
int(1000 * (bbox[0] / width)),
int(1000 * (bbox[1] / height)),
int(1000 * (bbox[2] / width)),
int(1000 * (bbox[3] / height)),
]
Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
occurs. Those can be obtained using the Python Imaging Library (PIL), for example, as follows:
.. code-block::
from PIL import Image
image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
width, height = image.size
- For a demo which shows how to fine-tune :class:`LayoutLMForTokenClassification` on the `FUNSD dataset
<https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb>`__.
It includes an inference part, which shows how to use Google's Tesseract on a new document.
The original code can be found `here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.
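Putting the pieces above together, here is a minimal sketch of a forward pass with normalized boxes (the words and
coordinates are made up; in practice they would come from an OCR engine):

.. code-block::

    import torch
    from transformers import LayoutLMTokenizer, LayoutLMModel

    tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
    model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")

    words = ["Hello", "world"]
    boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # already on the 0-1000 scale

    # Repeat each word-level box for every wordpiece, and add boxes for [CLS] and [SEP].
    token_boxes = []
    for word, box in zip(words, boxes):
        token_boxes.extend([box] * len(tokenizer.tokenize(word)))
    token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

    encoding = tokenizer(" ".join(words), return_tensors="pt")
    outputs = model(input_ids=encoding["input_ids"], bbox=torch.tensor([token_boxes]))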
LayoutLMConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMConfig
:members:
LayoutLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMTokenizer
:members:
LayoutLMTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMTokenizerFast
:members:
LayoutLMModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMModel
:members:
LayoutLMForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMForMaskedLM
:members:
LayoutLMForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMForSequenceClassification
:members:
LayoutLMForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LayoutLMForTokenClassification
:members:


@ -0,0 +1,149 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
LED
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The LED model was proposed in `Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz
Beltagy, Matthew E. Peters, Arman Cohan.
The abstract from the paper is the following:
*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting
long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization
dataset.*
Tips:
- :class:`~transformers.LEDForConditionalGeneration` is an extension of
:class:`~transformers.BartForConditionalGeneration` exchanging the traditional *self-attention* layer with
*Longformer*'s *chunked self-attention* layer. :class:`~transformers.LEDTokenizer` is an alias of
:class:`~transformers.BartTokenizer`.
- LED works very well on long-range *sequence-to-sequence* tasks where the ``input_ids`` largely exceed a length of
1024 tokens.
- LED pads the ``input_ids`` to be a multiple of ``config.attention_window`` if required. Therefore a small speed-up is
gained when :class:`~transformers.LEDTokenizer` is used with the ``pad_to_multiple_of`` argument.
- LED makes use of *global attention* by means of the ``global_attention_mask`` (see
:class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question (see the
sketch after this list).
- To fine-tune LED on all 16384 tokens, it is necessary to enable *gradient checkpointing* by setting
``config.gradient_checkpointing = True``.
- A notebook showing how to evaluate LED can be accessed `here
<https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
- A notebook showing how to fine-tune LED can be accessed `here
<https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing>`__.
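A minimal summarization sketch following the global attention advice above (it assumes the ``allenai/led-base-16384``
checkpoint):

.. code-block::

    import torch
    from transformers import LEDTokenizer, LEDForConditionalGeneration

    tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
    model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

    inputs = tokenizer("Replace this with a (very) long article ...", return_tensors="pt")
    # Global attention on the first (<s>) token only, as advised for summarization.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    summary_ids = model.generate(inputs["input_ids"], global_attention_mask=global_attention_mask,
                                 num_beams=4, max_length=64)
    print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))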
LEDConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDConfig
:members:
LEDTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDTokenizer
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
LEDTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDTokenizerFast
:members: build_inputs_with_special_tokens, get_special_tokens_mask,
create_token_type_ids_from_sequences, save_vocabulary
LED specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.led.modeling_led.LEDEncoderBaseModelOutput
:members:
.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqModelOutput
:members:
.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqLMOutput
:members:
.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqSequenceClassifierOutput
:members:
.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqQuestionAnsweringModelOutput
:members:
.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDEncoderBaseModelOutput
:members:
.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqModelOutput
:members:
.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqLMOutput
:members:
LEDModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDModel
:members: forward
LEDForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDForConditionalGeneration
:members: forward
LEDForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDForSequenceClassification
:members: forward
LEDForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LEDForQuestionAnswering
:members: forward
TFLEDModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLEDModel
:members: call
TFLEDForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLEDForConditionalGeneration
:members: call


@ -0,0 +1,238 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Longformer
-----------------------------------------------------------------------------------------------------------------------
**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The Longformer model was presented in `Longformer: The Long-Document Transformer
<https://arxiv.org/pdf/2004.05150.pdf>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
The abstract from the paper is the following:
*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
WikiHop and TriviaQA.*
Tips:
- Since the Longformer is based on RoBERTa, it doesn't have :obj:`token_type_ids`. You don't need to indicate which
token belongs to which segment. Just separate your segments with the separation token :obj:`tokenizer.sep_token` (or
:obj:`</s>`).
The Authors' code can be found `here <https://github.com/allenai/longformer>`__.
Longformer Self Attention
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only
attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and
:math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in
:obj:`config.attention_window`. Note that :obj:`config.attention_window` can be of type :obj:`List` to define a
different :math:`w` for each layer. A selected few tokens attend "globally" to all other tokens, as it is
conventionally done for all tokens in :obj:`BertSelfAttention`.
Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices. Also note
that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally"
attending tokens so that global attention is *symmetric*.
The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor
:obj:`global_attention_mask` at run-time appropriately. All Longformer models employ the following logic for
:obj:`global_attention_mask`:
- 0: the token attends "locally",
- 1: the token attends "globally".
For more information, please also refer to the :meth:`~transformers.LongformerModel.forward` method.
Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually
represents the memory and time bottleneck, can be reduced from :math:`\mathcal{O}(n_s \times n_s)` to
:math:`\mathcal{O}(n_s \times w)`, with :math:`n_s` being the sequence length and :math:`w` being the average window
size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of
"locally" attending tokens.
For more information, please refer to the official `paper <https://arxiv.org/pdf/2004.05150.pdf>`__.
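A minimal sketch of setting :obj:`global_attention_mask` at run-time (here only the first token attends globally; it
assumes the ``allenai/longformer-base-4096`` checkpoint):

.. code-block::

    import torch
    from transformers import LongformerTokenizer, LongformerModel

    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
    # 0 -> the token attends locally, 1 -> the token attends globally.
    global_attention_mask = torch.zeros_like(inputs["input_ids"])
    global_attention_mask[:, 0] = 1

    outputs = model(**inputs, global_attention_mask=global_attention_mask)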
Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
:class:`~transformers.LongformerForMaskedLM` is trained the exact same way :class:`~transformers.RobertaForMaskedLM` is
trained and should be used as follows:
.. code-block::

    from transformers import LongformerForMaskedLM, LongformerTokenizer

    tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
    model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")
    # The tokenizer is RoBERTa-based, so the mask token is <mask>, not [MASK].
    input_ids = tokenizer.encode("This is a sentence from <mask> training data", return_tensors="pt")
    mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
    loss = model(input_ids, labels=mlm_labels)[0]
LongformerConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerConfig
:members:
LongformerTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerTokenizer
:members:
LongformerTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerTokenizerFast
:members:
Longformer specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutputWithPooling
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMaskedLMOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerQuestionAnsweringModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMultipleChoiceModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerTokenClassifierOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutputWithPooling
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMaskedLMOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerQuestionAnsweringModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerSequenceClassifierOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMultipleChoiceModelOutput
:members:
.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput
:members:
LongformerModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerModel
:members: forward
LongformerForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerForMaskedLM
:members: forward
LongformerForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerForSequenceClassification
:members: forward
LongformerForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerForMultipleChoice
:members: forward
LongformerForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerForTokenClassification
:members: forward
LongformerForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LongformerForQuestionAnswering
:members: forward
TFLongformerModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerModel
:members: call
TFLongformerForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerForMaskedLM
:members: call
TFLongformerForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerForQuestionAnswering
:members: call
TFLongformerForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerForSequenceClassification
:members: call
TFLongformerForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerForTokenClassification
:members: call
TFLongformerForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLongformerForMultipleChoice
:members: call


@ -0,0 +1,127 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
LXMERT
-----------------------------------------------------------------------------------------------------------------------
Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Representations from Transformers
<https://arxiv.org/abs/1908.07490>`__ by Hao Tan & Mohit Bansal. It is a series of bidirectional transformer encoders
(one for the vision modality, one for the language modality, and then one to fuse both modalities) pretrained using a
combination of masked language modeling, visual-language text alignment, ROI-feature regression, masked
visual-attribute modeling, masked visual-object modeling, and visual-question answering objectives. The pretraining
consists of multiple multi-modal datasets: MSCOCO, Visual-Genome + Visual-Genome Question Answering, VQA 2.0, and GQA.
The abstract from the paper is the following:
*Vision-and-language reasoning requires an understanding of visual concepts, language semantics, and, most importantly,
the alignment and relationships between these two modalities. We thus propose the LXMERT (Learning Cross-Modality
Encoder Representations from Transformers) framework to learn these vision-and-language connections. In LXMERT, we
build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
pretrained cross-modality model by adapting it to a challenging visual-reasoning task, NLVR, and improve the previous
best result by 22% absolute (54% to 76%). Lastly, we demonstrate detailed ablation studies to prove that both our novel
model components and pretraining strategies significantly contribute to our strong results; and also present several
attention visualizations for the different encoders*
Tips:
- Bounding boxes are not required for the visual feature embeddings; any kind of visual-spatial features
will work.
- Both the language hidden states and the visual hidden states that LXMERT outputs are passed through the
cross-modality layer, so they contain information from both modalities. To access a modality that only attends to
itself, select the vision/language hidden states from the first input in the tuple.
- The bidirectional cross-modality encoder attention only returns attention values when the language modality is used
as the input and the vision modality is used as the context vector. Further, while the cross-modality encoder
contains self-attention for each respective modality and cross-attention, only the cross attention is returned and
both self attention outputs are disregarded.
The original code can be found `here <https://github.com/airsplay/lxmert>`__.
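Here is a minimal sketch of a forward pass; the visual inputs below are random placeholders, whereas in practice they
would come from an external object detector (e.g. a Faster R-CNN producing 2048-dimensional ROI features):

.. code-block::

    import torch
    from transformers import LxmertTokenizer, LxmertModel

    tokenizer = LxmertTokenizer.from_pretrained("unc-nlp/lxmert-base-uncased")
    model = LxmertModel.from_pretrained("unc-nlp/lxmert-base-uncased")

    inputs = tokenizer("How many cats are on the sofa?", return_tensors="pt")
    # Placeholder visual inputs: 36 ROI features (2048-d) and their normalized boxes (4-d).
    visual_feats = torch.randn(1, 36, 2048)
    visual_pos = torch.rand(1, 36, 4)

    outputs = model(**inputs, visual_feats=visual_feats, visual_pos=visual_pos)
    # Both hidden states have passed through the cross-modality layer.
    print(outputs.language_output.shape, outputs.vision_output.shape)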
LxmertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertConfig
:members:
LxmertTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertTokenizer
:members:
LxmertTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertTokenizerFast
:members:
Lxmert specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertModelOutput
:members:
.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForPreTrainingOutput
:members:
.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForQuestionAnsweringOutput
:members:
.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertModelOutput
:members:
.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
:members:
LxmertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertModel
:members: forward
LxmertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertForPreTraining
:members: forward
LxmertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.LxmertForQuestionAnswering
:members: forward
TFLxmertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLxmertModel
:members: call
TFLxmertForPreTraining
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TFLxmertForPreTraining
:members: call
