Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)

Compare commits: v4.12.0...state_spac (150 Commits)
SHA1

8d2d4b1722
faacd74729
254fef67cf
c468a87a69
6fc38adff2
8332327dca
2bd950ca47
cea17acd8c
25156eb296
4ee0b755bd
ebbe8cc3fe
69511cdcae
2318bf77eb
c15f4f203f
1bbd6fcdeb
04683c0659
d1fd64e7aa
3772af49ce
f2e90bcb8f
6c4d688ffa
956a483173
69e16abf98
0b7d053c13
204d251310
11f65d4158
e0e2da1194
a4553e6c64
1a92bc5788
c9d2cf855a
a59e7c1ed4
81fe8afaac
f25a9332e8
0490b98877
331c3d2aa0
efea0f868b
72a6bf33c0
83ef8bcac2
da36c557f7
d83b0e0c07
08816de16a
01f8e639d3
1991da07f7
754202de4f
7544efc92e
c6c075544d
a2864a50e7
700a748fe6
b567510cff
888fb21159
a33168aa78
3e8d17e66d
040fd47162
1cc453d33c
76d0d41e51
9fd937ead1
a67d47b40c
74e6111ba7
4ce74edf51
267867e851
29dfb2dbb1
790cdc2e55
2e60276b38
3165930402
280a811ecb
55f49c5f4b
21546e59a6
ed5d15518b
77262ef750
3d607df8f4
7f20bf0d43
4c35c8d89c
b1dbdf22ef
e92190c0f8
1c76a51615
9e37c5cdf8
bec02ff209
2b0d9389f8
ea163d0948
5c153079e2
321eb56222
46d0cdae40
e81d8d7fa9
e4d8f517b9
85a4bda4f4
babd0b9a5e
4f24058c58
be4a6c64dc
6326aa4bf0
95b3ec3bc9
a503012275
d0e96c6de6
dfb00bf644
de635af3f1
a3ded170e2
9b78b070ef
df1f94eb4a
e30078b544
b48faae364
c016dbdbda
34307bb358
24b30d4d2f
843c326ee1
08a5f57567
4be78c22c9
a14d62b0b1
b90a48f654
fd8136fa75
d29baf69bb
68427c9beb
1a674ce679
f0d6e952c0
a1c15ea855
1149243184
2c8957feea
dec759e7e8
27b1516d32
671569ddf7
89766b3d44
bd21ed4099
5f789a687a
558f8543ba
519a677e87
bbaa3effbd
ad3e560bc7
ce01122a3b
4a394cf53f
a767276fdd
e20faa6f03
8b32578119
ce91bf9a34
70d5711848
33fb98338e
999540dfe0
323f28dce2
7396095af7
9450bfcc6c
9fc1951711
513fa30a63
63d91f449c
e823d8198a
b338596346
c28bc80bbb
4ab6a4a086
dc540dd316
d37f1fb8ba
5b45422b58
be236361f1
4469010c1b
ba71f1b57f
b8fad022a0
@@ -65,7 +65,7 @@ jobs:
run_tests_torch_and_tf:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
- image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
RUN_PT_TF_CROSS_TESTS: yes
@@ -81,7 +81,8 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -117,7 +118,8 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -148,7 +150,7 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -184,7 +186,7 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -214,7 +216,7 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -249,7 +251,7 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -277,7 +279,8 @@ jobs:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -310,7 +313,8 @@ jobs:
- v0.4-tf-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -401,8 +405,8 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -437,8 +441,8 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- save_cache:
key: v0.4-torch-{{ checksum "setup.py" }}
paths:
@@ -468,6 +472,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -502,6 +507,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
- run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -753,7 +759,8 @@ jobs:
- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
- run: pip install --upgrade pip
- run: pip install ."[docs]"
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
- run: pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
- save_cache:
key: v0.4-build_doc-{{ checksum "setup.py" }}
paths:
@@ -800,7 +807,6 @@ jobs:
- v0.4-code_quality-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install isort GitPython
- run: pip install .[all,quality]
- save_cache:
key: v0.4-code_quality-{{ checksum "setup.py" }}
@@ -811,6 +817,27 @@ jobs:
- run: python utils/custom_init_isort.py --check_only
- run: flake8 examples tests src utils
- run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only

check_repository_consistency:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
resource_class: large
environment:
TRANSFORMERS_IS_CI: yes
parallelism: 1
steps:
- checkout
- restore_cache:
keys:
- v0.4-repository_consistency-{{ checksum "setup.py" }}
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[all,quality]
- save_cache:
key: v0.4-repository_consistency-{{ checksum "setup.py" }}
paths:
- '~/.cache/pip'
- run: python utils/check_copies.py
- run: python utils/check_table.py
- run: python utils/check_dummies.py
@@ -819,17 +846,6 @@ jobs:
- run: make deps_table_check_updated
- run: python utils/tests_fetcher.py --sanity_check

check_repository_consistency:
working_directory: ~/transformers
docker:
- image: circleci/python:3.6
resource_class: small
parallelism: 1
steps:
- checkout
- run: pip install requests
- run: python ./utils/link_tester.py

run_tests_layoutlmv2:
working_directory: ~/transformers
docker:
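The recurring change in the hunks above pins the torch-scatter wheel index to the torch build the CI installs (torch 1.9.0+cpu to 1.10.0+cpu). As a rough illustration of that coupling, and not part of the diff, here is a minimal sketch that derives the matching wheel index URL from the locally installed torch; the `wheel_index_url` helper is a made-up name for this example.

```python
# Hedged sketch: derive the torch-scatter wheel index matching the installed torch.
# Assumes torch is importable; "wheel_index_url" is illustrative, not a real API.
import torch


def wheel_index_url() -> str:
    # torch.__version__ looks like "1.10.0+cpu" or "1.10.0+cu113";
    # torch.version.cuda is None for CPU-only builds.
    base_version = torch.__version__.split("+")[0]
    cuda = torch.version.cuda  # e.g. "11.3" or None
    suffix = "cpu" if cuda is None else "cu" + cuda.replace(".", "")
    return f"https://pytorch-geometric.com/whl/torch-{base_version}+{suffix}.html"


if __name__ == "__main__":
    # e.g. pip install torch-scatter -f <printed URL>
    print(wheel_index_url())
```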
@@ -76,4 +76,9 @@ deploy_doc "28e2787" v4.10.1
deploy_doc "dc193c9" v4.11.0
deploy_doc "54f9d62" v4.11.1
deploy_doc "7655f11" v4.11.2
deploy_doc "65659a2" # v4.11.3 Latest stable release
deploy_doc "65659a2" v4.11.3
deploy_doc "62bf536" v4.12.0
deploy_doc "e0a5154" v4.12.1
deploy_doc "2191373" v4.12.2
deploy_doc "527c763" v4.12.4
deploy_doc "ef3cec0" # v4.12.5 Latest stable release
.github/workflows/self-nightly-scheduled.yml (vendored, 7 changes)
@@ -205,8 +205,9 @@ jobs:
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
pip install .[testing,deepspeed,fairscale]
pip install git+https://github.com/microsoft/DeepSpeed
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,fairscale]
pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge

- name: Are GPUs recognized by our DL frameworks
run: |
@@ -218,7 +219,7 @@ jobs:
- name: Run all tests on GPU
run: |
python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended


- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
.github/workflows/self-push.yml (vendored, 13 changes)
@@ -50,7 +50,7 @@ jobs:
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"


- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -105,7 +105,7 @@ jobs:
run: |
python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"


- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -203,7 +203,7 @@ jobs:
apt install -y libsndfile1-dev
pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]


- name: Launcher docker
uses: actions/checkout@v2
with:
@@ -277,7 +277,7 @@ jobs:
# run: |
# python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
# python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
#
#
# - name: Fetch the tests to run
# run: |
# python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
@@ -389,11 +389,11 @@ jobs:
python -c "import torch; print('Cuda version:', torch.version.cuda)"
python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"


- name: Fetch the tests to run
run: |
python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt


- name: Report fetched tests
uses: actions/upload-artifact@v2
with:
@@ -437,6 +437,7 @@ jobs:
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,deepspeed,fairscale]

- name: Are GPUs recognized by our DL frameworks
.github/workflows/self-scheduled.yml (vendored, 5 changes)
@@ -143,7 +143,7 @@ jobs:
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]

- name: Are GPUs recognized by our DL frameworks
run: |
@@ -293,7 +293,7 @@ jobs:
run: |
apt -y update && apt install -y libsndfile1-dev git
pip install --upgrade pip
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]
pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]

- name: Are GPUs recognized by our DL frameworks
run: |
@@ -429,6 +429,7 @@ jobs:
run: |
apt -y update && apt install -y libaio-dev
pip install --upgrade pip
rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
pip install .[testing,deepspeed,fairscale]

- name: Are GPUs recognized by our DL frameworks
@@ -273,9 +273,11 @@ Follow these steps to start contributing:
- If you are adding a new tokenizer, write tests, and make sure
`RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
CircleCI does not run the slow tests, but github actions does every night!
6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an
6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_bert.py` for an
example.

See more about the checks run on a pull request in our [PR guide](https://huggingface.co/transformers/master/pr_tests.html)

### Tests

An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
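Point 6 in the hunk above swaps the docstring reference file from `modeling_ctrl.py` to `modeling_bert.py`. For orientation only, here is a minimal, hypothetical sketch of the Sphinx-friendly rst docstring style used in files like `modeling_bert.py`; the class and argument names are illustrative, not taken from the diff.

```python
# Hedged sketch of a Sphinx/rst-style docstring in the spirit of modeling_bert.py.
# "ToyModel" and its signature are illustrative placeholders.
import torch


class ToyModel(torch.nn.Module):
    def forward(self, input_ids: torch.LongTensor) -> torch.Tensor:
        r"""
        Runs the toy forward pass.

        Args:
            input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            :obj:`torch.Tensor`: A tensor of the same shape as :obj:`input_ids`.
        """
        return input_ids.float()
```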
Makefile (10 changes)
@@ -31,9 +31,9 @@ deps_table_check_updated:

autogenerate_code: deps_table_update

# Check that source code meets quality standards
# Check that the repo is in a good state

extra_quality_checks:
repo-consistency:
python utils/check_copies.py
python utils/check_table.py
python utils/check_dummies.py
@@ -42,12 +42,13 @@ extra_quality_checks:
python utils/tests_fetcher.py --sanity_check

# this target runs checks on all files

quality:
black --check $(check_dirs)
isort --check-only $(check_dirs)
python utils/custom_init_isort.py --check_only
flake8 $(check_dirs)
${MAKE} extra_quality_checks
python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only

# Format source code automatically and check is there are any problems left that need manual fixing

@@ -56,6 +57,7 @@ extra_style_checks:
python utils/style_doc.py src/transformers docs/source --max_len 119

# this target runs checks on all files and potentially modifies some of them

style:
black $(check_dirs)
isort $(check_dirs)
@@ -64,7 +66,7 @@ style:

# Super fast fix and check target that only works on relevant modified files since the branch was made

fixup: modified_only_fixup extra_style_checks autogenerate_code extra_quality_checks
fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

# Make marked copies of snippets of codes conform to the original

@@ -249,6 +249,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/transformers/master/model_doc/imagegpt.html)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
@@ -267,6 +268,7 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/transformers/model_doc/qdqbert.html)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -247,6 +247,7 @@ Flax, PyTorch, TensorFlow: installing these with conda (see the installation pages)
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/transformers/master/model_doc/imagegpt.html)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
@@ -265,6 +266,7 @@ Flax, PyTorch, TensorFlow: installing these with conda (see the installation pages)
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/transformers/model_doc/qdqbert.html)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -271,6 +271,7 @@ conda install -c huggingface transformers
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/transformers/master/model_doc/imagegpt.html)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
@@ -289,6 +290,7 @@ conda install -c huggingface transformers
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/transformers/model_doc/qdqbert.html)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -283,6 +283,7 @@ conda install -c huggingface transformers
1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released with the paper [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
1. **[ImageGPT](https://huggingface.co/transformers/master/model_doc/imagegpt.html)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
@@ -301,6 +302,7 @@ conda install -c huggingface transformers
1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/transformers/model_doc/qdqbert.html)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -1,10 +1,11 @@
// These two things need to be updated at each release for the version selector.
// Last stable version
const stableVersion = "v4.11.3"
const stableVersion = "v4.12.5"
// Dictionary doc folder to label. The last stable version should have an empty key.
const versionMapping = {
"master": "master",
"": "v4.11.0/v4.11.1/v4.11.2/v4.11.3 (stable)",
"": "v4.12.0/v4.12.1/v4.12.2/v4.12.4/v4.12.5 (stable)",
"v4.11.3": "v4.11.0/v4.11.1/v4.11.2/v4.11.3",
"v4.10.1": "v4.10.0/v4.10.1",
"v4.9.2": "v4.9.0/v4.9.1/v4.9.2",
"v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
@@ -29,23 +29,23 @@ Start by inheriting the base class :obj:`Pipeline`. with the 4 methods needed to
    from transformers import Pipeline

    class MyPipeline(Pipeline):
        def _sanitize_parameters(self, **kwargs)
        def _sanitize_parameters(self, **kwargs):
            preprocess_kwargs = {}
            if "maybe_arg" in kwargs:
                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
            return preprocess_kwargs, {}, {}

        def preprocess(self, inputs, maybe_arg=2)
        def preprocess(self, inputs, maybe_arg=2):
            model_input = Tensor(....)
            return {"model_input": model_input}

        def _forward(self, model_inputs)
        def _forward(self, model_inputs):
            # model_inputs == {"model_input": model_input}
            oututs = self.model(**model_inputs)
            outputs = self.model(**model_inputs)
            # Maybe {"logits": Tensor(...)}
            return outputs

        def postprocess(self, model_outputs)
        def postprocess(self, model_outputs):
            best_class = model_outputs["logits"].softmax(-1)
            return best_class

@@ -89,12 +89,12 @@ In order to achieve that, we'll update our :obj:`postprocess` method with a defa
    .. code-block::


        def postprocess(self, model_outputs, top_k=5)
        def postprocess(self, model_outputs, top_k=5):
            best_class = model_outputs["logits"].softmax(-1)
            # Add logic to handle top_k
            return best_class

        def _sanitize_parameters(self, **kwargs)
        def _sanitize_parameters(self, **kwargs):
            preprocess_kwargs = {}
            if "maybe_arg" in kwargs:
                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
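The two hunks above fix the missing colons and the `oututs` typo in the custom-pipeline example and show how call-time arguments such as `top_k` are routed through `_sanitize_parameters`. As a rough, framework-free sketch of that dispatch (an assumption about the flow, not code from the diff; the real `Pipeline.__call__` in transformers does more), the driver below chains the four methods and splits keyword arguments into preprocess/forward/postprocess dictionaries:

```python
# Hedged sketch of the preprocess -> _forward -> postprocess dispatch described above.
# "TinyPipeline" and its methods are illustrative stand-ins, not the transformers API.
class TinyPipeline:
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs, forward_kwargs, postprocess_kwargs = {}, {}, {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        if "top_k" in kwargs:
            postprocess_kwargs["top_k"] = kwargs["top_k"]
        return preprocess_kwargs, forward_kwargs, postprocess_kwargs

    def preprocess(self, inputs, maybe_arg=2):
        return {"model_input": [len(inputs), maybe_arg]}

    def _forward(self, model_inputs):
        # A real pipeline would call self.model(**model_inputs) here.
        return {"logits": model_inputs["model_input"]}

    def postprocess(self, model_outputs, top_k=5):
        return sorted(model_outputs["logits"], reverse=True)[:top_k]

    def __call__(self, inputs, **kwargs):
        pre_kwargs, fwd_kwargs, post_kwargs = self._sanitize_parameters(**kwargs)
        model_inputs = self.preprocess(inputs, **pre_kwargs)
        model_outputs = self._forward(model_inputs, **fwd_kwargs)
        return self.postprocess(model_outputs, **post_kwargs)


print(TinyPipeline()("some text", top_k=1))  # -> [9]
```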
@@ -27,7 +27,8 @@ author = "huggingface"
# The short X.Y version
version = ""
# The full version, including alpha/beta/rc tags
release = "4.12.0"
release = "4.13.0.dev0"
(File diff suppressed because it is too large.)
@@ -213,139 +213,145 @@ Supported models
Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
38. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
<https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
39. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
39. `ImageGPT <https://huggingface.co/transformers/master/model_doc/imagegpt.html>`__ (from OpenAI) released with the
paper `Generative Pretraining from Pixels <https://openai.com/blog/image-gpt/>`__ by Mark Chen, Alec Radford, Rewon
Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
40. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
40. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
41. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
Multi-modal Pre-training for Visually-Rich Document Understanding <https://arxiv.org/abs/2012.14740>`__ by Yang Xu,
Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min
Zhang, Lidong Zhou.
41. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
42. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
Multimodal Pre-training for Multilingual Visually-rich Document Understanding <https://arxiv.org/abs/2104.08836>`__
by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
42. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
43. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
<https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
43. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
44. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
44. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
45. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
45. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
46. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
by Hao Tan and Mohit Bansal.
46. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
47. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma,
Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal,
Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
47. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
48. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
Translator Team.
48. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
49. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
49. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
50. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
50. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
51. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
51. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
52. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
52. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
53. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
Jianfeng Lu, Tie-Yan Liu.
53. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
54. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
54. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
55. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
Mohammad Saleh and Peter J. Liu.
55. :doc:`PhoBERT <model_doc/phobert>` (from VinAI Research) released with the paper `PhoBERT: Pre-trained language
56. :doc:`PhoBERT <model_doc/phobert>` (from VinAI Research) released with the paper `PhoBERT: Pre-trained language
models for Vietnamese <https://www.aclweb.org/anthology/2020.findings-emnlp.92/>`__ by Dat Quoc Nguyen and Anh Tuan
Nguyen.
56. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
57. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
57. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
58. :doc:`QDQBert <model_doc/qdqbert>` (from NVIDIA) released with the paper `Integer Quantization for Deep Learning
Inference: Principles and Empirical Evaluation <https://arxiv.org/abs/2004.09602>`__ by Hao Wu, Patrick Judd,
Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
59. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
58. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
60. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
pre-trained language models <https://arxiv.org/pdf/2010.12821.pdf>`__ by Hyung Won Chung, Thibault Févry, Henry
Tsai, M. Johnson, Sebastian Ruder.
59. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
61. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
60. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
62. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
61. :doc:`SegFormer <model_doc/segformer>` (from NVIDIA) released with the paper `SegFormer: Simple and Efficient
63. :doc:`SegFormer <model_doc/segformer>` (from NVIDIA) released with the paper `SegFormer: Simple and Efficient
Design for Semantic Segmentation with Transformers <https://arxiv.org/abs/2105.15203>`__ by Enze Xie, Wenhai Wang,
Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
62. :doc:`SEW <model_doc/sew>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised
64. :doc:`SEW <model_doc/sew>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised
Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu
Han, Kilian Q. Weinberger, Yoav Artzi.
63. :doc:`SEW-D <model_doc/sew_d>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in
65. :doc:`SEW-D <model_doc/sew_d>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in
Unsupervised Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim,
Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
64. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
66. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
`fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
65. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
67. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
66. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
68. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
Question Answering by Pretraining Span Selection <https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain,
Jonathan Berant, Amir Globerson, Omer Levy.
67. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
69. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
vision teach NLP about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola,
Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
68. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
70. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
69. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
71. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
`google-research/text-to-text-transfer-transformer
<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__ by
Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi
Zhou and Wei Li and Peter J. Liu.
70. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
72. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
Francesco Piccinno and Julian Martin Eisenschlos.
71. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
73. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
72. :doc:`TrOCR <model_doc/trocr>` (from Microsoft), released together with the paper `TrOCR: Transformer-based Optical
74. :doc:`TrOCR <model_doc/trocr>` (from Microsoft), released together with the paper `TrOCR: Transformer-based Optical
Character Recognition with Pre-trained Models <https://arxiv.org/abs/2109.10282>`__ by Minghao Li, Tengchao Lv, Lei
Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
73. :doc:`UniSpeech <model_doc/unispeech>` (from Microsoft Research) released with the paper `UniSpeech: Unified Speech
75. :doc:`UniSpeech <model_doc/unispeech>` (from Microsoft Research) released with the paper `UniSpeech: Unified Speech
Representation Learning with Labeled and Unlabeled Data <https://arxiv.org/abs/2101.07597>`__ by Chengyi Wang, Yu
Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
74. :doc:`UniSpeechSat <model_doc/unispeech_sat>` (from Microsoft Research) released with the paper `UNISPEECH-SAT:
76. :doc:`UniSpeechSat <model_doc/unispeech_sat>` (from Microsoft Research) released with the paper `UNISPEECH-SAT:
UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING <https://arxiv.org/abs/2110.05752>`__ by
Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li,
Xiangzhan Yu.
75. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
77. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
76. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
78. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
77. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
79. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
Zhou, Abdelrahman Mohamed, Michael Auli.
78. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
80. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
79. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
81. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
80. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
82. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
Zettlemoyer and Veselin Stoyanov.
81. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
83. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
82. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
84. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
@@ -379,7 +385,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BigBirdPegasus | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| Blenderbot | ✅ | ❌ | ✅ | ✅ | ❌ |
+| Blenderbot | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -425,6 +431,8 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| I-BERT | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ImageGPT | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LayoutLM | ✅ | ✅ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| LayoutLMv2 | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -459,6 +467,8 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -489,7 +499,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ |
+| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -499,11 +509,11 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| Vision Encoder decoder | ❌ | ❌ | ✅ | ❌ | ❌ |
+| Vision Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| ViT | ❌ | ❌ | ✅ | ❌ | ✅ |
+| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -559,6 +569,7 @@ Flax), PyTorch, and/or TensorFlow.
    testing
    debugging
    serialization
    pr_checks

.. toctree::
    :maxdepth: 2
@@ -628,6 +639,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/funnel
    model_doc/herbert
    model_doc/ibert
    model_doc/imagegpt
    model_doc/layoutlm
    model_doc/layoutlmv2
    model_doc/layoutxlm
@@ -651,6 +663,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/pegasus
    model_doc/phobert
    model_doc/prophetnet
    model_doc/qdqbert
    model_doc/rag
    model_doc/reformer
    model_doc/rembert
@@ -46,6 +46,20 @@ won't be possible on a single GPU.
parts of DeepSpeed like ``zero.Init`` for ZeRO stage 3 and higher. To tap into this feature read the docs on
:ref:`deepspeed-non-trainer-integration`.

What is integrated:

Training:

1. DeepSpeed ZeRO training supports the full ZeRO stages 1, 2 and 3 with ZeRO-Infinity (CPU and NVMe offload).

Inference:

1. DeepSpeed ZeRO Inference supports ZeRO stage 3 with ZeRO-Infinity. It uses the same ZeRO protocol as training, but
   it doesn't use an optimizer or an LR scheduler, and only stage 3 is relevant. For more details see:
   :ref:`deepspeed-zero-inference`.

There is also DeepSpeed Inference - this is a totally different technology, which uses Tensor Parallelism instead of
ZeRO (coming soon).


@@ -1628,6 +1642,47 @@ larger multi-dimensional shape, this means that the parameter is partitioned and


.. _deepspeed-zero-inference:


ZeRO Inference
=======================================================================================================================

ZeRO Inference uses the same config as ZeRO-3 Training. You just don't need the optimizer and scheduler sections. In
fact you can leave these in the config file if you want to share the same one with the training. They will just be
ignored.

Otherwise you just need to pass the usual :class:`~transformers.TrainingArguments` arguments. For example:

.. code-block:: bash

    deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json

The only important thing is that you need to use a ZeRO-3 configuration, since ZeRO-2 provides no benefit whatsoever
for inference: only ZeRO-3 shards the parameters, whereas ZeRO-1 and ZeRO-2 only shard the optimizer states and
gradients, which aren't used during inference.

Here is an example of running ``run_translation.py`` under DeepSpeed deploying all available GPUs:

.. code-block:: bash

    deepspeed examples/pytorch/translation/run_translation.py \
    --deepspeed tests/deepspeed/ds_config_zero3.json \
    --model_name_or_path t5-small --output_dir output_dir \
    --do_eval --max_eval_samples 50 --warmup_steps 50 \
    --max_source_length 128 --val_max_target_length 128 \
    --overwrite_output_dir --per_device_eval_batch_size 4 \
    --predict_with_generate --dataset_config "ro-en" --fp16 \
    --source_lang en --target_lang ro --dataset_name wmt16 \
    --source_prefix "translate English to Romanian: "

Since for inference there is no need for the additional large memory used by the optimizer states and the gradients,
you should be able to fit much larger batches and/or sequence length onto the same hardware.
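
If you prefer to configure everything from Python rather than a standalone JSON file, here is a minimal sketch of the
same idea: :class:`~transformers.TrainingArguments` also accepts the DeepSpeed config as a plain dict. The dict below
is a deliberately stripped-down stand-in for ``ds_config_zero3.json`` (a real config usually carries more entries), the
evaluation dataset is a placeholder you would replace with your own, and the script is still meant to be launched with
the ``deepspeed`` launcher as shown above:

.. code-block::

    from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Trainer, TrainingArguments

    # Stripped-down ZeRO-3 style config: no optimizer/scheduler sections are needed
    # for inference; if they were present they would simply be ignored.
    ds_config = {
        "fp16": {"enabled": "auto"},
        "zero_optimization": {"stage": 3},
        "train_micro_batch_size_per_gpu": "auto",
    }

    args = TrainingArguments(
        output_dir="output_dir",
        per_device_eval_batch_size=4,
        fp16=True,
        deepspeed=ds_config,  # a dict or a path to a json file both work
    )

    tokenizer = AutoTokenizer.from_pretrained("t5-small")
    model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

    # my_eval_dataset is a placeholder for your own tokenized evaluation dataset
    trainer = Trainer(model=model, args=args, eval_dataset=my_eval_dataset, tokenizer=tokenizer)
    metrics = trainer.evaluate()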

Additionally DeepSpeed is currently developing a related product called Deepspeed-Inference which has no relationship
to the ZeRO technology, but instead uses tensor parallelism to scale models that can't fit onto a single GPU. This is a
work in progress and we will provide the integration once that product is complete.


Filing Issues
=======================================================================================================================
@@ -45,7 +45,7 @@ The pipeline abstraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
-pipeline but requires an additional argument which is the `task`.
+pipeline but can provide additional quality of life.

Simple call on one item:

@@ -55,6 +55,15 @@ Simple call on one item:
    >>> pipe("This restaurant is awesome")
    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]

If you want to use a specific model from the `hub <https://huggingface.co>`__ you can ignore the task if the model on
the hub already defines it:

.. code-block::

    >>> pipe = pipeline(model="roberta-large-mnli")
    >>> pipe("This restaurant is awesome")
    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]

To call a pipeline on many items, you can call it with a `list`.

.. code-block::

@@ -71,6 +80,11 @@ GPU. If it doesn't don't hesitate to create an issue.

.. code-block::

    import datasets
    from transformers import pipeline
    from transformers.pipelines.base import KeyDataset
    import tqdm

    pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
    dataset = datasets.load_dataset("superb", name="asr", split="test")

@@ -85,6 +99,170 @@ GPU. If it doesn't don't hesitate to create an issue.

.. autofunction:: transformers.pipeline

Pipeline batching
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

All pipelines (except `zero-shot-classification` and `question-answering` currently) can use batching. This will work
whenever the pipeline uses its streaming ability (so when passing lists or a :obj:`Dataset`).

.. code-block::

    from transformers import pipeline
    from transformers.pipelines.base import KeyDataset
    import datasets
    import tqdm

    dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
    pipe = pipeline("text-classification", device=0)
    for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
        print(out)
        # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
        # Exactly the same output as before, but the contents are passed
        # as batches to the model


.. warning::

    However, this is not automatically a win for performance. It can be either a 10x speedup or a 5x slowdown depending
    on hardware, data and the actual model being used.

Example where it's mostly a speedup:

.. code-block::

    from transformers import pipeline
    from torch.utils.data import Dataset
    import tqdm


    pipe = pipeline("text-classification", device=0)


    class MyDataset(Dataset):
        def __len__(self):
            return 5000

        def __getitem__(self, i):
            return "This is a test"


    dataset = MyDataset()

    for batch_size in [1, 8, 64, 256]:
        print("-" * 30)
        print(f"Streaming batch_size={batch_size}")
        for out in tqdm.tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
            pass


.. code-block::

    # On GTX 970
    ------------------------------
    Streaming no batching
    100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
    ------------------------------
    Streaming batch_size=8
    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
    ------------------------------
    Streaming batch_size=64
    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
    ------------------------------
    Streaming batch_size=256
    100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
    (diminishing returns, saturated the GPU)


Example where it's mostly a slowdown:

.. code-block::

    class MyDataset(Dataset):
        def __len__(self):
            return 5000

        def __getitem__(self, i):
            if i % 64 == 0:
                n = 100
            else:
                n = 1
            return "This is a test" * n

This is an occasional very long sentence compared to the others. In that case, the **whole** batch will need to be 400
tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to a large slowdown. Even worse, on
bigger batches, the program simply crashes.


.. code-block::

    ------------------------------
    Streaming no batching
    100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
    ------------------------------
    Streaming batch_size=8
    100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
    ------------------------------
    Streaming batch_size=64
    100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
    ------------------------------
    Streaming batch_size=256
    0%|                                                         | 0/1000 [00:00<?, ?it/s]
    Traceback (most recent call last):
      File "/home/nicolas/src/transformers/test.py", line 42, in <module>
        for out in tqdm.tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
    ....
        q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
    RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)


There are no good (general) solutions for this problem, and your mileage may vary depending on your use case. For
users, a rule of thumb is:

- **Measure performance on your load, with your hardware. Measure, measure, and keep measuring. Real numbers are the
  only way to go.**
- If you are latency constrained (live product doing inference), don't batch.
- If you are using CPU, don't batch.
- If you are optimizing for throughput (you want to run your model on a bunch of static data), on GPU, then:

  - If you have no clue about the size of the sequence_length ("natural" data), by default don't batch, measure and
    try tentatively to add it, and add OOM checks to recover when it will fail (and it will, at some point, if you
    don't control the sequence_length).
  - If your sequence_length is super regular, then batching is more likely to be VERY interesting; measure and push
    it until you get OOMs.
  - The larger the GPU, the more likely batching is going to be interesting.
- As soon as you enable batching, make sure you can handle OOMs nicely (one possible pattern is sketched below).
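
To illustrate that last point, here is a small, hypothetical sketch (not part of the pipeline API) of one way to
recover from a CUDA out-of-memory error by retrying with a smaller batch size. The data is a placeholder; plug in your
own list or dataset:

.. code-block::

    import torch
    from transformers import pipeline

    pipe = pipeline("text-classification", device=0)
    sentences = ["This is a test"] * 1000  # placeholder data

    def run_with_fallback(pipe, data, batch_size):
        # Halve the batch size on CUDA OOM until the data fits (or batching is effectively disabled).
        while True:
            try:
                return list(pipe(data, batch_size=batch_size))
            except RuntimeError as e:
                if "out of memory" not in str(e) or batch_size == 1:
                    raise
                torch.cuda.empty_cache()
                batch_size //= 2
                print(f"OOM, retrying with batch_size={batch_size}")

    outputs = run_with_fallback(pipe, sentences, batch_size=64)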

Pipeline custom code
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

You may want to override a specific pipeline.

Don't hesitate to create an issue for your task at hand, the goal of the pipeline is to be easy to use and support most
cases, so :obj:`transformers` could maybe support your use case.


If you simply want to try it out, you can:

- Subclass your pipeline of choice

.. code-block::

    class MyPipeline(TextClassificationPipeline):
        def postprocess(...):
            ...
            scores = scores * 100
            ...

    my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
    # or if you use the `pipeline` function, then:
    my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)

That should enable you to do all the custom code you want.


Implementing a pipeline
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -39,7 +39,8 @@ methods for using all the tokenizers:
- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
  tokenizer for easy access and making sure they are not split during tokenization.

-:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
+:class:`~transformers.BatchEncoding` holds the output of the
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase`'s encoding methods (``__call__``,
``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by

@@ -76,6 +76,13 @@ AutoFeatureExtractor
    :members:


AutoProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.AutoProcessor
    :members:


AutoModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -160,6 +167,13 @@ AutoModelForImageClassification
    :members:


AutoModelForVision2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.AutoModelForVision2Seq
    :members:


AutoModelForAudioClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -216,6 +230,13 @@ TFAutoModelForCausalLM
    :members:


TFAutoModelForImageClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TFAutoModelForImageClassification
    :members:


TFAutoModelForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -244,6 +265,13 @@ TFAutoModelForMultipleChoice
    :members:


TFAutoModelForTableQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.TFAutoModelForTableQuestionAnswering
    :members:


TFAutoModelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -333,3 +361,10 @@ FlaxAutoModelForImageClassification

.. autoclass:: transformers.FlaxAutoModelForImageClassification
    :members:


FlaxAutoModelForVision2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.FlaxAutoModelForVision2Seq
    :members:

@@ -74,7 +74,7 @@ The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be
.. code-block::

    from transformers import BartForConditionalGeneration, BartTokenizer
-    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
    tok = BartTokenizer.from_pretrained("facebook/bart-large")
    example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
    batch = tok(example_english_phrase, return_tensors='pt')

@@ -63,6 +63,17 @@ This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The JA
contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
<https://github.com/microsoft/unilm/tree/master/beit>`__.


BEiT specific outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.models.beit.modeling_beit.BeitModelOutputWithPooling
    :members:

.. autoclass:: transformers.models.beit.modeling_flax_beit.FlaxBeitModelOutputWithPooling
    :members:


BeitConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -98,6 +109,13 @@ BeitForImageClassification
    :members: forward


BeitForSemanticSegmentation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BeitForSemanticSegmentation
    :members: forward


FlaxBeitModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -81,6 +81,13 @@ BlenderbotTokenizer
    :members: build_inputs_with_special_tokens


BlenderbotTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.BlenderbotTokenizerFast
    :members: build_inputs_with_special_tokens


BlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -118,3 +125,17 @@ TFBlenderbotForConditionalGeneration

.. autoclass:: transformers.TFBlenderbotForConditionalGeneration
    :members: call


FlaxBlenderbotModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.FlaxBlenderbotModel
    :members: __call__, encode, decode


FlaxBlenderbotForConditionalGeneration
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.FlaxBlenderbotForConditionalGeneration
    :members: __call__, encode, decode

@@ -41,13 +41,6 @@ DPRConfig
    :members:


-DPRPreTrainedModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.DPRPreTrainedModel
-    :members:
-
-
DPRContextEncoderTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

docs/source/model_doc/imagegpt.rst (new file, 110 lines)
@@ -0,0 +1,110 @@
..
    Copyright 2021 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

ImageGPT
-----------------------------------------------------------------------------------------------------------------------

Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The ImageGPT model was proposed in `Generative Pretraining from Pixels <https://openai.com/blog/image-gpt/>`__ by Mark
Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. ImageGPT (iGPT) is a GPT-2-like
model trained to predict the next pixel value, allowing for both unconditional and conditional image generation.

The abstract from the paper is the following:

*Inspired by progress in unsupervised representation learning for natural language, we examine whether similar models
can learn useful representations for images. We train a sequence Transformer to auto-regressively predict pixels,
without incorporating knowledge of the 2D input structure. Despite training on low-resolution ImageNet without labels,
we find that a GPT-2 scale model learns strong image representations as measured by linear probing, fine-tuning, and
low-data classification. On CIFAR-10, we achieve 96.3% accuracy with a linear probe, outperforming a supervised Wide
ResNet, and 99.0% accuracy with full fine-tuning, matching the top supervised pre-trained models. We are also
competitive with self-supervised benchmarks on ImageNet when substituting pixels for a VQVAE encoding, achieving 69.0%
top-1 accuracy on a linear probe of our features.*

The figure below summarizes the approach (taken from the `original paper
<https://cdn.openai.com/papers/Generative_Pretraining_from_Pixels_V2.pdf>`__):

.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/imagegpt_architecture.png
   :width: 600

Tips:

- ImageGPT is almost exactly the same as :doc:`GPT-2 <gpt2>`, with the exception that a different activation function
  is used (namely "quick gelu"), and the layer normalization layers don't mean-center the inputs. ImageGPT also
  doesn't have tied input and output embeddings.
- As the time and memory requirements of the attention mechanism of Transformers scale quadratically in the sequence
  length, the authors pre-trained ImageGPT on smaller input resolutions, such as 32x32 and 64x64. However, feeding a
  sequence of 32x32x3=3072 tokens from 0..255 into a Transformer is still prohibitively large. Therefore, the authors
  applied k-means clustering to the (R,G,B) pixel values with k=512. This way, we only have a 32*32 = 1024-long
  sequence, but now of integers in the range 0..511. So we are shrinking the sequence length at the cost of a bigger
  embedding matrix. In other words, the vocabulary size of ImageGPT is 512, + 1 for a special "start of sentence" (SOS)
  token, used at the beginning of every sequence. One can use :class:`~transformers.ImageGPTFeatureExtractor` to
  prepare images for the model.
- Despite being pre-trained entirely unsupervised (i.e. without the use of any labels), ImageGPT produces fairly
  performant image features useful for downstream tasks, such as image classification. The authors showed that the
  features in the middle of the network are the most performant, and can be used as-is to train a linear model (such
  as a sklearn logistic regression model for example). This is also referred to as "linear probing". Features can be
  easily obtained by first forwarding the image through the model with `output_hidden_states=True`, and then
  average-pooling the hidden states at whatever layer you like (a sketch of this is shown after the table below).
- Alternatively, one can further fine-tune the entire model on a downstream dataset, similar to BERT. For this, you can
  use :class:`~transformers.ImageGPTForImageClassification`.
- ImageGPT comes in different sizes: there's ImageGPT-small, ImageGPT-medium and ImageGPT-large. The authors also
  trained an XL variant, which they didn't release. The differences in size are summarized in the following table:

+-------------------+----------------------+-----------------+---------------------+--------------+
| **Model variant** | **Number of layers** | **Hidden size** | **Number of heads** | **# params** |
+-------------------+----------------------+-----------------+---------------------+--------------+
| iGPT-small | 24 | 512 | 8 | 76 million |
+-------------------+----------------------+-----------------+---------------------+--------------+
| iGPT-medium | 36 | 1024 | 8 | 455 million |
+-------------------+----------------------+-----------------+---------------------+--------------+
| iGPT-large | 48 | 1536 | 16 | 1.4 billion |
+-------------------+----------------------+-----------------+---------------------+--------------+
| iGPT-XL | 60 | 3072 | not specified | 6.8 billion |
+-------------------+----------------------+-----------------+---------------------+--------------+

This model was contributed by `nielsr <https://huggingface.co/nielsr>`__, based on `this issue
<https://github.com/openai/image-gpt/issues/7>`__. The original code can be found `here
<https://github.com/openai/image-gpt>`__.
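
As a rough illustration of the "linear probing" tip above, here is a minimal sketch of extracting average-pooled hidden
states with :class:`~transformers.ImageGPTFeatureExtractor` and :class:`~transformers.ImageGPTModel`. The checkpoint
name, the image path and the choice of layer are assumptions for the example, not recommendations:

.. code-block::

    import torch
    from PIL import Image
    from transformers import ImageGPTFeatureExtractor, ImageGPTModel

    # "openai/imagegpt-small" is assumed here; use whichever released checkpoint you want to probe
    feature_extractor = ImageGPTFeatureExtractor.from_pretrained("openai/imagegpt-small")
    model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

    image = Image.open("your_image.png").convert("RGB")  # placeholder image path
    # the feature extractor maps pixels to color-cluster indices, i.e. a sequence of input_ids
    encoding = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**encoding, output_hidden_states=True)

    # average-pool the hidden states of a middle layer to get one feature vector per image
    middle_layer = len(outputs.hidden_states) // 2
    features = outputs.hidden_states[middle_layer].mean(dim=1)  # shape (batch_size, hidden_size)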

ImageGPTConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTConfig
    :members:

ImageGPTFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTFeatureExtractor
    :members: __call__

ImageGPTModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTModel
    :members: forward


ImageGPTForCausalImageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTForCausalImageModeling
    :members: forward


ImageGPTForImageClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.ImageGPTForImageClassification
    :members: forward

@@ -40,17 +40,45 @@ One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like

    model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')

-Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
-initialize it as follows:
+Note that LayoutXLM has its own tokenizer, based on
+:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast`. You can initialize it as
+follows:

.. code-block::

-    from transformers import AutoTokenizer
+    from transformers import LayoutXLMTokenizer

-    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base')
+    tokenizer = LayoutXLMTokenizer.from_pretrained('microsoft/layoutxlm-base')

Similar to LayoutLMv2, you can use :class:`~transformers.LayoutXLMProcessor` (which internally applies
:class:`~transformers.LayoutLMv2FeatureExtractor` and
:class:`~transformers.LayoutXLMTokenizer`/:class:`~transformers.LayoutXLMTokenizerFast` in sequence) to prepare all
data for the model.
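
Here is a minimal sketch of that processor usage. The checkpoint name, image path and returned keys are assumptions for
the example; by default the underlying feature extractor runs OCR, which requires the Tesseract dependency:

.. code-block::

    from PIL import Image
    from transformers import LayoutXLMProcessor, LayoutLMv2Model

    processor = LayoutXLMProcessor.from_pretrained('microsoft/layoutxlm-base')
    model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base')

    image = Image.open('document.png').convert('RGB')  # placeholder scanned document
    # the feature extractor applies OCR to get the words and boxes, the tokenizer then encodes them
    encoding = processor(image, return_tensors='pt')

    outputs = model(**encoding)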

As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
<layoutlmv2>` for all tips, code examples and notebooks.

This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/microsoft/unilm>`__.


LayoutXLMTokenizer
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMTokenizer
    :members: __call__, build_inputs_with_special_tokens, get_special_tokens_mask,
        create_token_type_ids_from_sequences, save_vocabulary


LayoutXLMTokenizerFast
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMTokenizerFast
    :members: __call__


LayoutXLMProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.LayoutXLMProcessor
    :members: __call__

@@ -103,8 +103,8 @@ Here is the code to see all available pretrained models on the hub:

.. code-block:: python

-    from huggingface_hub.hf_api import HfApi
-    model_list = HfApi().list_models()
+    from huggingface_hub import list_models
+    model_list = list_models()
    org = "Helsinki-NLP"
    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
    suffix = [x.split('/')[1] for x in model_ids]

docs/source/model_doc/qdqbert.rst (new file, 189 lines)
@@ -0,0 +1,189 @@
..
    Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
    the License. You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

QDQBERT
-----------------------------------------------------------------------------------------------------------------------

Overview
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

The QDQBERT model can be referenced in `Integer Quantization for Deep Learning Inference: Principles and Empirical
Evaluation <https://arxiv.org/abs/2004.09602>`__ by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
Micikevicius.

The abstract from the paper is the following:

*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
quantization parameters and evaluate their choices on a wide range of neural network models for different application
domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
more difficult to quantize, such as MobileNets and BERT-large.*

Tips:

- The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to (i) linear
  layer inputs and weights, (ii) matmul inputs, and (iii) residual add inputs of the BERT model.

- QDQBERT requires the `Pytorch Quantization Toolkit
  <https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization>`__ as a dependency. To install it, run
  ``pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com``.

- The QDQBERT model can be loaded from any checkpoint of a HuggingFace BERT model (for example *bert-base-uncased*) and
  used to perform Quantization Aware Training or Post Training Quantization (a minimal loading sketch follows the
  example in the "Set default quantizers" section below).

- A complete example of using the QDQBERT model to perform Quantization Aware Training and Post Training Quantization
  for the SQUAD task can be found at `transformers/examples/research_projects/quantization-qdqbert/
  </examples/research_projects/quantization-qdqbert/>`_.

This model was contributed by `shangz <https://huggingface.co/shangz>`__.


Set default quantizers
_______________________________________________________________________________________________________________________

The QDQBERT model adds fake quantization operations (pairs of QuantizeLinear/DequantizeLinear ops) to BERT via
:obj:`TensorQuantizer` from the `Pytorch Quantization Toolkit
<https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization>`__. :obj:`TensorQuantizer` is the module
for quantizing tensors, with :obj:`QuantDescriptor` defining how the tensor should be quantized. Refer to the `Pytorch
Quantization Toolkit userguide
<https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html>`__ for more details.

Before creating the QDQBERT model, one has to set the default :obj:`QuantDescriptor` defining the default tensor
quantizers. Example:

.. code-block::

    >>> import pytorch_quantization.nn as quant_nn
    >>> from pytorch_quantization.tensor_quant import QuantDescriptor

    >>> # The default tensor quantizer is set to use Max calibration method
    >>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
    >>> # The default tensor quantizer is set to be per-channel quantization for weights
    >>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
    >>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
    >>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
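
Once the default quantizers are set, a QDQBERT model can be created from a regular BERT checkpoint, as mentioned in the
tips above. A minimal sketch (the task head and checkpoint are only examples):

.. code-block::

    >>> from transformers import QDQBertForSequenceClassification

    >>> # fake quantization ops are inserted according to the default QuantDescriptors set above;
    >>> # the weights themselves come from a regular (float) BERT checkpoint
    >>> model = QDQBertForSequenceClassification.from_pretrained("bert-base-uncased")
    >>> model.cuda()  # assuming calibration and fine-tuning will run on GPU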


Calibration
_______________________________________________________________________________________________________________________

Calibration is the term for passing data samples to the quantizer and deciding the best scaling factors for tensors.
After setting up the tensor quantizers, one can use the following example to calibrate the model:

.. code-block::

    >>> # Find the TensorQuantizer and enable calibration
    >>> for name, module in model.named_modules():
    >>>     if name.endswith('_input_quantizer'):
    >>>         module.enable_calib()
    >>>         module.disable_quant()  # Use full precision data to calibrate

    >>> # Feeding data samples
    >>> model(x)
    >>> # ...

    >>> # Finalize calibration
    >>> for name, module in model.named_modules():
    >>>     if name.endswith('_input_quantizer'):
    >>>         module.load_calib_amax()
    >>>         module.enable_quant()

    >>> # If running on GPU, it needs to call .cuda() again because new tensors will be created by the calibration process
    >>> model.cuda()

    >>> # Keep running the quantized model
    >>> # ...


Export to ONNX
_______________________________________________________________________________________________________________________

The goal of exporting to ONNX is to deploy inference by `TensorRT <https://developer.nvidia.com/tensorrt>`__. Fake
quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting the static member of
TensorQuantizer to use PyTorch's own fake quantization functions, the fake-quantized model can be exported to ONNX by
following the instructions in `torch.onnx <https://pytorch.org/docs/stable/onnx.html>`__. Example:

.. code-block::

    >>> from pytorch_quantization.nn import TensorQuantizer
    >>> TensorQuantizer.use_fb_fake_quant = True

    >>> # Load the calibrated model
    >>> ...
    >>> # ONNX export
    >>> torch.onnx.export(...)


QDQBertConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertConfig
    :members:


QDQBertModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertModel
    :members: forward


QDQBertLMHeadModel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertLMHeadModel
    :members: forward


QDQBertForMaskedLM
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForMaskedLM
    :members: forward


QDQBertForSequenceClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForSequenceClassification
    :members: forward


QDQBertForNextSentencePrediction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForNextSentencePrediction
    :members: forward


QDQBertForMultipleChoice
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForMultipleChoice
    :members: forward


QDQBertForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForTokenClassification
    :members: forward


QDQBertForQuestionAnswering
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autoclass:: transformers.QDQBertForQuestionAnswering
    :members: forward

@@ -38,6 +38,58 @@ Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
<https://github.com/NVlabs/SegFormer>`__.

The figure below illustrates the architecture of SegFormer. Taken from the `original paper
<https://arxiv.org/abs/2105.15203>`__.

.. image:: https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/segformer_architecture.png
   :width: 600

Tips:

- SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decode head.
  :class:`~transformers.SegformerModel` is the hierarchical Transformer encoder (which in the paper is also referred to
  as Mix Transformer or MiT). :class:`~transformers.SegformerForSemanticSegmentation` adds the all-MLP decode head on
  top to perform semantic segmentation of images. In addition, there's
  :class:`~transformers.SegformerForImageClassification` which can be used to - you guessed it - classify images. The
  authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. Next, they throw
  away the classification head, and replace it by the all-MLP decode head. Next, they fine-tune the model altogether on
  ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. All checkpoints can be
  found on the `hub <https://huggingface.co/models?other=segformer>`__.
- The quickest way to get started with SegFormer is by checking the `example notebooks
  <https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SegFormer>`__ (which showcase both inference and
  fine-tuning on custom data).
- One can use :class:`~transformers.SegformerFeatureExtractor` to prepare images and corresponding segmentation maps
  for the model (a minimal inference sketch is shown after the table below). Note that this feature extractor is
  fairly basic and does not include all data augmentations used in the original paper. The original preprocessing
  pipelines (for the ADE20k dataset for instance) can be found `here
  <https://github.com/NVlabs/SegFormer/blob/master/local_configs/_base_/datasets/ade20k_repeat.py>`__. The most
  important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size,
  such as 512x512 or 640x640, after which they are normalized.
- One additional thing to keep in mind is that one can initialize :class:`~transformers.SegformerFeatureExtractor` with
  :obj:`reduce_labels` set to `True` or `False`. In some datasets (like ADE20k), the 0 index is used in the annotated
  segmentation maps for the background. However, ADE20k doesn't include the "background" class in its 150 labels.
  Therefore, :obj:`reduce_labels` is used to reduce all labels by 1, and to make sure no loss is computed for the
  background class (i.e. it replaces 0 in the annotated maps by 255, which is the `ignore_index` of the loss function
  used by :class:`~transformers.SegformerForSemanticSegmentation`). However, other datasets use the 0 index as
  background class and include this class as part of all labels. In that case, :obj:`reduce_labels` should be set to
  `False`, as loss should also be computed for the background class.
- As with most models, SegFormer comes in different sizes, the details of which can be found in the table below.

+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| **Model variant** | **Depths** | **Hidden sizes** | **Decoder hidden size** | **Params (M)** | **ImageNet-1k Top 1** |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 |
+-------------------+---------------+---------------------+-------------------------+----------------+-----------------------+
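
As a quick, hedged illustration of the feature extractor and model combination mentioned in the tips (the checkpoint
name and image path are assumptions for the example):

.. code-block::

    import torch
    from PIL import Image
    from transformers import SegformerFeatureExtractor, SegformerForSemanticSegmentation

    # assumed checkpoint: a MiT-b0 backbone fine-tuned on ADE20k
    feature_extractor = SegformerFeatureExtractor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
    model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

    image = Image.open("your_image.jpg").convert("RGB")  # placeholder image path
    inputs = feature_extractor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    # logits have shape (batch_size, num_labels, height / 4, width / 4);
    # take the argmax over the label dimension to get a (downscaled) segmentation map
    segmentation = outputs.logits.argmax(dim=1)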

SegformerConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -36,7 +36,7 @@ Tips:
- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
  the `official models <https://huggingface.co/models?other=speech2text2>`__ .
- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
-- Speech2Text2's tokenizer currently only supports inference, but not training.
+- Speech2Text2's tokenizer is based on `fastBPE <https://github.com/glample/fastBPE>`__.

Inference
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -49,7 +49,8 @@ entailment (a binary classification task). For more details, see their follow-up
|
||||
intermediate pre-training <https://www.aclweb.org/anthology/2020.findings-emnlp.27/>`__ by Julian Martin Eisenschlos,
|
||||
Syrine Krichene and Thomas Müller.
|
||||
|
||||
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
|
||||
This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The Tensorflow version of this model was
|
||||
contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
|
||||
<https://github.com/google-research/tapas>`__.
|
||||
|
||||
Tips:
|
||||
@ -130,6 +131,24 @@ for your environment):
|
||||
>>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
|
||||
>>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
|
||||
|
||||
In TensorFlow, this can be done as follows (make sure to have installed the `tensorflow_probability dependency
|
||||
<https://github.com/tensorflow/probability`>__ for your environment):
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
|
||||
|
||||
>>> # for example, the base sized model with default SQA configuration
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base')
|
||||
|
||||
>>> # or, the base sized model with WTQ configuration
|
||||
>>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
|
||||
|
||||
>>> # or, the base sized model with WikiSQL configuration
|
||||
>>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
|
||||
|
||||
|
||||
Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also
|
||||
experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then
|
||||
@ -142,10 +161,21 @@ way. Here's an example:
|
||||
>>> from transformers import TapasConfig, TapasForQuestionAnswering
|
||||
|
||||
>>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
|
||||
>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
|
||||
>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
|
||||
>>> # initializing the pre-trained base sized model with our custom classification heads
|
||||
>>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
|
||||
|
||||
And here is the equivalent code for TensorFlow:
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
|
||||
|
||||
>>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
|
||||
>>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
|
||||
>>> # initializing the pre-trained base sized model with our custom classification heads
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
|
||||
|
||||
What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned
|
||||
checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here
|
||||
<https://github.com/google-research/tapas/issues/91#issuecomment-735719340>`__ for more info.
|
||||
@ -180,12 +210,13 @@ SQA format. The author explains this `here
|
||||
are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``),
|
||||
meaning that WTQ and WikiSQL results could actually be improved.
|
||||
|
||||
**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer**
|
||||
**STEP 3: Convert your data into PyTorch/TensorFlow tensors using TapasTokenizer**
|
||||
|
||||
Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular
|
||||
data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`,
|
||||
:obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above,
|
||||
:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned:
|
||||
:class:`~transformers.TapasForQuestionAnswering`/:class:`~transformers.TFTapasForQuestionAnswering` requires different
|
||||
inputs to be fine-tuned:
|
||||
|
||||
+------------------------------------+----------------------------------------------------------------------------------------------+
|
||||
| **Task** | **Required inputs** |
|
||||
@ -220,6 +251,8 @@ are already in the TSV file of step 2. Here's an example:
|
||||
{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
|
||||
'numeric_values': tensor([[ ... ]]), 'numeric_values_scale': tensor([[ ... ]]), 'labels': tensor([[ ... ]])}
|
||||
|
||||
Set ``return_tensors='tf'`` when calling the tokenizer in order to prepare data for the TensorFlow models.
|
||||
|
||||
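For instance, assuming the same ``table`` and ``queries`` as in the example above, a TensorFlow-ready encoding can be
obtained with:

.. code-block::

>>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf")
>>> inputs["input_ids"]  # now a tf.Tensor rather than a torch.Tensor
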
Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use
|
||||
``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single
|
||||
training example. It is advised to create a PyTorch dataset and a corresponding dataloader:
|
||||
@ -261,15 +294,67 @@ training example. It is advised to create a PyTorch dataset and a corresponding
|
||||
>>> train_dataset = TableDataset(data, tokenizer)
|
||||
>>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
|
||||
|
||||
And here is the equivalent code for TensorFlow:
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> import tensorflow as tf
|
||||
>>> import pandas as pd
|
||||
|
||||
>>> tsv_path = "your_path_to_the_tsv_file"
|
||||
>>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
|
||||
|
||||
>>> class TableDataset:
|
||||
... def __init__(self, data, tokenizer):
|
||||
... self.data = data
|
||||
... self.tokenizer = tokenizer
|
||||
...
|
||||
... def __iter__(self):
|
||||
... for idx in range(self.__len__()):
|
||||
... item = self.data.iloc[idx]
|
||||
... table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
|
||||
... encoding = self.tokenizer(table=table,
|
||||
... queries=item.question,
|
||||
... answer_coordinates=item.answer_coordinates,
|
||||
... answer_text=item.answer_text,
|
||||
... truncation=True,
|
||||
... padding="max_length",
|
||||
... return_tensors="tf"
|
||||
... )
|
||||
... # remove the batch dimension which the tokenizer adds by default
|
||||
... encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()}
... # add the float_answer which is also required (weak supervision for aggregation case)
... encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32)
|
||||
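... # note: the order of the tensors yielded below must match the output_signature passed to tf.data.Dataset.from_generator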
... yield encoding['input_ids'], encoding['attention_mask'], encoding['numeric_values'], \
|
||||
... encoding['numeric_values_scale'], encoding['token_type_ids'], encoding['labels'], \
|
||||
... encoding['float_answer']
|
||||
...
|
||||
... def __len__(self):
|
||||
... return len(self.data)
|
||||
|
||||
>>> data = pd.read_csv(tsv_path, sep='\t')
|
||||
>>> train_dataset = TableDataset(data, tokenizer)
|
||||
>>> output_signature = (
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.int32),
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.int32),
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.float32),
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.float32),
|
||||
... tf.TensorSpec(shape=(512,7), dtype=tf.int32),
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.int32),
|
||||
... tf.TensorSpec(shape=(512,), dtype=tf.float32))
|
||||
>>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32)
|
||||
|
||||
Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not
|
||||
conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group
|
||||
together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position``
|
||||
index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see
|
||||
docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook
|
||||
<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
|
||||
for more info.
|
||||
for more info. For the TensorFlow model, see `this notebook
<https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__.
|
||||
|
||||
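As an illustration, a minimal sketch of this per-table grouping (assuming the TSV columns described in step 2, i.e.
``table_file``, ``position``, ``question``, ``answer_coordinates`` and ``answer_text``) could look as follows:

.. code-block::

>>> # group the questions per table and encode each table together with all of its questions
>>> for table_file, group in data.groupby("table_file"):
...     group = group.sort_values("position")
...     table = pd.read_csv(table_csv_path + table_file).astype(str)
...     encoding = tokenizer(table=table,
...                          queries=group.question.tolist(),
...                          answer_coordinates=group.answer_coordinates.tolist(),
...                          answer_text=group.answer_text.tolist(),
...                          padding="max_length",
...                          return_tensors="pt")
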
**STEP 4: Train (fine-tune) TapasForQuestionAnswering**
|
||||
**STEP 4: Train (fine-tune) TapasForQuestionAnswering/TFTapasForQuestionAnswering**
|
||||
|
||||
You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for
|
||||
the weak supervision for aggregation case):
|
||||
@ -316,6 +401,52 @@ the weak supervision for aggregation case):
|
||||
... loss.backward()
|
||||
... optimizer.step()
|
||||
|
||||
|
||||
Equivalently, fine-tuning :class:`~transformers.TFTapasForQuestionAnswering` in native TensorFlow can be done as
|
||||
follows (shown here for the weak supervision for aggregation case):
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> import tensorflow as tf
|
||||
>>> from transformers import TapasConfig, TFTapasForQuestionAnswering
|
||||
|
||||
>>> # this is the default WTQ configuration
|
||||
>>> config = TapasConfig(
|
||||
... num_aggregation_labels = 4,
|
||||
... use_answer_as_supervision = True,
|
||||
... answer_loss_cutoff = 0.664694,
|
||||
... cell_selection_preference = 0.207951,
|
||||
... huber_loss_delta = 0.121194,
|
||||
... init_cell_selection_weights_to_zero = True,
|
||||
... select_one_column = True,
|
||||
... allow_empty_column_selection = False,
|
||||
... temperature = 0.0352513,
|
||||
... )
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
|
||||
|
||||
>>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
|
||||
|
||||
>>> for epoch in range(2): # loop over the dataset multiple times
|
||||
... for idx, batch in enumerate(train_dataloader):
|
||||
... # unpack the inputs (the indices follow the order of the dataset's output_signature)
|
||||
... input_ids = batch[0]
|
||||
... attention_mask = batch[1]
|
||||
... token_type_ids = batch[4]
|
||||
... labels = batch[-1]
|
||||
... numeric_values = batch[2]
|
||||
... numeric_values_scale = batch[3]
|
||||
... float_answer = batch[6]
|
||||
|
||||
... # forward + backward + optimize
|
||||
... with tf.GradientTape() as tape:
|
||||
... outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
|
||||
... labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale,
|
||||
... float_answer=float_answer)
|
||||
... grads = tape.gradient(outputs.loss, model.trainable_weights)
|
||||
... optimizer.apply_gradients(zip(grads, model.trainable_weights))
|
||||
|
||||
|
||||
|
||||
Usage: inference
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
@ -380,10 +511,68 @@ of that:
|
||||
What is the total number of movies?
|
||||
Predicted answer: SUM > 87, 53, 69
|
||||
|
||||
|
||||
And here is the equivalent code for TensorFlow:
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering
|
||||
>>> import pandas as pd
|
||||
|
||||
>>> model_name = 'google/tapas-base-finetuned-wtq'
|
||||
>>> model = TFTapasForQuestionAnswering.from_pretrained(model_name)
|
||||
>>> tokenizer = TapasTokenizer.from_pretrained(model_name)
|
||||
|
||||
>>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
|
||||
>>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
|
||||
>>> table = pd.DataFrame.from_dict(data)
|
||||
>>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf")
|
||||
>>> outputs = model(**inputs)
|
||||
>>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
|
||||
... inputs,
|
||||
... outputs.logits,
|
||||
... outputs.logits_aggregation
|
||||
... )
|
||||
|
||||
>>> # let's print out the results:
|
||||
>>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
|
||||
>>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
|
||||
|
||||
>>> answers = []
|
||||
>>> for coordinates in predicted_answer_coordinates:
|
||||
... if len(coordinates) == 1:
|
||||
... # only a single cell:
|
||||
... answers.append(table.iat[coordinates[0]])
|
||||
... else:
|
||||
... # multiple cells
|
||||
... cell_values = []
|
||||
... for coordinate in coordinates:
|
||||
... cell_values.append(table.iat[coordinate])
|
||||
... answers.append(", ".join(cell_values))
|
||||
|
||||
>>> display(table)
|
||||
>>> print("")
|
||||
>>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
|
||||
... print(query)
|
||||
... if predicted_agg == "NONE":
|
||||
... print("Predicted answer: " + answer)
|
||||
... else:
|
||||
... print("Predicted answer: " + predicted_agg + " > " + answer)
|
||||
What is the name of the first actor?
|
||||
Predicted answer: Brad Pitt
|
||||
How many movies has George Clooney played in?
|
||||
Predicted answer: COUNT > 69
|
||||
What is the total number of movies?
|
||||
Predicted answer: SUM > 87, 53, 69
|
||||
|
||||
|
||||
In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such
|
||||
that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question
|
||||
pair. Again, more info can be found in `this notebook
|
||||
<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__.
|
||||
<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
|
||||
(for PyTorch) and `this notebook
|
||||
<https://github.com/kamalkraj/Tapas-Tutorial/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
|
||||
(for TensorFlow).
|
||||
|
||||
|
||||
Tapas specific outputs
|
||||
@ -433,3 +622,31 @@ TapasForQuestionAnswering
|
||||
|
||||
.. autoclass:: transformers.TapasForQuestionAnswering
|
||||
:members: forward
|
||||
|
||||
|
||||
TFTapasModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTapasModel
|
||||
:members: call
|
||||
|
||||
|
||||
TFTapasForMaskedLM
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTapasForMaskedLM
|
||||
:members: call
|
||||
|
||||
|
||||
TFTapasForSequenceClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTapasForSequenceClassification
|
||||
:members: call
|
||||
|
||||
|
||||
TFTapasForQuestionAnswering
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFTapasForQuestionAnswering
|
||||
:members: call
|
||||
|
@ -39,3 +39,10 @@ VisionEncoderDecoderModel
|
||||
|
||||
.. autoclass:: transformers.VisionEncoderDecoderModel
|
||||
:members: forward, from_encoder_decoder_pretrained
|
||||
|
||||
|
||||
FlaxVisionEncoderDecoderModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.FlaxVisionEncoderDecoderModel
|
||||
:members: __call__, from_encoder_decoder_pretrained
|
||||
|
@ -120,6 +120,20 @@ ViTForImageClassification
|
||||
:members: forward
|
||||
|
||||
|
||||
TFViTModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFViTModel
|
||||
:members: call
|
||||
|
||||
|
||||
TFViTForImageClassification
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
.. autoclass:: transformers.TFViTForImageClassification
|
||||
:members: call
|
||||
|
||||
|
||||
FlaxVitModel
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
|
@ -170,27 +170,44 @@ With `chunks=1` you end up with the naive MP, which is very inefficient. With a
|
||||
|
||||
While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable high concurrent GPU utilization across all participating GPUs, which translates to minimizing the size of the bubble.
|
||||
|
||||
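For illustration, here is a minimal sketch of how `chunks` is exposed by the PyTorch `Pipe` API listed under "Implementations" below (a toy 2-stage pipeline; it assumes 2 GPUs and initializes the RPC framework, which `Pipe` requires):

```python
import os
import torch
from torch.distributed import rpc
from torch.distributed.pipeline.sync import Pipe

# Pipe relies on the RPC framework, even in a single process
os.environ.setdefault("MASTER_ADDR", "localhost")
os.environ.setdefault("MASTER_PORT", "29500")
rpc.init_rpc("worker", rank=0, world_size=1)

# a toy 2-stage pipeline: one stage per GPU
stage1 = torch.nn.Linear(16, 8).to("cuda:0")
stage2 = torch.nn.Linear(8, 4).to("cuda:1")
model = Pipe(torch.nn.Sequential(stage1, stage2), chunks=4)  # each mini-batch is split into 4 micro-batches

output = model(torch.randn(32, 16, device="cuda:0")).local_value()
print(output.shape)  # torch.Size([32, 4])
```
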
Problems:
|
||||
There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user.
|
||||
|
||||
Traditional Pipeline API solutions:
|
||||
- PyTorch
|
||||
- FairScale
|
||||
- DeepSpeed
|
||||
- Megatron-LM
|
||||
|
||||
Modern solutions:
|
||||
- Varuna
|
||||
- Sagemaker
|
||||
|
||||
Problems with traditional Pipeline API solutions:
|
||||
- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
|
||||
- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
|
||||
- have to arrange each layer so that the output of one model becomes an input to the other model
|
||||
- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
|
||||
- have to arrange each layer so that the output of one model becomes an input to the other model.
|
||||
|
||||
We have yet to experiment with Varuna and SageMaker, but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model.
|
||||
|
||||
Implementations:
|
||||
- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
|
||||
- [FairScale](https://fairscale.readthedocs.io/en/latest/tutorials/pipe.html)
|
||||
- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
|
||||
|
||||
🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive PP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that.
|
||||
|
||||
Other approaches:
|
||||
|
||||
DeepSpeed and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)
|
||||
DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)
|
||||

|
||||
|
||||
Here the bubble (idle time) is further minimized by prioritizing backward passes.
|
||||
|
||||
According to [the same document](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html), it might be able to automate the non `nn.Sequential` model conversion to pipeline. The only problem is that this is currently only available at AWS, so you can't run it on your own hardware.
|
||||
Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling.
|
||||
|
||||
|
||||
## Tensor Parallelism
|
||||
@ -220,12 +237,15 @@ Special considerations: TP requires very fast network, and therefore it's not ad
|
||||
This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530).
|
||||
by [@anton-l](https://github.com/anton-l).
|
||||
|
||||
SageMaker combines TP with DP for more efficient processing.
|
||||
|
||||
Alternative names:
|
||||
- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
|
||||
|
||||
Implementations:
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
|
||||
- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
|
||||
|
||||
🤗 Transformers status:
|
||||
- core: not yet implemented in the core
|
||||
@ -247,6 +267,8 @@ Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
|
||||
Implementations:
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
|
||||
🤗 Transformers status: not yet implemented
|
||||
|
||||
@ -264,6 +286,8 @@ Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
|
||||
Implementations:
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
|
||||
🤗 Transformers status: not yet implemented, since we have no PP and TP.
|
||||
|
||||
|
@ -164,10 +164,49 @@ Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
|
||||
### Anatomy of Model's Memory
|
||||
|
||||
The components on GPU memory are the following:
|
||||
- the model weights
|
||||
- the forward activations saved for gradient computation
|
||||
- the gradients
|
||||
- the optimizer state
|
||||
1. model weights
|
||||
2. optimizer states
|
||||
3. gradients
|
||||
4. forward activations saved for gradient computation
|
||||
5. temporary buffers
|
||||
6. functionality-specific memory
|
||||
|
||||
A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory.
|
||||
|
||||
For inference there are no optimizer states and gradients, so we can subtract those. This leaves us with 6 bytes per model parameter for mixed precision inference, plus activation memory.
|
||||
|
||||
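To make these numbers concrete, here is a quick back-of-the-envelope sketch (pure arithmetic, no framework calls) for a hypothetical 1-billion-parameter model; the individual contributions are broken down in the subsections below:

```python
def training_bytes_per_param():
    weights = 6    # fp16 copy (2 bytes) + fp32 master weights (4 bytes) in mixed precision
    optimizer = 8  # AdamW keeps two fp32 states per parameter
    gradients = 4  # gradients are kept in fp32
    return weights + optimizer + gradients  # 18 bytes per parameter

num_params = 1_000_000_000  # hypothetical 1B-parameter model

print(f"training:  ~{training_bytes_per_param() * num_params / 2**30:.1f} GiB + activations")  # ~16.8 GiB
print(f"inference: ~{6 * num_params / 2**30:.1f} GiB + activations")                           # ~5.6 GiB
```
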
Let's look at the details.
|
||||
|
||||
#### Model Weights
|
||||
|
||||
- 4 bytes * number of parameters for fp32 training
|
||||
- 6 bytes * number of parameters for mixed precision training
|
||||
|
||||
#### Optimizer States
|
||||
|
||||
- 8 bytes * number of parameters for normal AdamW (maintains 2 states)
|
||||
- 2 bytes * number of parameters for 8-bit AdamW optimizers like [bitsandbytes](https://github.com/facebookresearch/bitsandbytes)
|
||||
- 4 bytes * number of parameters for optimizers like SGD (maintains only 1 state)
|
||||
|
||||
#### Gradients
|
||||
|
||||
- 4 bytes * number of parameters for either fp32 or mixed precision training
|
||||
|
||||
#### Forward Activations
|
||||
|
||||
- size depends on many factors, the key ones being sequence length, hidden size and batch size.
|
||||
|
||||
There are the input and output that are being passed and returned by the forward and the backward functions and the forward activations saved for gradient computation.
|
||||
|
||||
#### Temporary Memory
|
||||
|
||||
Additionally, there are all kinds of temporary variables which get released once the calculation is done, but in the meantime they can require additional memory and push the run to OOM. Therefore, when coding it's crucial to think strategically about such temporary variables and sometimes to explicitly free them as soon as they are no longer needed.
|
||||
|
||||
#### Functionality-specific memory
|
||||
|
||||
Then your software could have special memory needs. For example, when generating text using beam search, the software needs to maintain multiple copies of inputs and outputs.
|
||||
|
||||
|
||||
|
||||
### `forward` vs `backward` Execution Speed
|
||||
|
||||
@ -225,7 +264,7 @@ Some amazing tutorials to read on mixed precision:
|
||||
|
||||
PyTorch's `autocast`, which performs AMP, includes a caching feature that speeds things up by caching fp16-converted values. Here is the full description from this [comment](https://discuss.pytorch.org/t/autocast-and-torch-no-grad-unexpected-behaviour/93475/3):
|
||||
|
||||
Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)
|
||||
Autocast maintains a cache of the FP16 casts of model parameters (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 parameters get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)
|
||||
|
||||
|
||||
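As a minimal sketch of that recommended usage (gradient scaling via `torch.cuda.amp.GradScaler`, which you would normally add, is omitted here for brevity):

```python
import torch

model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
x = torch.randn(8, 16, device="cuda")
target = torch.randn(8, 4, device="cuda")

for _ in range(3):
    optimizer.zero_grad()
    # autocast wraps the forward pass only, so the cache of fp16-casted weights
    # lives exactly for the duration of one forward pass ...
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(x), target)
    # ... and is dropped here, before backward() and the optimizer update
    loss.backward()
    optimizer.step()
```
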
### Gradient Checkpointing
|
||||
|
131
docs/source/pr_checks.md
Normal file
131
docs/source/pr_checks.md
Normal file
@ -0,0 +1,131 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Checks on a Pull Request
|
||||
|
||||
When you open a pull request on 🤗 Transformers, a fair number of checks will be run to make sure the patch you are adding is not breaking anything existing. Those checks are of four types:
|
||||
- regular tests
|
||||
- documentation build
|
||||
- code and documentation style
|
||||
- general repository consistency
|
||||
|
||||
In this document, we will take a stab at explaining what those various checks are and the reason behind them, as well as how to debug them locally if one of them fails on your PR.
|
||||
|
||||
Note that they all require you to have a dev install:
|
||||
|
||||
```bash
|
||||
pip install transformers[dev]
|
||||
```
|
||||
|
||||
or for an editable install:
|
||||
|
||||
```bash
|
||||
pip install -e .[dev]
|
||||
```
|
||||
|
||||
inside the Transformers repo.
|
||||
|
||||
## Tests
|
||||
|
||||
All the jobs that begin with `ci/circleci: run_tests_` run parts of the Transformers testing suite. Each of those jobs focuses on a part of the library in a certain environment: for instance `ci/circleci: run_tests_pipelines_tf` runs the pipeline tests in an environment where only TensorFlow is installed.
|
||||
|
||||
Note that to avoid running tests when there is no real change in the modules they are testing, only part of the test suite is run each time: a utility is run to determine the differences in the library between before and after the PR (what GitHub shows you in the "Files changes" tab) and picks the tests impacted by that diff. That utility can be run locally with:
|
||||
|
||||
```bash
|
||||
python utils/test_fetcher.py
|
||||
```
|
||||
|
||||
from the root of the Transformers repo. It will:
|
||||
|
||||
1. Check for each file in the diff if the changes are in the code or only in comments or docstrings. Only the files with real code changes are kept.
|
||||
2. Build an internal map that gives, for each file of the source code of the library, all the files it recursively impacts. Module A is said to impact module B if module B imports module A. For the recursive impact, we need a chain of modules going from module A to module B in which each module imports the previous one (a toy sketch of this idea is shown after this list).
|
||||
3. Apply this map on the files gathered in step 1, which gives us the list of model files impacted by the PR.
|
||||
4. Map each of those files to their corresponding test file(s) and get the list of tests to run.
|
||||
|
||||
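As a toy sketch of the idea behind step 2 (this is not the actual utility, just an illustration of the reverse-dependency map, with hypothetical file names):

```python
# module -> modules it imports (hypothetical example)
imports = {
    "models/bert/modeling_bert.py": ["modeling_utils.py"],
    "modeling_utils.py": ["file_utils.py"],
}

# invert the map: file -> files that import it directly
impacted_by = {}
for module, deps in imports.items():
    for dep in deps:
        impacted_by.setdefault(dep, set()).add(module)

def all_impacted(module, seen=None):
    """Recursively collect every file impacted by a change to `module`."""
    seen = set() if seen is None else seen
    for child in impacted_by.get(module, ()):
        if child not in seen:
            seen.add(child)
            all_impacted(child, seen)
    return seen

print(sorted(all_impacted("file_utils.py")))
# ['modeling_utils.py', 'models/bert/modeling_bert.py']
```
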
When executing the script locally, you should get the results of step 1, 3 and 4 printed and thus know which tests are run. The script will also create a file named `test_list.txt` which contains the list of tests to run, and you can run them locally with the following command:
|
||||
|
||||
```bash
|
||||
python -m pytest -n 8 --dist=loadfile -rA -s $(cat test_list.txt)
|
||||
```
|
||||
|
||||
Just in case anything slipped through the cracks, the full test suite is also run daily.
|
||||
|
||||
## Documentation build
|
||||
|
||||
The job `ci/circleci: build_doc` runs a build of the documentation just to make sure everything will be okay once your PR is merged. If that step fails, you can inspect it locally by going into the `docs` folder of the Transformers repo and then typing
|
||||
|
||||
```bash
|
||||
make html
|
||||
```
|
||||
|
||||
Sphinx is not known for its helpful error messages, so you might have to try a few things to really find the source of the error.
|
||||
|
||||
## Code and documentation style
|
||||
|
||||
Code formatting is applied to all the source files, the examples and the tests using `black` and `isort`. We also have a custom tool taking care of the formatting of docstrings and `rst` files (`utils/style_doc.py`), as well as the order of the lazy imports performed in the Transformers `__init__.py` files (`utils/custom_init_isort.py`). All of this can be launched by executing
|
||||
|
||||
```bash
|
||||
make style
|
||||
```
|
||||
|
||||
The CI checks that those have been applied inside the `ci/circleci: check_code_quality` check. It also runs `flake8`, which takes a basic look at your code and complains if it finds an undefined variable, or one that is not used. To run that check locally, use
|
||||
|
||||
```bash
|
||||
make quality
|
||||
```
|
||||
|
||||
This can take a lot of time, so to run the same thing on only the files you modified in the current branch, run
|
||||
|
||||
```bash
|
||||
make fixup
|
||||
```
|
||||
|
||||
This last command will also run all the additional checks for the repository consistency. Let's have a look at them.
|
||||
|
||||
## Repository consistency
|
||||
|
||||
This regroups all the tests to make sure your PR leaves the repository in a good state, and is performed by the `ci/circleci: check_repository_consistency` check. You can locally run that check by executing the following:
|
||||
|
||||
```bash
|
||||
make repo-consistency
|
||||
```
|
||||
|
||||
This checks that:
|
||||
|
||||
- All objects added to the init are documented (performed by `utils/check_repo.py`)
|
||||
- All `__init__.py` files have the same content in their two sections (performed by `utils/check_inits.py`)
|
||||
- All code identified as a copy from another module is consistent with the original (performed by `utils/check_copies.py`)
|
||||
- The translations of the READMEs and the index of the doc have the same model list as the main README (performed by `utils/check_copies.py`)
|
||||
- The auto-generated tables in the documentation are up to date (performed by `utils/check_table.py`)
|
||||
- The library has all objects available even if not all optional dependencies are installed (performed by `utils/check_dummies.py`)
|
||||
|
||||
Should this check fail, the first two items require manual fixing; the last four can be fixed automatically for you by running the command
|
||||
|
||||
```bash
|
||||
make fix-copies
|
||||
```
|
||||
|
||||
Additional checks concern PRs that add new models, mainly that:
|
||||
|
||||
- All models added are in an Auto-mapping (performed by `utils/check_repo.py`)
|
||||
<!-- TODO Sylvain, add a check that makes sure the common tests are implemented.-->
|
||||
- All models are properly tested (performed by `utils/check_repo.py`)
|
||||
|
||||
<!-- TODO Sylvain, add the following
|
||||
- All models are added to the main README, inside the master doc
|
||||
- All checkpoints used actually exist on the Hub
|
||||
|
||||
-->
|
@ -51,6 +51,15 @@ The easiest way to use a pretrained model on a given task is to use :func:`~tran
|
||||
Let's see how this works for sentiment analysis (the other tasks are all covered in the :doc:`task summary
|
||||
</task_summary>`):
|
||||
|
||||
Install the following dependencies (if not already installed):
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
## PYTORCH CODE
|
||||
pip install torch
|
||||
## TENSORFLOW CODE
|
||||
pip install tensorflow
|
||||
|
||||
.. code-block::
|
||||
|
||||
>>> from transformers import pipeline
|
||||
@ -337,27 +346,42 @@ Once your model is fine-tuned, you can save it with its tokenizer in the followi
|
||||
|
||||
.. code-block::
|
||||
|
||||
tokenizer.save_pretrained(save_directory)
|
||||
model.save_pretrained(save_directory)
|
||||
>>> ## PYTORCH CODE
|
||||
>>> pt_save_directory = './pt_save_pretrained'
|
||||
>>> tokenizer.save_pretrained(pt_save_directory)
|
||||
>>> pt_model.save_pretrained(pt_save_directory)
|
||||
>>> ## TENSORFLOW CODE
|
||||
>>> tf_save_directory = './tf_save_pretrained'
|
||||
>>> tokenizer.save_pretrained(tf_save_directory)
|
||||
>>> tf_model.save_pretrained(tf_save_directory)
|
||||
|
||||
You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
|
||||
directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
|
||||
PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
|
||||
loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
|
||||
PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow.
|
||||
|
||||
|
||||
If you would like to load your saved model in the other framework, first make sure it is installed:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
## PYTORCH CODE
|
||||
pip install tensorflow
|
||||
## TENSORFLOW CODE
|
||||
pip install torch
|
||||
|
||||
Then, use the corresponding Auto class to load it like this:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from transformers import TFAutoModel
|
||||
tokenizer = AutoTokenizer.from_pretrained(save_directory)
|
||||
model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
|
||||
## PYTORCH CODE
|
||||
>>> from transformers import TFAutoModel
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModel.from_pretrained(pt_save_directory, from_pt=True)
|
||||
## TENSORFLOW CODE
|
||||
>>> from transformers import AutoModel
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModel.from_pretrained(tf_save_directory, from_tf=True)
|
||||
|
||||
and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
|
||||
|
||||
.. code-block::
|
||||
|
||||
from transformers import AutoModel
|
||||
tokenizer = AutoTokenizer.from_pretrained(save_directory)
|
||||
model = AutoModel.from_pretrained(save_directory, from_tf=True)
|
||||
|
||||
Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
datasets >= 1.1.3
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.9
|
||||
|
@ -27,6 +27,7 @@ import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Callable, Optional
|
||||
|
||||
@ -430,7 +431,7 @@ def main():
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
|
@ -25,6 +25,7 @@ import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
|
||||
# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
|
||||
from pathlib import Path
|
||||
@ -453,7 +454,7 @@ if __name__ == "__main__":
|
||||
# max_seq_length.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
|
@ -25,6 +25,7 @@ import os
|
||||
import sys
|
||||
import time
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
@ -290,7 +291,7 @@ class FlaxDataCollatorForT5MLM:
|
||||
start_indices[:, 0] = mask_indices[:, 0]
|
||||
|
||||
sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
|
||||
sentinel_ids = np.where(sentinel_ids != 0, (sentinel_ids + self.tokenizer.vocab_size - 1), 0)
|
||||
sentinel_ids = np.where(sentinel_ids != 0, (len(self.tokenizer) - sentinel_ids), 0)
|
||||
sentinel_ids -= mask_indices - start_indices
|
||||
|
||||
return sentinel_ids
|
||||
@ -563,7 +564,7 @@ if __name__ == "__main__":
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
|
@ -1,5 +1,5 @@
|
||||
datasets >= 1.8.0
|
||||
jax>=0.2.17
|
||||
jaxlib>=0.1.68
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
@ -60,7 +60,7 @@ from utils_qa import postprocess_qa_predictions
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
@ -1,5 +1,5 @@
|
||||
datasets >= 1.1.3
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
||||
|
@ -1,5 +1,5 @@
|
||||
datasets >= 1.1.3
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
||||
|
@ -1,6 +1,6 @@
|
||||
datasets >= 1.8.0
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
||||
seqeval
|
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
@ -598,7 +598,7 @@ def main():
|
||||
state, train_metric, dropout_rngs = p_train_step(state, batch, dropout_rngs)
|
||||
train_metrics.append(train_metric)
|
||||
|
||||
cur_step = epoch * step_per_epoch + step
|
||||
cur_step = (epoch * step_per_epoch) + (step + 1)
|
||||
|
||||
if cur_step % training_args.logging_steps == 0 and cur_step > 0:
|
||||
# Save metrics
|
||||
|
@ -1,6 +1,6 @@
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
||||
-f https://download.pytorch.org/whl/torch_stable.html
|
||||
torch==1.9.0+cpu
|
||||
|
126
examples/legacy/question-answering/README.md
Normal file
126
examples/legacy/question-answering/README.md
Normal file
@ -0,0 +1,126 @@
|
||||
#### Fine-tuning BERT on SQuAD1.0 with relative position embeddings
|
||||
|
||||
The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model
|
||||
`bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained
|
||||
models which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as in the BERT model
|
||||
training, but with different relative position embeddings.
|
||||
|
||||
* `zhiheng-huang/bert-base-uncased-embedding-relative-key`, trained from scratch with relative embedding proposed by
|
||||
Shaw et al., [Self-Attention with Relative Position Representations](https://arxiv.org/abs/1803.02155)
|
||||
* `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4
|
||||
in Huang et al. [Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)
|
||||
* `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from model
|
||||
`bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al.
|
||||
[Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)
|
||||
|
||||
|
||||
##### Base models fine-tuning
|
||||
|
||||
```bash
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 512 \
|
||||
--doc_stride 128 \
|
||||
--output_dir relative_squad \
|
||||
--per_device_eval_batch_size=60 \
|
||||
--per_device_train_batch_size=6
|
||||
```
|
||||
Training with the above command leads to the following results. It boosts the default BERT f1 score from 88.52 to 90.54.
|
||||
|
||||
```bash
|
||||
'exact': 83.6802270577105, 'f1': 90.54772098174814
|
||||
```
|
||||
|
||||
Changing `max_seq_length` from 512 to 384 in the above command leads to an f1 score of 90.34. Replacing the above
model `zhiheng-huang/bert-base-uncased-embedding-relative-key-query` with
`zhiheng-huang/bert-base-uncased-embedding-relative-key` leads to an f1 score of 89.51. Training on a single GPU
instead of 8 GPUs leads to an f1 score of 90.71.
|
||||
|
||||
##### Large models fine-tuning
|
||||
|
||||
```bash
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 512 \
|
||||
--doc_stride 128 \
|
||||
--output_dir relative_squad \
|
||||
--per_gpu_eval_batch_size=6 \
|
||||
--per_gpu_train_batch_size=2 \
|
||||
--gradient_accumulation_steps 3
|
||||
```
|
||||
Training with the above command leads to the f1 score of 93.52, which is slightly better than the f1 score of 93.15 for
|
||||
`bert-large-uncased-whole-word-masking`.
|
||||
|
||||
#### Distributed training
|
||||
|
||||
Here is an example using distributed training on 8 V100 GPUs and the BERT Whole Word Masking uncased model to reach an F1 > 93 on SQuAD1.1:
|
||||
|
||||
```bash
|
||||
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path bert-large-uncased-whole-word-masking \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
|
||||
--per_device_eval_batch_size=3 \
|
||||
--per_device_train_batch_size=3
|
||||
```
|
||||
|
||||
Training with the previously defined hyper-parameters yields the following results:
|
||||
|
||||
```bash
|
||||
f1 = 93.15
|
||||
exact_match = 86.91
|
||||
```
|
||||
|
||||
This fine-tuned model is available as a checkpoint under the reference
|
||||
[`bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).
|
||||
|
||||
## Results
|
||||
|
||||
A larger batch size may improve performance at the cost of more memory.
|
||||
|
||||
##### Results for SQuAD1.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
"exact": 85.45884578997162,
|
||||
"f1": 92.5974600601065,
|
||||
"total": 10570,
|
||||
"HasAns_exact": 85.45884578997162,
|
||||
"HasAns_f1": 92.59746006010651,
|
||||
"HasAns_total": 10570
|
||||
}
|
||||
```
|
||||
|
||||
##### Results for SQuAD2.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
"exact": 80.4177545691906,
|
||||
"f1": 84.07154997729623,
|
||||
"total": 11873,
|
||||
"HasAns_exact": 76.73751686909581,
|
||||
"HasAns_f1": 84.05558584352873,
|
||||
"HasAns_total": 5928,
|
||||
"NoAns_exact": 84.0874684608915,
|
||||
"NoAns_f1": 84.0874684608915,
|
||||
"NoAns_total": 5945
|
||||
}
|
||||
```
|
@ -143,5 +143,4 @@ It has been verified that the script works for the following datasets:
|
||||
| Keyword Spotting | [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) | 12 | 0.9826 | 1 V100 GPU | 14min | [here](https://huggingface.co/anton-l/wav2vec2-base-ft-keyword-spotting) |
|
||||
| Keyword Spotting | [facebook/hubert-base-ls960](https://huggingface.co/facebook/hubert-base-ls960) | 12 | 0.9819 | 1 V100 GPU | 14min | [here](https://huggingface.co/anton-l/hubert-base-ft-keyword-spotting) |
|
||||
| Keyword Spotting | [asapp/sew-mid-100k](https://huggingface.co/asapp/sew-mid-100k) | 24 | 0.9757 | 1 V100 GPU | 15min | [here](https://huggingface.co/anton-l/sew-mid-100k-ft-keyword-spotting) |
|
||||
| Common Language | [ntu-spml/distilhubert](https://huggingface.co/ntu-spml/distilhubert) | 2 | 0.2797 | 4 V100 GPUs | 38min | [here](https://huggingface.co/anton-l/distilhubert-ft-common-language) |
|
||||
| Common Language | [facebook/wav2vec2-base](https://huggingface.co/facebook/wav2vec2-base) | 12 | 0.7945 | 4 V100 GPUs | 1h10m | [here](https://huggingface.co/anton-l/wav2vec2-base-lang-id) |
|
||||
|
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
|
||||
|
||||
|
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
@ -26,6 +26,7 @@ import math
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
@ -51,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -324,6 +325,7 @@ def main():
|
||||
if model_args.config_overrides is not None:
|
||||
logger.info(f"Overriding config: {model_args.config_overrides}")
|
||||
config.update_from_string(model_args.config_overrides)
|
||||
logger.info(f"New config: {config}")
|
||||
|
||||
tokenizer_kwargs = {
|
||||
"cache_dir": model_args.cache_dir,
|
||||
@ -407,7 +409,7 @@ def main():
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
|
@ -27,6 +27,7 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
@ -366,7 +367,7 @@ def main():
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
@ -507,7 +508,9 @@ def main():
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
@ -516,7 +519,7 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training")
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -26,6 +26,7 @@ import math
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from itertools import chain
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
@ -50,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -326,6 +327,7 @@ def main():
|
||||
if model_args.config_overrides is not None:
|
||||
logger.info(f"Overriding config: {model_args.config_overrides}")
|
||||
config.update_from_string(model_args.config_overrides)
|
||||
logger.info(f"New config: {config}")
|
||||
|
||||
tokenizer_kwargs = {
|
||||
"cache_dir": model_args.cache_dir,
|
||||
@ -431,7 +433,7 @@ def main():
|
||||
# max_seq_length.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
|
@ -27,6 +27,7 @@ import logging
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
|
||||
import datasets
|
||||
@ -406,7 +407,7 @@ def main():
|
||||
# max_seq_length.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
|
||||
@ -548,7 +549,9 @@ def main():
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
@ -557,7 +560,7 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training")
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
@ -23,6 +23,7 @@ import math
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional

import datasets
@ -46,7 +47,7 @@ from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -318,6 +319,7 @@ def main():
if model_args.config_overrides is not None:
logger.info(f"Overriding config: {model_args.config_overrides}")
config.update_from_string(model_args.config_overrides)
logger.info(f"New config: {config}")

tokenizer_kwargs = {
"cache_dir": model_args.cache_dir,
@ -402,7 +404,7 @@ def main():
# max_seq_length.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.

@ -22,6 +22,7 @@ import logging
import os
import sys
from dataclasses import dataclass, field
from itertools import chain
from typing import Optional, Union

import datasets
@ -47,7 +48,7 @@ from transformers.utils import check_min_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

logger = logging.getLogger(__name__)

@ -185,7 +186,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))

batch = self.tokenizer.pad(
flattened_features,
@ -333,8 +334,8 @@ def main():
]

# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))

# Tokenize
tokenized_examples = tokenizer(

@ -24,6 +24,7 @@ import math
import os
import random
from dataclasses import dataclass
from itertools import chain
from pathlib import Path
from typing import Optional, Union

@ -224,7 +225,7 @@ class DataCollatorForMultipleChoice:
flattened_features = [
[{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
]
flattened_features = sum(flattened_features, [])
flattened_features = list(chain(*flattened_features))

batch = self.tokenizer.pad(
flattened_features,
@ -365,8 +366,8 @@ def main():
labels = examples[label_column_name]

# Flatten out
first_sentences = sum(first_sentences, [])
second_sentences = sum(second_sentences, [])
first_sentences = list(chain(*first_sentences))
second_sentences = list(chain(*second_sentences))

# Tokenize
tokenized_examples = tokenizer(
@ -505,7 +506,9 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)

if args.output_dir is not None:
accelerator.wait_for_everyone()
@ -514,7 +517,7 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)


if __name__ == "__main__":

@ -1,5 +1,5 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Copyright 2021 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@ -14,25 +14,27 @@ See the License for the specific language governing permissions and
limitations under the License.
-->

# SQuAD
# Question answering

Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa.py).

**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
of the script.

The old version of this script can be found [here](https://github.com/huggingface/transformers/tree/master/examples/legacy/question-answering).

`run_qa.py` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture as a `ForQuestionAnswering` version in the library) on the SQUAD dataset or another question-answering dataset of the `datasets` library or your own csv/jsonlines files as long as they are structured the same way as SQUAD. You might need to tweak the data processing inside the script if your data is structured differently.

Note that if your dataset contains samples with no possible answers (like SQUAD version 2), you need to pass along the flag `--version_2_with_negative`.
This folder contains several scripts that showcase how to fine-tune a 🤗 Transformers model on a question answering dataset,
like SQuAD.

## Trainer-based scripts

The [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa.py),
[`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa_beam_search.py) and [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_seq2seq_qa.py) leverage the 🤗 [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) for fine-tuning.

### Fine-tuning BERT on SQuAD1.0

The [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa.py) script
allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture has a `ForQuestionAnswering` version in the library) on a question-answering dataset (such as SQuAD, or any other QA dataset available in the `datasets` library, or your own csv/jsonlines files) as long as they are structured the same way as SQuAD. You might need to tweak the data processing inside the script if your data is structured differently.

**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version of the script which can be found [here](https://github.com/huggingface/transformers/tree/master/examples/legacy/question-answering).

Note that if your dataset contains samples with no possible answers (like SQuAD version 2), you need to pass along the flag `--version_2_with_negative`.

This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large)
on a single tesla V100 16GB.

@ -57,62 +59,11 @@ f1 = 88.52
exact_match = 81.22
```

### Fine-tuning T5 on SQuAD2.0
### Fine-tuning XLNet with beam search on SQuAD

This example code fine-tunes T5 on the SQuAD2.0 dataset.
The [`run_qa_beam_search.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_qa_beam_search.py) script is only meant to fine-tune XLNet, which is a special encoder-only Transformer model. The example code below fine-tunes XLNet on the SQuAD1.0 and SQuAD2.0 datasets.

```bash
python run_seq2seq_qa.py \
--model_name_or_path t5-small \
--dataset_name squad_v2 \
--context_column context \
--question_column question \
--answer_column answer \
--do_train \
--do_eval \
--per_device_train_batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir /tmp/debug_seq2seq_squad/
```


#### Distributed training

Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:

```bash
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
--model_name_or_path bert-large-uncased-whole-word-masking \
--dataset_name squad \
--do_train \
--do_eval \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
--per_device_eval_batch_size=3 \
--per_device_train_batch_size=3 \
```

Training with the previously defined hyper-parameters yields the following results:

```bash
f1 = 93.15
exact_match = 86.91
```

This fine-tuned model is available as a checkpoint under the reference
[`bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad).

#### Fine-tuning XLNet with beam search on SQuAD

This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset.

##### Command for SQuAD1.0:
#### Command for SQuAD1.0:

```bash
python run_qa_beam_search.py \
@ -130,7 +81,7 @@ python run_qa_beam_search.py \
--save_steps 5000
```

##### Command for SQuAD2.0:
#### Command for SQuAD2.0:

```bash
export SQUAD_DIR=/path/to/SQUAD
@ -151,18 +102,38 @@ python run_qa_beam_search.py \
--save_steps 5000
```

## With Accelerate
### Fine-tuning T5 on SQuAD2.0

Based on the script `run_qa_no_trainer.py` and `run_qa_beam_search_no_trainer.py`.
The [`run_seq2seq_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/pytorch/question-answering/run_seq2seq_qa.py) script is meant for encoder-decoder (also called seq2seq) Transformer models, such as T5 or BART. These
models are generative, rather than discriminative. This means that they learn to generate the correct answer, rather than predicting the start and end position of the tokens of the answer.

This example code fine-tunes T5 on the SQuAD2.0 dataset.

```bash
python run_seq2seq_qa.py \
--model_name_or_path t5-small \
--dataset_name squad_v2 \
--context_column context \
--question_column question \
--answer_column answer \
--do_train \
--do_eval \
--per_device_train_batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 384 \
--doc_stride 128 \
--output_dir /tmp/debug_seq2seq_squad/
```
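Because the seq2seq variant generates the answer text rather than predicting span indices, inference on a fine-tuned checkpoint is plain conditional generation on a `question: ... context: ...` prompt (the same format `run_seq2seq_qa.py` builds internally). A minimal sketch, assuming you either reuse the `--output_dir` from the command above or fall back to the plain `t5-small` checkpoint:

```python
# Minimal inference sketch for a seq2seq QA model. Swap model_name for the
# --output_dir used above once fine-tuning has finished.
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "t5-small"  # or "/tmp/debug_seq2seq_squad/"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

prompt = "question: Who wrote Hamlet? context: Hamlet is a tragedy written by William Shakespeare."
inputs = tokenizer(prompt, return_tensors="pt")
answer_ids = model.generate(**inputs, max_length=32, num_beams=4)
print(tokenizer.decode(answer_ids[0], skip_special_tokens=True))
```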
## Accelerate-based scripts

Based on the scripts `run_qa_no_trainer.py` and `run_qa_beam_search_no_trainer.py`.

Like `run_qa.py` and `run_qa_beam_search.py`, these scripts allow you to fine-tune any of the models supported on a
SQUAD or a similar dataset, the main difference is that this
script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
SQuAD or a similar dataset, the main difference is that this script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like. It offers fewer options than the script with `Trainer` (for instance, you can easily change the options for the optimizer or the dataloaders directly in the script), but still runs in a distributed setup, on TPU, and supports mixed precision by leveraging the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library.

It offers less options than the script with `Trainer` (for instance you can easily change the options for the optimizer
or the dataloaders directly in the script) but still run in a distributed setup, on TPU and supports mixed precision by
the mean of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
after installing it:
You can use the script normally after installing it:

```bash
pip install accelerate
@ -209,103 +180,4 @@ This command is the same and will work for:
- a distributed training with several GPUs (single or multi node)
- a training on TPUs

Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.


## Results

Larger batch size may improve the performance while costing more memory.

##### Results for SQuAD1.0 with the previously defined hyper-parameters:

```python
{
"exact": 85.45884578997162,
"f1": 92.5974600601065,
"total": 10570,
"HasAns_exact": 85.45884578997162,
"HasAns_f1": 92.59746006010651,
"HasAns_total": 10570
}
```

##### Results for SQuAD2.0 with the previously defined hyper-parameters:

```python
{
"exact": 80.4177545691906,
"f1": 84.07154997729623,
"total": 11873,
"HasAns_exact": 76.73751686909581,
"HasAns_f1": 84.05558584352873,
"HasAns_total": 5928,
"NoAns_exact": 84.0874684608915,
"NoAns_f1": 84.0874684608915,
"NoAns_total": 5945
}
```

#### Fine-tuning BERT on SQuAD1.0 with relative position embeddings

The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model
`bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained
models which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as in the BERT model
training, but with different relative position embeddings.

* `zhiheng-huang/bert-base-uncased-embedding-relative-key`, trained from scratch with relative embedding proposed by
Shaw et al., [Self-Attention with Relative Position Representations](https://arxiv.org/abs/1803.02155)
* `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4
in Huang et al. [Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)
* `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from model
`bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al.
[Improve Transformer Models with Better Relative Position Embeddings](https://arxiv.org/abs/2009.13658)


##### Base models fine-tuning

```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
--model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
--dataset_name squad \
--do_train \
--do_eval \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 512 \
--doc_stride 128 \
--output_dir relative_squad \
--per_device_eval_batch_size=60 \
--per_device_train_batch_size=6
```
Training with the above command leads to the following results. It boosts the BERT default from f1 score of 88.52 to 90.54.

```bash
'exact': 83.6802270577105, 'f1': 90.54772098174814
```

The change of `max_seq_length` from 512 to 384 in the above command leads to the f1 score of 90.34. Replacing the above
model `zhiheng-huang/bert-base-uncased-embedding-relative-key-query` with
`zhiheng-huang/bert-base-uncased-embedding-relative-key` leads to the f1 score of 89.51. The changing of 8 gpus to one
gpu training leads to the f1 score of 90.71.

##### Large models fine-tuning

```bash
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
--model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
--dataset_name squad \
--do_train \
--do_eval \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--max_seq_length 512 \
--doc_stride 128 \
--output_dir relative_squad \
--per_gpu_eval_batch_size=6 \
--per_gpu_train_batch_size=2 \
--gradient_accumulation_steps 3
```
Training with the above command leads to the f1 score of 93.52, which is slightly better than the f1 score of 93.15 for
`bert-large-uncased-whole-word-masking`.
Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for question answering.
Fine-tuning the library models for question answering using a slightly adapted version of the 🤗 Trainer.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

@ -48,7 +48,7 @@ from utils_qa import postprocess_qa_predictions


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning XLNet for question answering with beam search.
Fine-tuning XLNet for question answering with beam search using a slightly adapted version of the 🤗 Trainer.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

@ -47,7 +47,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning a 🤗 Transformers model on question answering.
Fine-tuning XLNet for question answering with beam search using 🤗 Accelerate.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

@ -54,7 +54,7 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@ -731,7 +731,9 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)

# initialize all lists to collect the batches
all_start_top_log_probs = []
@ -853,7 +855,7 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)


if __name__ == "__main__":

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning a 🤗 Transformers model on question answering.
Fine-tuning a 🤗 Transformers model for question answering using 🤗 Accelerate.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

@ -56,7 +56,7 @@ from utils_qa import postprocess_qa_predictions


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@ -737,7 +737,9 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)

# Evaluation
logger.info("***** Running Evaluation *****")
@ -816,7 +818,7 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)


if __name__ == "__main__":

@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library's seq2seq models for question answering.
Fine-tuning the library's seq2seq models for question answering using the 🤗 Seq2SeqTrainer.
"""
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.

@ -25,28 +25,26 @@ from dataclasses import dataclass, field
from typing import List, Optional, Tuple

import datasets
import nltk
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from trainer_seq2seq_qa import QuestionAnsweringSeq2SeqTrainer
from transformers import (
AutoConfig,
AutoModelForSeq2SeqLM,
AutoTokenizer,
DataCollatorForSeq2Seq,
HfArgumentParser,
Seq2SeqTrainer,
Seq2SeqTrainingArguments,
set_seed,
)
from transformers.trainer_utils import EvalPrediction, get_last_checkpoint
from transformers.trainer_utils import EvalLoopOutput, EvalPrediction, get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

@ -411,7 +409,7 @@ def main():
)
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

def preprocess_sqaud_batch(
def preprocess_squad_batch(
examples,
question_column: str,
context_column: str,
@ -422,14 +420,14 @@ def main():
answers = examples[answer_column]

def generate_input(_question, _context):
return " ".join(["question:", _question, "context:", _context])
return " ".join(["question:", _question.lstrip(), "context:", _context.lstrip()])

inputs = [generate_input(question, context) for question, context in zip(questions, contexts)]
targets = [answer["text"][0] if len(answer["text"]) > 0 else "" for answer in answers]
return inputs, targets

def preprocess_function(examples):
inputs, targets = preprocess_sqaud_batch(examples, question_column, context_column, answer_column)
inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)

model_inputs = tokenizer(inputs, max_length=max_seq_length, padding=padding, truncation=True)
# Setup the tokenizer for targets
@ -446,6 +444,45 @@ def main():
model_inputs["labels"] = labels["input_ids"]
return model_inputs

# Validation preprocessing
def preprocess_validation_function(examples):
inputs, targets = preprocess_squad_batch(examples, question_column, context_column, answer_column)

model_inputs = tokenizer(
inputs,
max_length=max_seq_length,
padding=padding,
truncation=True,
return_overflowing_tokens=True,
return_offsets_mapping=True,
)
# Setup the tokenizer for targets
with tokenizer.as_target_tokenizer():
labels = tokenizer(targets, max_length=max_answer_length, padding=padding, truncation=True)

# Since one example might give us several features if it has a long context, we need a map from a feature to
# its corresponding example. This key gives us just that.
sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

# For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
# corresponding example_id and we will store the offset mappings.
model_inputs["example_id"] = []

for i in range(len(model_inputs["input_ids"])):
# One example can give several spans, this is the index of the example containing this span of text.
sample_index = sample_mapping[i]
model_inputs["example_id"].append(examples["id"][sample_index])

# If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
# padding in the loss.
if padding == "max_length" and data_args.ignore_pad_token_for_loss:
labels["input_ids"] = [
[(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
]

model_inputs["labels"] = labels["input_ids"]
return model_inputs
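The `example_id` bookkeeping above is the piece that later lets post-processing regroup predictions per question: `overflow_to_sample_mapping` records, for every overflowed feature, the index of the original example it came from. A toy illustration with hand-written values (no tokenizer involved, ids are invented):

```python
# Toy illustration of the example_id bookkeeping above; ids and mapping are invented.
examples = {"id": ["q1", "q2"]}
# Features 0 and 1 both come from example 0 (its context overflowed into two spans),
# feature 2 comes from example 1.
sample_mapping = [0, 0, 1]

example_ids = [examples["id"][sample_index] for sample_index in sample_mapping]
assert example_ids == ["q1", "q1", "q2"]
# At evaluation time this is what lets post-processing pick, for each original
# question, the decoded prediction of one of its features.
```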
if training_args.do_train:
if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
@ -477,7 +514,7 @@ def main():
# Validation Feature Creation
with training_args.main_process_first(desc="validation dataset map pre-processing"):
eval_dataset = eval_examples.map(
preprocess_function,
preprocess_validation_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
@ -498,7 +535,7 @@ def main():
# Predict Feature Creation
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
predict_dataset = predict_examples.map(
preprocess_function,
preprocess_validation_function,
batched=True,
num_proc=data_args.preprocessing_num_workers,
remove_columns=column_names,
@ -518,50 +555,53 @@ def main():
pad_to_multiple_of=8 if training_args.fp16 else None,
)

metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")

def compute_metrics(p: EvalPrediction):
return metric.compute(predictions=p.predictions, references=p.label_ids)

# Post-processing:
def postprocess_text(preds, labels):
preds = [" ".join(pred) for pred in preds]
preds = [pred.strip() for pred in preds]
labels = [label.strip() for label in labels]

# rougeLSum expects newline after each sentence
preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

return preds, labels

metric = load_metric("rouge")

def compute_metrics(eval_preds: EvalPrediction):
preds, labels = eval_preds
def post_processing_function(
examples: datasets.Dataset, features: datasets.Dataset, outputs: EvalLoopOutput, stage="eval"
):
# Decode the predicted tokens.
preds = outputs.predictions
if isinstance(preds, tuple):
preds = preds[0]
decoded_preds = [tokenizer.batch_decode(pred, skip_special_tokens=True) for pred in preds]
if data_args.ignore_pad_token_for_loss:
# Replace -100 in the labels as we can't decode them.
labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

# Some simple post-processing
decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
# Extract a few results from ROUGE
result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
# Build a map example to its corresponding features.
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
feature_per_example = {example_id_to_index[feature["example_id"]]: i for i, feature in enumerate(features)}
predictions = {}
# Let's loop over all the examples!
for example_index, example in enumerate(examples):
# This is the index of the feature associated to the current example.
feature_index = feature_per_example[example_index]
predictions[example["id"]] = decoded_preds[feature_index]

prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
result["gen_len"] = np.mean(prediction_lens)
result = {k: round(v, 4) for k, v in result.items()}
return result
# Format the result to the format the metric expects.
if data_args.version_2_with_negative:
formatted_predictions = [
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
]
else:
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]

references = [{"id": ex["id"], "answers": ex[answer_column]} for ex in examples]
return EvalPrediction(predictions=formatted_predictions, label_ids=references)

# Initialize our Trainer
trainer = Seq2SeqTrainer(
trainer = QuestionAnsweringSeq2SeqTrainer(
model=model,
args=training_args,
train_dataset=train_dataset if training_args.do_train else None,
eval_dataset=eval_dataset if training_args.do_eval else None,
eval_examples=eval_examples if training_args.do_eval else None,
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
post_process_function=post_processing_function,
)

# Training

examples/pytorch/question-answering/trainer_seq2seq_qa.py (new file, 120 additions)
@ -0,0 +1,120 @@
# coding=utf-8
# Copyright 2021 The HuggingFace Team All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
A subclass of `Trainer` specific to Question-Answering tasks
"""
from typing import Dict, List, Optional

from torch.utils.data import Dataset

from transformers import Seq2SeqTrainer, is_torch_tpu_available
from transformers.trainer_utils import PredictionOutput


if is_torch_tpu_available():
import torch_xla.core.xla_model as xm
import torch_xla.debug.metrics as met


class QuestionAnsweringSeq2SeqTrainer(Seq2SeqTrainer):
def __init__(self, *args, eval_examples=None, post_process_function=None, **kwargs):
super().__init__(*args, **kwargs)
self.eval_examples = eval_examples
self.post_process_function = post_process_function

# def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
def evaluate(
self,
eval_dataset: Optional[Dataset] = None,
eval_examples=None,
ignore_keys: Optional[List[str]] = None,
metric_key_prefix: str = "eval",
max_length: Optional[int] = None,
num_beams: Optional[int] = None,
) -> Dict[str, float]:
self._max_length = max_length if max_length is not None else self.args.generation_max_length
self._num_beams = num_beams if num_beams is not None else self.args.generation_num_beams

eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
eval_dataloader = self.get_eval_dataloader(eval_dataset)
eval_examples = self.eval_examples if eval_examples is None else eval_examples

# Temporarily disable metric computation, we will do it in the loop here.
compute_metrics = self.compute_metrics
self.compute_metrics = None
eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
try:
output = eval_loop(
eval_dataloader,
description="Evaluation",
# No point gathering the predictions if there are no metrics, otherwise we defer to
# self.args.prediction_loss_only
prediction_loss_only=True if compute_metrics is None else None,
ignore_keys=ignore_keys,
)
finally:
self.compute_metrics = compute_metrics

if self.post_process_function is not None and self.compute_metrics is not None:
eval_preds = self.post_process_function(eval_examples, eval_dataset, output)
metrics = self.compute_metrics(eval_preds)

# Prefix all keys with metric_key_prefix + '_'
for key in list(metrics.keys()):
if not key.startswith(f"{metric_key_prefix}_"):
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

self.log(metrics)
else:
metrics = {}

if self.args.tpu_metrics_debug or self.args.debug:
# tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
xm.master_print(met.metrics_report())

self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
return metrics

def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
predict_dataloader = self.get_test_dataloader(predict_dataset)

# Temporarily disable metric computation, we will do it in the loop here.
compute_metrics = self.compute_metrics
self.compute_metrics = None
eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
try:
output = eval_loop(
predict_dataloader,
description="Prediction",
# No point gathering the predictions if there are no metrics, otherwise we defer to
# self.args.prediction_loss_only
prediction_loss_only=True if compute_metrics is None else None,
ignore_keys=ignore_keys,
)
finally:
self.compute_metrics = compute_metrics

if self.post_process_function is None or self.compute_metrics is None:
return output

predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
metrics = self.compute_metrics(predictions)

# Prefix all keys with metric_key_prefix + '_'
for key in list(metrics.keys()):
if not key.startswith(f"{metric_key_prefix}_"):
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)

return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)
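For reference, `run_seq2seq_qa.py` wires this class up exactly as shown in the hunk further up; the condensed sketch below only restates that usage and assumes `model`, the datasets, `post_processing_function` and `compute_metrics` are already defined as in that script. The extra `max_length`/`num_beams` arguments are forwarded to the generation-aware evaluation loop of the parent `Seq2SeqTrainer`.

```python
# Condensed usage sketch; assumes the surrounding definitions from run_seq2seq_qa.py.
trainer = QuestionAnsweringSeq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    eval_examples=eval_examples,                     # raw examples kept for post-processing
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,                 # SQuAD exact-match / F1
    post_process_function=post_processing_function,  # maps generated text back to examples
)

trainer.train()
metrics = trainer.evaluate(max_length=30, num_beams=4)   # forwarded to generation
predictions = trainer.predict(predict_dataset, predict_examples)
```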
@ -667,7 +667,11 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)

if (args.push_to_hub and epoch < args.num_train_epochs - 1) and accelerator.is_main_process:
repo.push_to_hub(commit_message=f"Training in progress step {completed_steps}", blocking=False)
repo.push_to_hub(
commit_message=f"Training in progress step {completed_steps}",
blocking=False,
auto_lfs_prune=True,
)

# if completed steps > `args.max_train_steps` stop
if completed_steps >= args.max_train_steps:
@ -714,7 +718,7 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)


if __name__ == "__main__":

@ -142,4 +142,14 @@ they can serve as a baseline to improve upon.
| Dataset | Dataset Config | Pretrained Model | Word error rate on eval | GPU setup | Training time | Fine-tuned Model & Logs | Command to reproduce |
|-------|------------------------------|-------------|---------------|---------------|----------------------|-------------| -------------|
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.36 | 8 GPU V100 | 18min | [here](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo-dist) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo-dist/blob/main/run_dist.sh) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.31 | 8 GPU V100 | 1h05 | [here](https://huggingface.co/patrickvonplaten/wav2vec2-large-xlsr-53-common_voice-tr-ft) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-large-xlsr-53-common_voice-tr-ft/blob/main/run.sh) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.35 | 1 GPU V100 | 1h20min | [here](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-common_voice-tr-demo/blob/main/run.sh) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) | 0.31 | 8 GPU V100 | 1h05 | [here](https://huggingface.co/patrickvonplaten/wav2vec2-large-xls-r-300m-common_voice-tr-ft) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-large-xls-r-300m-common_voice-tr-ft/blob/main/run.sh) |
| [Common Voice](https://huggingface.co/datasets/common_voice)| `"tr"` | [facebook/wav2vec2-xls-r-1b](https://huggingface.co/facebook/wav2vec2-xls-r-1b) | 0.21 | 2 GPU Titan 24 GB RAM | 15h10 | [here](https://huggingface.co/patrickvonplaten/wav2vec2-xls-r-1b-common_voice-tr-ft) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-large-xls-r-1b-common_voice-tr-ft/blob/main/run.sh) |

- [Multilingual Librispeech](https://huggingface.co/datasets/multilingual_librispeech)

| Dataset | Dataset Config | Pretrained Model | Word error rate on eval | GPU setup | Training time | Fine-tuned Model & Logs | Command to reproduce |
|-------|------------------------------|-------------|---------------|---------------|----------------------|-------------| -------------|
| [Multilingual Librispeech](https://huggingface.co/datasets/multilingual_librispeech)| `"german"` | [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) | 0.13 | 1 GPU Titan 24 GB RAM | 15h04 | [here](https://huggingface.co/patrickvonplaten/wav2vec2-xlsr-53-300m-mls-german-ft) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-xlsr-53-300m-mls-german-ft/blob/main/run.sh) |
| [Multilingual Librispeech](https://huggingface.co/datasets/multilingual_librispeech)| `"german"` | [facebook/wav2vec2-xls-r-300m](https://huggingface.co/facebook/wav2vec2-xls-r-300m) | 0.15 | 1 GPU Titan 24 GB RAM | 15h04 | [here](https://huggingface.co/patrickvonplaten/wav2vec2-300m-mls-german-ft) | [run.sh](https://huggingface.co/patrickvonplaten/wav2vec2-300m-mls-german-ft/blob/main/run.sh) |

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.13.3", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@ -99,9 +99,24 @@ class ModelArguments:
metadata={
"help": "Probability of each feature vector along the time axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature"
"vectors will be masked along the time axis. This is only relevant if ``apply_spec_augment is True``."
"vectors will be masked along the time axis."
},
)
mask_time_length: Optional[int] = field(
default=10,
metadata={"help": "Length of vector span to mask along the time axis."},
)
mask_feature_prob: Optional[float] = field(
default=0.0,
metadata={
"help": "Probability of each feature vector along the feature axis to be chosen as the start of the vector"
"span to be masked. Approximately ``mask_feature_prob * sequence_length // mask_feature_length`` feature bins will be masked along the time axis."
},
)
mask_feature_length: Optional[int] = field(
default=10,
metadata={"help": "Length of vector span to mask along the feature axis."},
)
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
ctc_loss_reduction: Optional[str] = field(
default="mean", metadata={"help": "The way the ctc loss should be reduced. Should be one of 'mean' or 'sum'."}
@ -169,6 +184,10 @@ class DataTrainingArguments:
default=None,
metadata={"help": "A list of characters to remove from the transcripts."},
)
eval_metrics: Optional[List[str]] = list_field(
default=["wer"],
metadata={"help": "A list of metrics the model should be evaluated on. E.g. `'wer cer'`"},
)
max_duration_in_seconds: Optional[float] = field(
default=20.0,
metadata={
@ -446,6 +465,9 @@ def main():
"hidden_dropout": model_args.hidden_dropout,
"final_dropout": model_args.final_dropout,
"mask_time_prob": model_args.mask_time_prob,
"mask_time_length": model_args.mask_time_length,
"mask_feature_prob": model_args.mask_feature_prob,
"mask_feature_length": model_args.mask_feature_length,
"gradient_checkpointing": training_args.gradient_checkpointing,
"layerdrop": model_args.layerdrop,
"ctc_loss_reduction": model_args.ctc_loss_reduction,
@ -519,8 +541,8 @@ def main():
# Let's use word error rate (WER) as our evaluation metric,
# instantiate a data collator and the trainer

# Define Metric during training
wer_metric = load_metric("wer")
# Define evaluation metrics during training, *i.e.* word error rate, character error rate
eval_metrics = {metric: load_metric(metric) for metric in data_args.eval_metrics}

# for large datasets it is advised to run the preprocessing on a
# single machine first with ``args.preprocessing_only`` since there will mostly likely
@ -541,9 +563,9 @@ def main():
# we do not want to group tokens when computing the metrics
label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

wer = wer_metric.compute(predictions=pred_str, references=label_str)
metrics = {k: v.compute(predictions=pred_str, references=label_str) for k, v in eval_metrics.items()}

return {"wer": wer}
return metrics

# Instantiate custom data collator
data_collator = DataCollatorCTCWithPadding(processor=processor)
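With the new `--eval_metrics` argument the script builds one `datasets` metric per requested name instead of hard-coding WER. A standalone sketch of the same computation on dummy transcripts (the strings are made up; assumes the metric backends, e.g. `jiwer`, are installed):

```python
# Standalone sketch of computing several ASR metrics at once, mirroring the
# eval_metrics dict built above.
from datasets import load_metric

eval_metrics = {name: load_metric(name) for name in ["wer", "cer"]}

pred_str = ["hello world", "goodbye moon"]
label_str = ["hello world", "goodbye moan"]

metrics = {
    name: metric.compute(predictions=pred_str, references=label_str)
    for name, metric in eval_metrics.items()
}
print(metrics)  # e.g. {'wer': 0.25, 'cer': ...}
```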
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.12.0")
check_min_version("4.13.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

@ -601,7 +601,9 @@ def main():
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
repo.push_to_hub(
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
)

if args.output_dir is not None:
accelerator.wait_for_everyone()
@ -610,7 +612,7 @@ def main():
if accelerator.is_main_process:
tokenizer.save_pretrained(args.output_dir)
if args.push_to_hub:
repo.push_to_hub(commit_message="End of training")
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)


if __name__ == "__main__":

@ -25,7 +25,7 @@ import torch

from transformers import Wav2Vec2ForPreTraining
from transformers.file_utils import is_apex_available
from transformers.testing_utils import TestCasePlus, get_gpu_count, slow, torch_device
from transformers.testing_utils import CaptureLogger, TestCasePlus, get_gpu_count, slow, torch_device


SRC_DIRS = [
@ -157,6 +157,31 @@ class ExamplesTests(TestCasePlus):
result = get_results(tmp_dir)
self.assertLess(result["perplexity"], 100)

def test_run_clm_config_overrides(self):
# test that config_overrides works, despite the misleading dumps of default un-updated
# config via tokenizer

tmp_dir = self.get_auto_remove_tmp_dir()
testargs = f"""
run_clm.py
--model_type gpt2
--tokenizer_name gpt2
--train_file ./tests/fixtures/sample_text.txt
--output_dir {tmp_dir}
--config_overrides n_embd=10,n_head=2
""".split()

if torch_device != "cuda":
testargs.append("--no_cuda")

logger = run_clm.logger
with patch.object(sys, "argv", testargs):
with CaptureLogger(logger) as cl:
run_clm.main()

self.assertIn('"n_embd": 10', cl.out)
self.assertIn('"n_head": 2', cl.out)

def test_run_mlm(self):
stream_handler = logging.StreamHandler(sys.stdout)
logger.addHandler(stream_handler)
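The new test exercises `--config_overrides`, which boils down to the `config.update_from_string(...)` call shown in the `run_clm.py`/`run_mlm.py` hunks earlier in this compare. A small isolated sketch of that call (using `gpt2` to match the test's `--model_type`):

```python
# Isolated sketch of what --config_overrides does under the hood.
from transformers import GPT2Config

config = GPT2Config()
config.update_from_string("n_embd=10,n_head=2")
assert config.n_embd == 10 and config.n_head == 2
print(config)  # the dumped JSON now contains "n_embd": 10 and "n_head": 2
```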
|
||||
@ -274,10 +299,8 @@ class ExamplesTests(TestCasePlus):
|
||||
with patch.object(sys, "argv", testargs):
|
||||
run_squad_seq2seq.main()
|
||||
result = get_results(tmp_dir)
|
||||
self.assertGreaterEqual(result["eval_rouge1"], 10)
|
||||
self.assertGreaterEqual(result["eval_rouge2"], 10)
|
||||
self.assertGreaterEqual(result["eval_rougeL"], 10)
|
||||
self.assertGreaterEqual(result["eval_rougeLsum"], 10)
|
||||
self.assertGreaterEqual(result["eval_f1"], 30)
|
||||
self.assertGreaterEqual(result["eval_exact"], 30)
|
||||
|
||||
def test_run_swag(self):
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
|
@ -168,3 +168,34 @@ This command is the same and will work for:
|
||||
- a training on TPUs
|
||||
|
||||
Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.
|
||||
|
||||
## XNLI
|
||||
|
||||
Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/examples/pytorch/text-classification/run_xnli.py).
|
||||
|
||||
[XNLI](https://www.nyu.edu/projects/bowman/xnli/) is a crowd-sourced dataset based on [MultiNLI](http://www.nyu.edu/projects/bowman/multinli/). It is an evaluation benchmark for cross-lingual text representations. Pairs of text are labeled with textual entailment annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
|
||||
|
||||
#### Fine-tuning on XNLI
|
||||
|
||||
This example code fine-tunes mBERT (multi-lingual BERT) on the XNLI dataset. It runs in 106 mins on a single tesla V100 16GB.
|
||||
|
||||
```bash
|
||||
python run_xnli.py \
|
||||
--model_name_or_path bert-base-multilingual-cased \
|
||||
--language de \
|
||||
--train_language en \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--per_device_train_batch_size 32 \
|
||||
--learning_rate 5e-5 \
|
||||
--num_train_epochs 2.0 \
|
||||
--max_seq_length 128 \
|
||||
--output_dir /tmp/debug_xnli/ \
|
||||
--save_steps -1
|
||||
```
|
||||
|
||||
Training with the previously defined hyper-parameters yields the following results on the **test** set:
|
||||
|
||||
```bash
|
||||
acc = 0.7093812375249501
|
||||
```
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
@ -453,7 +453,9 @@ def main():
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
@ -462,7 +464,7 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training")
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
if args.task_name == "mnli":
|
||||
# Final evaluation on mismatched validation set
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
@ -590,7 +590,9 @@ def main():
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
@ -599,7 +601,7 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training")
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.12.0")
|
||||
check_min_version("4.13.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
|
@ -580,7 +580,9 @@ def main():
|
||||
unwrapped_model.save_pretrained(args.output_dir, save_function=accelerator.save)
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
repo.push_to_hub(commit_message=f"Training in progress epoch {epoch}", blocking=False)
|
||||
repo.push_to_hub(
|
||||
commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
|
||||
)
|
||||
|
||||
if args.output_dir is not None:
|
||||
accelerator.wait_for_everyone()
|
||||
@ -589,7 +591,7 @@ def main():
|
||||
if accelerator.is_main_process:
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
if args.push_to_hub:
|
||||
repo.push_to_hub(commit_message="End of training")
|
||||
repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,7 +1,6 @@
|
||||
{
|
||||
"initializer_range": 0.02,
|
||||
"layer_norm_epsilon": 0.00001,
|
||||
"n_ctx": 1024,
|
||||
"n_embd": 768,
|
||||
"n_head": 12,
|
||||
"n_layer": 6,
|
||||
|
@ -50,13 +50,13 @@ class FlaxHybridCLIPModule(nn.Module):
|
||||
self.visual_projection = nn.Dense(
|
||||
self.projection_dim,
|
||||
dtype=self.dtype,
|
||||
kernel_init=jax.nn.initializers.normal(0.02, dtype=self.dtype),
|
||||
kernel_init=jax.nn.initializers.normal(0.02),
|
||||
use_bias=False,
|
||||
)
|
||||
self.text_projection = nn.Dense(
|
||||
self.projection_dim,
|
||||
dtype=self.dtype,
|
||||
kernel_init=jax.nn.initializers.normal(0.02, dtype=self.dtype),
|
||||
kernel_init=jax.nn.initializers.normal(0.02),
|
||||
use_bias=False,
|
||||
)
|
||||
self.logit_scale = self.param("logit_scale", jax.nn.initializers.ones, [])
|
||||
|
@ -1,6 +1,6 @@
|
||||
jax>=0.2.8
|
||||
jaxlib>=0.1.59
|
||||
flax>=0.3.4
|
||||
flax>=0.3.5
|
||||
optax>=0.0.8
|
||||
-f https://download.pytorch.org/whl/torch_stable.html
|
||||
torch==1.9.0+cpu
|
||||
|
@@ -23,6 +23,7 @@ import os
import sys
import time
from dataclasses import dataclass, field
from itertools import chain
from pathlib import Path
from typing import Callable, Optional

@@ -364,7 +365,7 @@ def main():
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
def group_texts(examples):
# Concatenate all texts.
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
total_length = len(concatenated_examples[list(examples.keys())[0]])
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
# customize this part to your needs.
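As an aside on the hunk above: `list(chain(*examples[k]))` flattens the per-example token lists in a single pass, while `sum(examples[k], [])` rebuilds an intermediate list on every addition. A minimal, self-contained sketch of the equivalence (toy data, not taken from the script):

```python
from itertools import chain

# Toy stand-in for one tokenized column, e.g. examples["input_ids"].
examples = {"input_ids": [[1, 2], [3, 4, 5], [6]]}

# Old approach: repeated list concatenation (quadratic in the number of chunks).
flat_sum = sum(examples["input_ids"], [])

# New approach: single-pass flattening with itertools.chain.
flat_chain = list(chain(*examples["input_ids"]))

assert flat_sum == flat_chain == [1, 2, 3, 4, 5, 6]
```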
@@ -48,9 +48,6 @@ class ModelArguments:
freeze_feature_extractor: Optional[bool] = field(
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
gradient_checkpointing: Optional[bool] = field(
default=False, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
)
verbose_logging: Optional[bool] = field(
default=False,
metadata={"help": "Whether to log verbose messages or not."},

@@ -356,7 +353,6 @@ def main():
config = Wav2Vec2Config.from_pretrained(
model_args.model_name_or_path,
cache_dir=model_args.cache_dir,
gradient_checkpointing=model_args.gradient_checkpointing,
)

if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":

@@ -366,6 +362,10 @@ def main():

model = FlaxWav2Vec2ForPreTraining(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))

# Activate gradient checkpointing if needed
if training_args.gradient_checkpointing:
model.gradient_checkpointing_enable()

data_collator = FlaxDataCollatorForWav2Vec2Pretraining(
model=model, feature_extractor=feature_extractor, pad_to_multiple_of=data_args.pad_to_multiple_of
)
@@ -30,7 +30,7 @@ from emmental import MaskedBertConfig
from emmental.modules import MaskedLinear
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_utils import PreTrainedModel, prune_linear_layer
from transformers.models.bert.modeling_bert import ACT2FN, BertLayerNorm, load_tf_weights_in_bert
from transformers.models.bert.modeling_bert import ACT2FN, load_tf_weights_in_bert


logger = logging.getLogger(__name__)

@@ -47,7 +47,7 @@ class BertEmbeddings(nn.Module):

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):

@@ -182,7 +182,7 @@ class BertSelfOutput(nn.Module):
mask_init=config.mask_init,
mask_scale=config.mask_scale,
)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor, threshold):

@@ -275,7 +275,7 @@ class BertOutput(nn.Module):
mask_init=config.mask_init,
mask_scale=config.mask_scale,
)
self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, hidden_states, input_tensor, threshold):

@@ -398,7 +398,7 @@ class MaskedBertPreTrainedModel(PreTrainedModel):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
elif isinstance(module, BertLayerNorm):
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
@@ -17,7 +17,7 @@ Please check out the repo under uber-research for more information: https://gith
git clone https://github.com/huggingface/transformers && cd transformers
pip install .
pip install nltk torchtext # additional requirements.
cd examples/text-generation/pplm
cd examples/research_projects/pplm
```

## PPLM-BoW
37 examples/research_projects/quantization-qdqbert/Dockerfile (Normal file)
@@ -0,0 +1,37 @@
# coding=utf-8
# Copyright 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
FROM nvcr.io/nvidia/pytorch:21.07-py3
LABEL maintainer="Hugging Face"
LABEL repository="transformers"

RUN apt-get update
RUN apt-get install sudo

RUN python3 -m pip install --no-cache-dir --upgrade pip
RUN python3 -m pip install --no-cache-dir --ignore-installed ruamel.yaml \
mkl \
absl-py \
yamlpy \
tensorboardX
RUN python3 -m pip install --no-cache-dir \
pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com

WORKDIR /workspace
COPY . transformers/
RUN cd transformers/ && \
python3 -m pip install --no-cache-dir .

RUN python3 -m pip install --no-cache-dir datasets \
accelerate
197 examples/research_projects/quantization-qdqbert/README.md (Normal file)
@@ -0,0 +1,197 @@
<!---
Copyright 2021 NVIDIA Corporation. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Huggingface QDQBERT Quantization Example

The QDQBERT model adds fake quantization (pair of QuantizeLinear/DequantizeLinear ops) to:
* linear layer inputs and weights
* matmul inputs
* residual add inputs
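For readers who have not used the [pytorch-quantization toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization) before, the sketch below shows roughly how such fake-quantization nodes are attached to linear layers through quantizer descriptors; it mirrors what `quant_trainer.set_default_quantizers()` in this example does, but it is only an illustration, not part of the example scripts.

```python
from pytorch_quantization import nn as quant_nn
from pytorch_quantization.tensor_quant import QuantDescriptor

# Default fake-quantization behaviour for every QuantLinear created afterwards:
# 8-bit activations calibrated from a histogram, 8-bit per-channel (axis 0) weights.
quant_nn.QuantLinear.set_default_quant_desc_input(QuantDescriptor(num_bits=8, calib_method="histogram"))
quant_nn.QuantLinear.set_default_quant_desc_weight(QuantDescriptor(num_bits=8, axis=(0,)))

# Drop-in replacement for torch.nn.Linear: the layer carries an input quantizer and a
# weight quantizer, i.e. the QuantizeLinear/DequantizeLinear pairs listed above.
# The quantization ranges (amax) are only filled in by a calibration pass, which is
# what `run_quant_qa.py --do_calib` performs before training or evaluation.
qlinear = quant_nn.QuantLinear(768, 768)
print(qlinear._input_quantizer, qlinear._weight_quantizer)
```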

In this example, we use the QDQBERT model to run quantization on the SQuAD task, including Quantization Aware Training (QAT), Post Training Quantization (PTQ) and inference using TensorRT.

Required:
- [pytorch-quantization toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization)
- [TensorRT >= 8.2](https://developer.nvidia.com/tensorrt)
- PyTorch >= 1.10.0

## Setup the environment with Dockerfile

Under the `transformers/` directory, build the docker image:
```
docker build . -f examples/research_projects/quantization-qdqbert/Dockerfile -t bert_quantization:latest
```

Run the docker:
```
docker run --gpus all --privileged --rm -it --shm-size=1g --ulimit memlock=-1 --ulimit stack=67108864 bert_quantization:latest
```

*Note that the current NGC pytorch container (pytorch:21.07-py3) ships TensorRT 8.0, which doesn't meet the requirement of TensorRT >= 8.2. One can either update the Dockerfile with the latest [NGC pytorch container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) once it supports TensorRT 8.2, or manually download and install [TensorRT >= 8.2](https://developer.nvidia.com/nvidia-tensorrt-download) in the container.*

In the container:
```
cd transformers/examples/research_projects/quantization-qdqbert/
```

## Quantization Aware Training (QAT)

Calibrate the pretrained model and fine-tune with quantization-aware training:

```
python3 run_quant_qa.py \
  --model_name_or_path bert-base-uncased \
  --dataset_name squad \
  --max_seq_length 128 \
  --doc_stride 32 \
  --output_dir calib/bert-base-uncased \
  --do_calib \
  --calibrator percentile \
  --percentile 99.99
```

```
python3 run_quant_qa.py \
  --model_name_or_path calib/bert-base-uncased \
  --dataset_name squad \
  --do_train \
  --do_eval \
  --per_device_train_batch_size 12 \
  --learning_rate 4e-5 \
  --num_train_epochs 2 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --output_dir finetuned_int8/bert-base-uncased \
  --tokenizer_name bert-base-uncased \
  --save_steps 0
```

### Export QAT model to ONNX

To export the QAT model finetuned above:

```
python3 run_quant_qa.py \
  --model_name_or_path finetuned_int8/bert-base-uncased \
  --output_dir ./ \
  --save_onnx \
  --per_device_eval_batch_size 1 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
  --tokenizer_name bert-base-uncased
```

Use `--recalibrate-weights` to calibrate the weight ranges according to the quantizer axis. Use `--quant-per-tensor` for per-tensor quantization (default is per channel).
Recalibrating will affect the accuracy of the model, but the change should be minimal (< 0.5 F1).

### Benchmark the INT8 QAT ONNX model inference with TensorRT using dummy input

```
trtexec --onnx=model.onnx --explicitBatch --workspace=16384 --int8 --shapes=input_ids:64x128,attention_mask:64x128,token_type_ids:64x128 --verbose
```

### Evaluate the INT8 QAT ONNX model inference with TensorRT

```
python3 evaluate-hf-trt-qa.py \
  --onnx_model_path=./model.onnx \
  --output_dir ./ \
  --per_device_eval_batch_size 64 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
  --tokenizer_name bert-base-uncased \
  --int8 \
  --seed 42
```

## Fine-tuning of FP32 model for comparison

Finetune an FP32-precision model with [transformers/examples/pytorch/question-answering/](../../pytorch/question-answering/):

```
python3 ../../pytorch/question-answering/run_qa.py \
  --model_name_or_path bert-base-uncased \
  --dataset_name squad \
  --per_device_train_batch_size 12 \
  --learning_rate 3e-5 \
  --num_train_epochs 2 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --output_dir ./finetuned_fp32/bert-base-uncased \
  --save_steps 0 \
  --do_train \
  --do_eval
```

## Post Training Quantization (PTQ)

### PTQ by calibrating and evaluating the finetuned FP32 model above:

```
python3 run_quant_qa.py \
  --model_name_or_path ./finetuned_fp32/bert-base-uncased \
  --dataset_name squad \
  --calibrator percentile \
  --percentile 99.99 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --output_dir ./calib/bert-base-uncased \
  --save_steps 0 \
  --do_calib \
  --do_eval
```

### Export the INT8 PTQ model to ONNX

```
python3 run_quant_qa.py \
  --model_name_or_path ./calib/bert-base-uncased \
  --output_dir ./ \
  --save_onnx \
  --per_device_eval_batch_size 1 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
  --tokenizer_name bert-base-uncased
```

### Evaluate the INT8 PTQ ONNX model inference with TensorRT

```
python3 evaluate-hf-trt-qa.py \
  --onnx_model_path=./model.onnx \
  --output_dir ./ \
  --per_device_eval_batch_size 64 \
  --max_seq_length 128 \
  --doc_stride 32 \
  --dataset_name squad \
  --tokenizer_name bert-base-uncased \
  --int8 \
  --seed 42
```

### Quantization options

Some useful options to support different implementations and optimizations. These should be specified for both calibration and fine-tuning.

|argument|description|
|--------|-----------|
|`--quant-per-tensor`| quantize weights with one quantization range per tensor |
|`--fuse-qkv` | use a single range (the max) for quantizing QKV weights and output activations |
|`--clip-gelu N` | clip the output of GELU to a maximum of N when quantizing (e.g. 10) |
|`--disable-dropout` | disable dropout for consistent activation ranges |
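How these flags take effect: `run_quant_qa.py` forwards them to the helper module `quant_trainer.py` shipped in this directory. The sketch below is a simplified, hypothetical wiring of that flow (the real script goes through `HfArgumentParser` and builds a `QDQBertForQuestionAnswering`); it is meant as orientation only.

```python
import argparse

import quant_trainer  # helper module in examples/research_projects/quantization-qdqbert/

parser = argparse.ArgumentParser()
quant_trainer.add_arguments(parser)  # registers --quant-per-tensor, --fuse-qkv, --clip-gelu, ...
args = parser.parse_args(["--calibrator", "percentile", "--percentile", "99.99", "--fuse-qkv"])

# 1. Install the default quantizer descriptors before the model is built,
#    so every QuantLinear picks them up.
quant_trainer.set_default_quantizers(args)

# 2. Build the model (the example uses QDQBertForQuestionAnswering here).
# model = ...

# 3. Apply the per-option tweaks: fuse QKV ranges, clip GELU, recalibrate weights, etc.
# quant_trainer.configure_model(model, args)
```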
456 examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py (Executable file)
@@ -0,0 +1,456 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import timeit
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import torch
|
||||
from absl import logging as absl_logging
|
||||
from datasets import load_dataset, load_metric
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import pycuda.autoinit # noqa: F401
|
||||
import pycuda.driver as cuda
|
||||
import tensorrt as trt
|
||||
import transformers
|
||||
from accelerate import Accelerator
|
||||
from transformers import AutoTokenizer, EvalPrediction, default_data_collator, set_seed
|
||||
from transformers.trainer_pt_utils import nested_concat, nested_truncate
|
||||
from utils_qa import postprocess_qa_predictions
|
||||
|
||||
|
||||
TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
|
||||
absl_logger = absl_logging.get_absl_logger()
|
||||
absl_logger.setLevel(logging.WARNING)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--onnx_model_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to ONNX model: ",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
required=True,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version_2_with_negative",
|
||||
action="store_true",
|
||||
help="If true, the SQuAD examples contain some that do not have an answer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--null_score_diff_threshold",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help="The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--doc_stride",
|
||||
default=128,
|
||||
type=int,
|
||||
help="When splitting up a long document into chunks, how much stride to take between chunks.",
|
||||
)
|
||||
|
||||
parser.add_argument("--per_device_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation.")
|
||||
|
||||
parser.add_argument(
|
||||
"--n_best_size",
|
||||
default=20,
|
||||
type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_answer_length",
|
||||
default=30,
|
||||
type=int,
|
||||
help="The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another.",
|
||||
)
|
||||
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument(
|
||||
"--dataset_name",
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help="The name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dataset_config_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help="The configuration name of the dataset to use (via the datasets library).",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--preprocessing_num_workers", type=int, default=4, help="A csv or a json file containing the training data."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", type=bool, default=False, help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--int8",
|
||||
action="store_true",
|
||||
help="Whether to use INT8",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.tokenizer_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, use_fast=True)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You are instantiating a new tokenizer from scratch. This is not supported by this script."
|
||||
"You can do it from another script, save it, and load it from here, using --tokenizer_name."
|
||||
)
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
args.eval_batch_size = args.per_device_eval_batch_size
|
||||
|
||||
INPUT_SHAPE = (args.eval_batch_size, args.max_seq_length)
|
||||
|
||||
# TRT Engine properties
|
||||
STRICT_TYPES = True
|
||||
|
||||
engine_name = "temp_engine/bert-fp32.engine"
|
||||
if args.fp16:
|
||||
engine_name = "temp_engine/bert-fp16.engine"
|
||||
if args.int8:
|
||||
engine_name = "temp_engine/bert-int8.engine"
|
||||
|
||||
# import ONNX file
|
||||
if not os.path.exists("temp_engine"):
|
||||
os.makedirs("temp_engine")
|
||||
|
||||
EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
|
||||
with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(
|
||||
network, TRT_LOGGER
|
||||
) as parser:
|
||||
with open(args.onnx_model_path, "rb") as model:
|
||||
if not parser.parse(model.read()):
|
||||
for error in range(parser.num_errors):
|
||||
print(parser.get_error(error))
|
||||
|
||||
# Query input names and shapes from parsed TensorRT network
|
||||
network_inputs = [network.get_input(i) for i in range(network.num_inputs)]
|
||||
input_names = [_input.name for _input in network_inputs] # ex: ["actual_input1"]
|
||||
|
||||
with builder.create_builder_config() as config:
|
||||
config.max_workspace_size = 1 << 50
|
||||
if STRICT_TYPES:
|
||||
config.set_flag(trt.BuilderFlag.STRICT_TYPES)
|
||||
if args.fp16:
|
||||
config.set_flag(trt.BuilderFlag.FP16)
|
||||
if args.int8:
|
||||
config.set_flag(trt.BuilderFlag.INT8)
|
||||
profile = builder.create_optimization_profile()
|
||||
config.add_optimization_profile(profile)
|
||||
for i in range(len(input_names)):
|
||||
profile.set_shape(input_names[i], INPUT_SHAPE, INPUT_SHAPE, INPUT_SHAPE)
|
||||
engine = builder.build_engine(network, config)
|
||||
|
||||
# serialize_engine and store in file (can be directly loaded and deserialized):
|
||||
with open(engine_name, "wb") as f:
|
||||
f.write(engine.serialize())
|
||||
|
||||
|
||||
# run inference with TRT
|
||||
def model_infer(inputs, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream):
|
||||
input_ids = np.asarray(inputs["input_ids"], dtype=np.int32)
|
||||
attention_mask = np.asarray(inputs["attention_mask"], dtype=np.int32)
|
||||
token_type_ids = np.asarray(inputs["token_type_ids"], dtype=np.int32)
|
||||
|
||||
# Copy inputs
|
||||
cuda.memcpy_htod_async(d_inputs[0], input_ids.ravel(), stream)
|
||||
cuda.memcpy_htod_async(d_inputs[1], attention_mask.ravel(), stream)
|
||||
cuda.memcpy_htod_async(d_inputs[2], token_type_ids.ravel(), stream)
|
||||
# start time
|
||||
start_time = time.time()
|
||||
# Run inference
|
||||
context.execute_async(
|
||||
bindings=[int(d_inp) for d_inp in d_inputs] + [int(d_output0), int(d_output1)], stream_handle=stream.handle
|
||||
)
|
||||
# Transfer predictions back from GPU
|
||||
cuda.memcpy_dtoh_async(h_output0, d_output0, stream)
|
||||
cuda.memcpy_dtoh_async(h_output1, d_output1, stream)
|
||||
# Synchronize the stream and take time
|
||||
stream.synchronize()
|
||||
# end time
|
||||
end_time = time.time()
|
||||
infer_time = end_time - start_time
|
||||
outputs = (h_output0, h_output1)
|
||||
# print(outputs)
|
||||
return outputs, infer_time
|
||||
|
||||
|
||||
# Initialize the accelerator. We will let the accelerator handle device placement for us in this example.
|
||||
accelerator = Accelerator()
|
||||
# Make one log on every process with the configuration for debugging.
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
|
||||
# Setup logging, we only want one process per machine to log things on the screen.
|
||||
# accelerator.is_local_main_process is only True for one process per machine.
|
||||
logger.setLevel(logging.INFO if accelerator.is_local_main_process else logging.ERROR)
|
||||
if accelerator.is_local_main_process:
|
||||
datasets.utils.logging.set_verbosity_warning()
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
else:
|
||||
datasets.utils.logging.set_verbosity_error()
|
||||
transformers.utils.logging.set_verbosity_error()
|
||||
|
||||
# If passed along, set the training seed now.
|
||||
if args.seed is not None:
|
||||
set_seed(args.seed)
|
||||
|
||||
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
|
||||
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||
# (the dataset will be downloaded automatically from the datasets Hub).
|
||||
#
|
||||
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
||||
# 'text' is found. You can easily tweak this behavior (see below).
|
||||
if args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(args.dataset_name, args.dataset_config_name)
|
||||
else:
|
||||
raise ValueError("Evaluation requires a dataset name")
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
|
||||
column_names = raw_datasets["validation"].column_names
|
||||
|
||||
question_column_name = "question" if "question" in column_names else column_names[0]
|
||||
context_column_name = "context" if "context" in column_names else column_names[1]
|
||||
answer_column_name = "answers" if "answers" in column_names else column_names[2]
|
||||
|
||||
# Padding side determines if we do (question|context) or (context|question).
|
||||
pad_on_right = tokenizer.padding_side == "right"
|
||||
|
||||
if args.max_seq_length > tokenizer.model_max_length:
|
||||
logger.warning(
|
||||
f"The max_seq_length passed ({args.max_seq_length}) is larger than the maximum length for the"
|
||||
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
|
||||
)
|
||||
|
||||
max_seq_length = min(args.max_seq_length, tokenizer.model_max_length)
|
||||
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Some of the questions have lots of whitespace on the left, which is not useful and will make the
|
||||
# truncation of the context fail (the tokenized question will take a lot of space). So we remove that
|
||||
# left whitespace
|
||||
examples[question_column_name] = [q.lstrip() for q in examples[question_column_name]]
|
||||
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit the context of the previous feature.
|
||||
tokenized_examples = tokenizer(
|
||||
examples[question_column_name if pad_on_right else context_column_name],
|
||||
examples[context_column_name if pad_on_right else question_column_name],
|
||||
truncation="only_second" if pad_on_right else "only_first",
|
||||
max_length=max_seq_length,
|
||||
stride=args.doc_stride,
|
||||
return_overflowing_tokens=True,
|
||||
return_offsets_mapping=True,
|
||||
padding="max_length",
|
||||
)
|
||||
|
||||
# Since one example might give us several features if it has a long context, we need a map from a feature to
|
||||
# its corresponding example. This key gives us just that.
|
||||
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
||||
|
||||
# For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
|
||||
# corresponding example_id and we will store the offset mappings.
|
||||
tokenized_examples["example_id"] = []
|
||||
|
||||
for i in range(len(tokenized_examples["input_ids"])):
|
||||
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
|
||||
sequence_ids = tokenized_examples.sequence_ids(i)
|
||||
context_index = 1 if pad_on_right else 0
|
||||
|
||||
# One example can give several spans, this is the index of the example containing this span of text.
|
||||
sample_index = sample_mapping[i]
|
||||
tokenized_examples["example_id"].append(examples["id"][sample_index])
|
||||
|
||||
# Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
|
||||
# position is part of the context or not.
|
||||
tokenized_examples["offset_mapping"][i] = [
|
||||
(o if sequence_ids[k] == context_index else None)
|
||||
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
|
||||
]
|
||||
|
||||
return tokenized_examples
|
||||
|
||||
|
||||
eval_examples = raw_datasets["validation"]
|
||||
# Validation Feature Creation
|
||||
eval_dataset = eval_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
num_proc=args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not args.overwrite_cache,
|
||||
desc="Running tokenizer on validation dataset",
|
||||
)
|
||||
|
||||
data_collator = default_data_collator
|
||||
|
||||
eval_dataset_for_model = eval_dataset.remove_columns(["example_id", "offset_mapping"])
|
||||
eval_dataloader = DataLoader(
|
||||
eval_dataset_for_model, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
|
||||
)
|
||||
|
||||
|
||||
# Post-processing:
|
||||
def post_processing_function(examples, features, predictions, stage="eval"):
|
||||
# Post-processing: we match the start logits and end logits to answers in the original context.
|
||||
predictions = postprocess_qa_predictions(
|
||||
examples=examples,
|
||||
features=features,
|
||||
predictions=predictions,
|
||||
version_2_with_negative=args.version_2_with_negative,
|
||||
n_best_size=args.n_best_size,
|
||||
max_answer_length=args.max_answer_length,
|
||||
null_score_diff_threshold=args.null_score_diff_threshold,
|
||||
output_dir=args.output_dir,
|
||||
prefix=stage,
|
||||
)
|
||||
# Format the result to the format the metric expects.
|
||||
if args.version_2_with_negative:
|
||||
formatted_predictions = [
|
||||
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
|
||||
]
|
||||
else:
|
||||
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
|
||||
|
||||
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
|
||||
return EvalPrediction(predictions=formatted_predictions, label_ids=references)
|
||||
|
||||
|
||||
metric = load_metric("squad_v2" if args.version_2_with_negative else "squad")
|
||||
|
||||
# Evaluation!
|
||||
logger.info("Loading ONNX model %s for evaluation", args.onnx_model_path)
|
||||
with open(engine_name, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime, runtime.deserialize_cuda_engine(
|
||||
f.read()
|
||||
) as engine, engine.create_execution_context() as context:
|
||||
|
||||
# setup for TRT inference
|
||||
for i in range(len(input_names)):
|
||||
context.set_binding_shape(i, INPUT_SHAPE)
|
||||
assert context.all_binding_shapes_specified
|
||||
|
||||
def binding_nbytes(binding):
|
||||
return trt.volume(engine.get_binding_shape(binding)) * engine.get_binding_dtype(binding).itemsize
|
||||
|
||||
# Allocate device memory for inputs and outputs.
|
||||
d_inputs = [cuda.mem_alloc(binding_nbytes(binding)) for binding in engine if engine.binding_is_input(binding)]
|
||||
|
||||
# Allocate output buffer
|
||||
h_output0 = cuda.pagelocked_empty(tuple(context.get_binding_shape(3)), dtype=np.float32)
|
||||
h_output1 = cuda.pagelocked_empty(tuple(context.get_binding_shape(4)), dtype=np.float32)
|
||||
d_output0 = cuda.mem_alloc(h_output0.nbytes)
|
||||
d_output1 = cuda.mem_alloc(h_output1.nbytes)
|
||||
|
||||
# Create a stream in which to copy inputs/outputs and run inference.
|
||||
stream = cuda.Stream()
|
||||
|
||||
# Evaluation
|
||||
logger.info("***** Running Evaluation *****")
|
||||
logger.info(f" Num examples = {len(eval_dataset)}")
|
||||
logger.info(f" Batch size = {args.per_device_eval_batch_size}")
|
||||
|
||||
total_time = 0.0
|
||||
niter = 0
|
||||
start_time = timeit.default_timer()
|
||||
|
||||
all_preds = None
|
||||
for step, batch in enumerate(eval_dataloader):
|
||||
|
||||
outputs, infer_time = model_infer(batch, context, d_inputs, h_output0, h_output1, d_output0, d_output1, stream)
|
||||
total_time += infer_time
|
||||
niter += 1
|
||||
|
||||
start_logits, end_logits = outputs
|
||||
start_logits = torch.tensor(start_logits)
|
||||
end_logits = torch.tensor(end_logits)
|
||||
|
||||
# necessary to pad predictions and labels for being gathered
|
||||
start_logits = accelerator.pad_across_processes(start_logits, dim=1, pad_index=-100)
|
||||
end_logits = accelerator.pad_across_processes(end_logits, dim=1, pad_index=-100)
|
||||
|
||||
logits = (accelerator.gather(start_logits).cpu().numpy(), accelerator.gather(end_logits).cpu().numpy())
|
||||
all_preds = logits if all_preds is None else nested_concat(all_preds, logits, padding_index=-100)
|
||||
|
||||
if all_preds is not None:
|
||||
all_preds = nested_truncate(all_preds, len(eval_dataset))
|
||||
|
||||
evalTime = timeit.default_timer() - start_time
|
||||
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(eval_dataset))
|
||||
# Inference time from TRT
|
||||
logger.info("Average Inference Time = {:.3f} ms".format(total_time * 1000 / niter))
|
||||
logger.info("Total Inference Time = {:.3f} ms".format(total_time * 1000))
|
||||
logger.info("Total Number of Inference = %d", niter)
|
||||
|
||||
prediction = post_processing_function(eval_examples, eval_dataset, all_preds)
|
||||
eval_metric = metric.compute(predictions=prediction.predictions, references=prediction.label_ids)
|
||||
logger.info(f"Evaluation metrics: {eval_metric}")
|
303 examples/research_projects/quantization-qdqbert/quant_trainer.py (Executable file)
@@ -0,0 +1,303 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Helper functions for training models with pytorch-quantization"""
|
||||
import logging
|
||||
import re
|
||||
|
||||
import torch
|
||||
|
||||
import pytorch_quantization
|
||||
import pytorch_quantization.nn as quant_nn
|
||||
from pytorch_quantization import calib
|
||||
from pytorch_quantization.tensor_quant import QuantDescriptor
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
name_width = 50 # max width of layer names
|
||||
qname_width = 70 # max width of quantizer names
|
||||
|
||||
# ========================================== Quant Trainer API ==========================================
|
||||
|
||||
|
||||
def add_arguments(parser):
|
||||
"""Add arguments to parser for functions defined in quant_trainer."""
|
||||
|
||||
group = parser.add_argument_group("quant_trainer arguments")
|
||||
group.add_argument("--wprec", type=int, default=8, help="weight precision")
|
||||
group.add_argument("--aprec", type=int, default=8, help="activation precision")
|
||||
group.add_argument("--quant-per-tensor", action="store_true", help="per tensor weight scaling")
|
||||
group.add_argument("--quant-disable", action="store_true", help="disable all quantizers")
|
||||
group.add_argument("--quant-disable-embeddings", action="store_true", help="disable all embeddings quantizers")
|
||||
group.add_argument("--quant-disable-keyword", type=str, nargs="+", help="disable quantizers by keyword")
|
||||
group.add_argument("--quant-disable-layer-module", type=str, help="disable quantizers by keyword under layer.\d+.")
|
||||
group.add_argument("--quant-enable-layer-module", type=str, help="enable quantizers by keyword under layer.\d+.")
|
||||
group.add_argument("--calibrator", default="max", help="which quantization range calibrator to use")
|
||||
group.add_argument("--percentile", default=None, type=float, help="percentile for PercentileCalibrator")
|
||||
group.add_argument("--fuse-qkv", action="store_true", help="use the same scale factor for qkv")
|
||||
group.add_argument("--clip-gelu", metavar="N", type=float, help="clip gelu output maximum value to N")
|
||||
group.add_argument(
|
||||
"--recalibrate-weights",
|
||||
action="store_true",
|
||||
help="recalibrate weight amaxes by taking the max of the weights."
|
||||
" amaxes will be computed with the current quantization granularity (axis).",
|
||||
)
|
||||
|
||||
|
||||
def set_default_quantizers(args):
|
||||
"""Set default quantizers before creating the model."""
|
||||
|
||||
if args.calibrator == "max":
|
||||
calib_method = "max"
|
||||
elif args.calibrator == "percentile":
|
||||
if args.percentile is None:
|
||||
raise ValueError("Specify --percentile when using percentile calibrator")
|
||||
calib_method = "histogram"
|
||||
elif args.calibrator == "mse":
|
||||
calib_method = "histogram"
|
||||
else:
|
||||
raise ValueError(f"Invalid calibrator {args.calibrator}")
|
||||
|
||||
input_desc = QuantDescriptor(num_bits=args.aprec, calib_method=calib_method)
|
||||
weight_desc = QuantDescriptor(num_bits=args.wprec, axis=(None if args.quant_per_tensor else (0,)))
|
||||
quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
|
||||
quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
|
||||
|
||||
|
||||
def configure_model(model, args, calib=False, eval=False):
|
||||
"""Function called before the training loop."""
|
||||
|
||||
logger.info("Configuring Model for Quantization")
|
||||
logger.info(f"using quantization package {pytorch_quantization.__file__}")
|
||||
|
||||
if not calib:
|
||||
if args.quant_disable_embeddings:
|
||||
set_quantizer_by_name(model, ["embeddings"], which="weight", _disabled=True)
|
||||
|
||||
if args.quant_disable:
|
||||
set_quantizer_by_name(model, [""], _disabled=True)
|
||||
|
||||
if args.quant_disable_keyword:
|
||||
set_quantizer_by_name(model, args.quant_disable_keyword, _disabled=True)
|
||||
|
||||
if args.quant_disable_layer_module:
|
||||
set_quantizer_by_name(model, ["layer.\d+." + args.quant_disable_layer_module], _disabled=True)
|
||||
|
||||
if args.quant_enable_layer_module:
|
||||
set_quantizer_by_name(model, ["layer.\d+." + args.quant_enable_layer_module], _disabled=False)
|
||||
|
||||
if args.recalibrate_weights:
|
||||
recalibrate_weights(model)
|
||||
|
||||
if args.fuse_qkv:
|
||||
fuse_qkv(model, args)
|
||||
|
||||
if args.clip_gelu:
|
||||
clip_gelu(model, args.clip_gelu)
|
||||
|
||||
# if args.local_rank in [-1, 0] and not calib:
|
||||
print_quant_summary(model)
|
||||
|
||||
|
||||
def enable_calibration(model):
|
||||
"""Enable calibration of all *_input_quantizer modules in model."""
|
||||
|
||||
logger.info("Enabling Calibration")
|
||||
for name, module in model.named_modules():
|
||||
if name.endswith("_quantizer"):
|
||||
if module._calibrator is not None:
|
||||
module.disable_quant()
|
||||
module.enable_calib()
|
||||
else:
|
||||
module.disable()
|
||||
logger.info(f"{name:80}: {module}")
|
||||
|
||||
|
||||
def finish_calibration(model, args):
|
||||
"""Disable calibration and load amax for all "*_input_quantizer modules in model."""
|
||||
|
||||
logger.info("Loading calibrated amax")
|
||||
for name, module in model.named_modules():
|
||||
if name.endswith("_quantizer"):
|
||||
if module._calibrator is not None:
|
||||
if isinstance(module._calibrator, calib.MaxCalibrator):
|
||||
module.load_calib_amax()
|
||||
else:
|
||||
module.load_calib_amax("percentile", percentile=args.percentile)
|
||||
module.enable_quant()
|
||||
module.disable_calib()
|
||||
else:
|
||||
module.enable()
|
||||
model.cuda()
|
||||
print_quant_summary(model)
|
||||
|
||||
|
||||
# ========================================== Helper Function ==========================================
|
||||
|
||||
|
||||
def fuse_qkv(model, args):
|
||||
"""Adjust quantization ranges to match an implementation where the QKV projections are implemented with a single GEMM.
|
||||
Force the weight and output scale factors to match by taking the max of (Q,K,V).
|
||||
"""
|
||||
|
||||
def fuse3(qq, qk, qv):
|
||||
for mod in [qq, qk, qv]:
|
||||
if not hasattr(mod, "_amax"):
|
||||
print(" WARNING: NO AMAX BUFFER")
|
||||
return
|
||||
q = qq._amax.detach().item()
|
||||
k = qk._amax.detach().item()
|
||||
v = qv._amax.detach().item()
|
||||
|
||||
amax = max(q, k, v)
|
||||
qq._amax.fill_(amax)
|
||||
qk._amax.fill_(amax)
|
||||
qv._amax.fill_(amax)
|
||||
logger.info(f" q={q:5.2f} k={k:5.2f} v={v:5.2f} -> {amax:5.2f}")
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
if name.endswith(".attention.self"):
|
||||
logger.info(f"FUSE_QKV: {name:{name_width}}")
|
||||
fuse3(mod.matmul_q_input_quantizer, mod.matmul_k_input_quantizer, mod.matmul_v_input_quantizer)
|
||||
if args.quant_per_tensor:
|
||||
fuse3(mod.query._weight_quantizer, mod.key._weight_quantizer, mod.value._weight_quantizer)
|
||||
|
||||
|
||||
def clip_gelu(model, maxval):
|
||||
"""Clip activations generated by GELU to maxval when quantized.
|
||||
Implemented by adjusting the amax of the following input_quantizer.
|
||||
"""
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
if name.endswith(".output.dense") and not name.endswith("attention.output.dense"):
|
||||
amax_init = mod._input_quantizer._amax.data.detach().item()
|
||||
mod._input_quantizer._amax.data.detach().clamp_(max=maxval)
|
||||
amax = mod._input_quantizer._amax.data.detach().item()
|
||||
logger.info(f"CLIP_GELU: {name:{name_width}} amax: {amax_init:5.2f} -> {amax:5.2f}")
|
||||
|
||||
|
||||
def expand_amax(model):
|
||||
"""Expand per-tensor amax to be per channel, where each channel is assigned the per-tensor amax."""
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
if hasattr(mod, "_weight_quantizer") and mod._weight_quantizer.axis is not None:
|
||||
k = mod.weight.shape[0]
|
||||
amax = mod._weight_quantizer._amax.detach()
|
||||
mod._weight_quantizer._amax = torch.ones(k, dtype=amax.dtype, device=amax.device) * amax
|
||||
print(f"expanding {name} {amax} -> {mod._weight_quantizer._amax}")
|
||||
|
||||
|
||||
def recalibrate_weights(model):
|
||||
"""Performs max calibration on the weights and updates amax."""
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
if hasattr(mod, "_weight_quantizer"):
|
||||
if not hasattr(mod._weight_quantizer, "_amax"):
print(f"RECALIB: {name:{name_width}} WARNING: NO AMAX BUFFER")
|
||||
continue
|
||||
|
||||
# determine which axes to reduce across
|
||||
# e.g. a 4D tensor quantized per axis 0 should reduce over (1,2,3)
|
||||
axis_set = set() if mod._weight_quantizer.axis is None else set(mod._weight_quantizer.axis)
|
||||
reduce_axis = set(range(len(mod.weight.size()))) - axis_set
|
||||
amax = pytorch_quantization.utils.reduce_amax(mod.weight, axis=reduce_axis, keepdims=True).detach()
|
||||
logger.info(f"RECALIB: {name:{name_width}} {mod._weight_quantizer._amax.flatten()} -> {amax.flatten()}")
|
||||
mod._weight_quantizer._amax = amax
|
||||
|
||||
|
||||
def print_model_summary(model, name_width=25, line_width=180, ignore=None):
|
||||
"""Print model quantization configuration."""
|
||||
|
||||
if ignore is None:
|
||||
ignore = []
|
||||
elif not isinstance(ignore, list):
|
||||
ignore = [ignore]
|
||||
|
||||
name_width = 0
|
||||
for name, mod in model.named_modules():
|
||||
if not hasattr(mod, "weight"):
|
||||
continue
|
||||
name_width = max(name_width, len(name))
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
input_q = getattr(mod, "_input_quantizer", None)
|
||||
weight_q = getattr(mod, "_weight_quantizer", None)
|
||||
if not hasattr(mod, "weight"):
|
||||
continue
|
||||
if type(mod) in ignore:
|
||||
continue
|
||||
if [True for s in ignore if type(s) is str and s in name]:
|
||||
continue
|
||||
act_str = f"Act:{input_q.extra_repr()}"
|
||||
wgt_str = f"Wgt:{weight_q.extra_repr()}"
|
||||
s = f"{name:{name_width}} {act_str} {wgt_str}"
|
||||
if len(s) <= line_width:
|
||||
logger.info(s)
|
||||
else:
|
||||
logger.info(f"{name:{name_width}} {act_str}")
|
||||
logger.info(f'{" ":{name_width}} {wgt_str}')
|
||||
|
||||
|
||||
def print_quant_summary(model):
|
||||
"""Print summary of all quantizer modules in the model."""
|
||||
|
||||
count = 0
|
||||
for name, mod in model.named_modules():
|
||||
if isinstance(mod, pytorch_quantization.nn.TensorQuantizer):
|
||||
print(f"{name:80} {mod}")
|
||||
count += 1
|
||||
print(f"{count} TensorQuantizers found in model")
|
||||
|
||||
|
||||
def set_quantizer(name, mod, quantizer, k, v):
|
||||
"""Set attributes for mod.quantizer."""
|
||||
|
||||
quantizer_mod = getattr(mod, quantizer, None)
|
||||
if quantizer_mod is not None:
|
||||
assert hasattr(quantizer_mod, k)
|
||||
setattr(quantizer_mod, k, v)
|
||||
else:
|
||||
logger.warn(f"{name} has no {quantizer}")
|
||||
|
||||
|
||||
def set_quantizers(name, mod, which="both", **kwargs):
|
||||
"""Set quantizer attributes for mod."""
|
||||
|
||||
s = f"Warning: changing {which} quantizers of {name:{qname_width}}"
|
||||
for k, v in kwargs.items():
|
||||
s += f" {k}={v}"
|
||||
if which in ["input", "both"]:
|
||||
set_quantizer(name, mod, "_input_quantizer", k, v)
|
||||
if which in ["weight", "both"]:
|
||||
set_quantizer(name, mod, "_weight_quantizer", k, v)
|
||||
logger.info(s)
|
||||
|
||||
|
||||
def set_quantizer_by_name(model, names, **kwargs):
|
||||
"""Set quantizer attributes for layers where name contains a substring in names."""
|
||||
|
||||
for name, mod in model.named_modules():
|
||||
if hasattr(mod, "_input_quantizer") or hasattr(mod, "_weight_quantizer"):
|
||||
for n in names:
|
||||
if re.search(n, name):
|
||||
set_quantizers(name, mod, **kwargs)
|
||||
elif name.endswith("_quantizer"):
|
||||
for n in names:
|
||||
if re.search(n, name):
|
||||
s = f"Warning: changing {name:{name_width}}"
|
||||
for k, v in kwargs.items():
|
||||
s += f" {k}={v}"
|
||||
setattr(mod, k, v)
|
||||
logger.info(s)
|
668 examples/research_projects/quantization-qdqbert/run_quant_qa.py (Executable file)
@@ -0,0 +1,668 @@
|
||||
#!/usr/bin/env python
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Team All rights reserved.
|
||||
# Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for question answering.
|
||||
"""
|
||||
# You can also adapt this script on your own question answering task. Pointers for this are left as comments.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import datasets
|
||||
from datasets import load_dataset, load_metric
|
||||
|
||||
import quant_trainer
|
||||
import transformers
|
||||
from trainer_quant_qa import QuestionAnsweringTrainer
|
||||
from transformers import (
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
EvalPrediction,
|
||||
HfArgumentParser,
|
||||
PreTrainedTokenizerFast,
|
||||
QDQBertConfig,
|
||||
QDQBertForQuestionAnswering,
|
||||
TrainingArguments,
|
||||
default_data_collator,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import SchedulerType, get_last_checkpoint
|
||||
from transformers.utils import check_min_version
|
||||
from transformers.utils.versions import require_version
|
||||
from utils_qa import postprocess_qa_predictions
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.9.0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Path to directory to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
model_revision: str = field(
|
||||
default="main",
|
||||
metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."},
|
||||
)
|
||||
use_auth_token: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
|
||||
"with private models)."
|
||||
},
|
||||
)
|
||||
do_calib: bool = field(default=False, metadata={"help": "Whether to run calibration of quantization ranges."})
|
||||
num_calib_batch: int = field(
|
||||
default=4,
|
||||
metadata={"help": "Number of batches for calibration. 0 will disable calibration "},
|
||||
)
|
||||
save_onnx: bool = field(default=False, metadata={"help": "Whether to save model to onnx."})
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
dataset_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
dataset_config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
|
||||
)
|
||||
train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
|
||||
validation_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
|
||||
)
|
||||
test_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input test data file to evaluate the perplexity on (a text file)."},
|
||||
)
|
||||
overwrite_cache: bool = field(
|
||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||
)
|
||||
preprocessing_num_workers: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "The number of processes to use for the preprocessing."},
|
||||
)
|
||||
max_seq_length: int = field(
|
||||
default=384,
|
||||
metadata={
|
||||
"help": "The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
},
|
||||
)
|
||||
pad_to_max_length: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": "Whether to pad all samples to `max_seq_length`. "
|
||||
"If False, will pad the samples dynamically when batching to the maximum length in the batch (which can "
|
||||
"be faster on GPU but will be slower on TPU)."
|
||||
},
|
||||
)
|
||||
max_train_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of training examples to this "
|
||||
"value if set."
|
||||
},
|
||||
)
|
||||
max_eval_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
|
||||
"value if set."
|
||||
},
|
||||
)
|
||||
max_predict_samples: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": "For debugging purposes or quicker training, truncate the number of prediction examples to this "
|
||||
"value if set."
|
||||
},
|
||||
)
|
||||
version_2_with_negative: bool = field(
|
||||
default=False, metadata={"help": "If true, some of the examples do not have an answer."}
|
||||
)
|
||||
null_score_diff_threshold: float = field(
|
||||
default=0.0,
|
||||
metadata={
|
||||
"help": "The threshold used to select the null answer: if the best answer has a score that is less than "
|
||||
"the score of the null answer minus this threshold, the null answer is selected for this example. "
|
||||
"Only useful when `version_2_with_negative=True`."
|
||||
},
|
||||
)
|
||||
doc_stride: int = field(
|
||||
default=128,
|
||||
metadata={"help": "When splitting up a long document into chunks, how much stride to take between chunks."},
|
||||
)
|
||||
n_best_size: int = field(
|
||||
default=20,
|
||||
metadata={"help": "The total number of n-best predictions to generate when looking for an answer."},
|
||||
)
|
||||
max_answer_length: int = field(
|
||||
default=30,
|
||||
metadata={
|
||||
"help": "The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another."
|
||||
},
|
||||
)
|
||||
|
||||
def __post_init__(self):
|
||||
if (
|
||||
self.dataset_name is None
|
||||
and self.train_file is None
|
||||
and self.validation_file is None
|
||||
and self.test_file is None
|
||||
):
|
||||
raise ValueError("Need either a dataset name or a training/validation file/test_file.")
|
||||
else:
|
||||
if self.train_file is not None:
|
||||
extension = self.train_file.split(".")[-1]
|
||||
assert extension in ["csv", "json"], "`train_file` should be a csv or a json file."
|
||||
if self.validation_file is not None:
|
||||
extension = self.validation_file.split(".")[-1]
|
||||
assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file."
|
||||
if self.test_file is not None:
|
||||
extension = self.test_file.split(".")[-1]
|
||||
assert extension in ["csv", "json"], "`test_file` should be a csv or a json file."
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
# quant_trainer arguments
|
||||
quant_trainer.add_arguments(parser)
|
||||
|
||||
# if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# # If we pass only one argument to the script and it's the path to a json file,
|
||||
# # let's parse it to get our arguments.
|
||||
# model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
# else:
|
||||
|
||||
model_args, data_args, training_args, quant_trainer_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
# setup QAT training args for scheduler (default to use cosine annealing learning rate schedule)
|
||||
training_args.lr_scheduler_type = SchedulerType.COSINE
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
handlers=[logging.StreamHandler(sys.stdout)],
|
||||
)
|
||||
|
||||
log_level = training_args.get_process_log_level()
|
||||
logger.setLevel(log_level)
|
||||
datasets.utils.logging.set_verbosity(log_level)
|
||||
transformers.utils.logging.set_verbosity(log_level)
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
|
||||
# Log on each process the small summary:
|
||||
logger.warning(
|
||||
f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
|
||||
+ f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
|
||||
)
|
||||
logger.info(f"Training/evaluation parameters {training_args}")
|
||||
|
||||
# Detecting last checkpoint.
|
||||
last_checkpoint = None
|
||||
if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
|
||||
last_checkpoint = get_last_checkpoint(training_args.output_dir)
|
||||
if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
|
||||
raise ValueError(
|
||||
f"Output directory ({training_args.output_dir}) already exists and is not empty. "
|
||||
"Use --overwrite_output_dir to overcome."
|
||||
)
|
||||
elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
|
||||
logger.info(
|
||||
f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
|
||||
"the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
|
||||
)
|
||||
|
||||
# Set seed before initializing model.
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
|
||||
# or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
|
||||
# (the dataset will be downloaded automatically from the datasets Hub).
|
||||
#
|
||||
# For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
|
||||
# 'text' is found. You can easily tweak this behavior (see below).
|
||||
#
|
||||
# In distributed training, the load_dataset function guarantees that only one local process can concurrently
|
||||
# download the dataset.
|
||||
if data_args.dataset_name is not None:
|
||||
# Downloading and loading a dataset from the hub.
|
||||
raw_datasets = load_dataset(
|
||||
data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
|
||||
)
|
||||
else:
|
||||
data_files = {}
|
||||
if data_args.train_file is not None:
|
||||
data_files["train"] = data_args.train_file
|
||||
extension = data_args.train_file.split(".")[-1]
|
||||
|
||||
if data_args.validation_file is not None:
|
||||
data_files["validation"] = data_args.validation_file
|
||||
extension = data_args.validation_file.split(".")[-1]
|
||||
if data_args.test_file is not None:
|
||||
data_files["test"] = data_args.test_file
|
||||
extension = data_args.test_file.split(".")[-1]
|
||||
raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
|
||||
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
|
||||
# https://huggingface.co/docs/datasets/loading_datasets.html.
|
||||
|
||||
# set default quantization parameters before building model
|
||||
quant_trainer.set_default_quantizers(quant_trainer_args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
config = QDQBertConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=True,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
model = QDQBertForQuestionAnswering.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
revision=model_args.model_revision,
|
||||
use_auth_token=True if model_args.use_auth_token else None,
|
||||
)
|
||||
|
||||
# Tokenizer check: this script requires a fast tokenizer.
|
||||
if not isinstance(tokenizer, PreTrainedTokenizerFast):
|
||||
raise ValueError(
|
||||
"This example script only works for models that have a fast tokenizer. Checkout the big table of models "
|
||||
"at https://huggingface.co/transformers/index.html#supported-frameworks to find the model types that meet this "
|
||||
"requirement"
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
# Preprocessing is slightly different for training and evaluation.
|
||||
if training_args.do_train or model_args.do_calib:
|
||||
column_names = raw_datasets["train"].column_names
|
||||
elif training_args.do_eval or model_args.save_onnx:
|
||||
column_names = raw_datasets["validation"].column_names
|
||||
else:
|
||||
column_names = raw_datasets["test"].column_names
|
||||
question_column_name = "question" if "question" in column_names else column_names[0]
|
||||
context_column_name = "context" if "context" in column_names else column_names[1]
|
||||
answer_column_name = "answers" if "answers" in column_names else column_names[2]
|
||||
|
||||
# Padding side determines if we do (question|context) or (context|question).
|
||||
pad_on_right = tokenizer.padding_side == "right"
|
||||
|
||||
if data_args.max_seq_length > tokenizer.model_max_length:
|
||||
logger.warning(
|
||||
f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the"
|
||||
f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
|
||||
)
|
||||
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
|
||||
|
||||
# Training preprocessing
|
||||
def prepare_train_features(examples):
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit with the context of the previous feature.
|
||||
tokenized_examples = tokenizer(
|
||||
examples[question_column_name if pad_on_right else context_column_name],
|
||||
examples[context_column_name if pad_on_right else question_column_name],
|
||||
truncation="only_second" if pad_on_right else "only_first",
|
||||
max_length=max_seq_length,
|
||||
stride=data_args.doc_stride,
|
||||
return_overflowing_tokens=True,
|
||||
return_offsets_mapping=True,
|
||||
padding="max_length" if data_args.pad_to_max_length else False,
|
||||
)
|
||||
|
||||
# Since one example might give us several features if it has a long context, we need a map from a feature to
|
||||
# its corresponding example. This key gives us just that.
|
||||
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
||||
# The offset mappings will give us a map from token to character position in the original context. This will
|
||||
# help us compute the start_positions and end_positions.
|
||||
offset_mapping = tokenized_examples.pop("offset_mapping")
|
||||
|
||||
# Let's label those examples!
|
||||
tokenized_examples["start_positions"] = []
|
||||
tokenized_examples["end_positions"] = []
|
||||
|
||||
for i, offsets in enumerate(offset_mapping):
|
||||
# We will label impossible answers with the index of the CLS token.
|
||||
input_ids = tokenized_examples["input_ids"][i]
|
||||
cls_index = input_ids.index(tokenizer.cls_token_id)
|
||||
|
||||
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
|
||||
sequence_ids = tokenized_examples.sequence_ids(i)
|
||||
|
||||
# One example can give several spans, this is the index of the example containing this span of text.
|
||||
sample_index = sample_mapping[i]
|
||||
answers = examples[answer_column_name][sample_index]
|
||||
# If no answers are given, set the cls_index as answer.
|
||||
if len(answers["answer_start"]) == 0:
|
||||
tokenized_examples["start_positions"].append(cls_index)
|
||||
tokenized_examples["end_positions"].append(cls_index)
|
||||
else:
|
||||
# Start/end character index of the answer in the text.
|
||||
start_char = answers["answer_start"][0]
|
||||
end_char = start_char + len(answers["text"][0])
|
||||
|
||||
# Start token index of the current span in the text.
|
||||
token_start_index = 0
|
||||
while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
|
||||
token_start_index += 1
|
||||
|
||||
# End token index of the current span in the text.
|
||||
token_end_index = len(input_ids) - 1
|
||||
while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
|
||||
token_end_index -= 1
|
||||
|
||||
# Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
|
||||
if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
|
||||
tokenized_examples["start_positions"].append(cls_index)
|
||||
tokenized_examples["end_positions"].append(cls_index)
|
||||
else:
|
||||
# Otherwise move the token_start_index and token_end_index to the two ends of the answer.
|
||||
# Note: we could go after the last offset if the answer is the last word (edge case).
|
||||
while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
|
||||
token_start_index += 1
|
||||
tokenized_examples["start_positions"].append(token_start_index - 1)
|
||||
while offsets[token_end_index][1] >= end_char:
|
||||
token_end_index -= 1
|
||||
tokenized_examples["end_positions"].append(token_end_index + 1)
|
||||
|
||||
return tokenized_examples
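For intuition about the stride/overflow behaviour described in the comments above, a standalone sketch (any fast tokenizer works; `bert-base-uncased` is just an illustrative choice):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
enc = tok(
    "What is quantization aware training?",
    "Quantization aware training inserts fake quantization nodes into the network. " * 100,  # long context
    truncation="only_second",
    max_length=64,
    stride=16,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
)
# One long example becomes several overlapping features; the mapping points each feature back to sample 0.
print(len(enc["input_ids"]), set(enc["overflow_to_sample_mapping"]))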
|
||||
|
||||
if training_args.do_train or model_args.do_calib:
|
||||
if "train" not in raw_datasets:
|
||||
raise ValueError("--do_train requires a train dataset")
|
||||
train_dataset = raw_datasets["train"]
|
||||
if data_args.max_train_samples is not None:
|
||||
# We will select samples from the whole data if the argument is specified
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
# Create train feature from dataset
|
||||
with training_args.main_process_first(desc="train dataset map pre-processing"):
|
||||
train_dataset = train_dataset.map(
|
||||
prepare_train_features,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on train dataset",
|
||||
)
|
||||
if data_args.max_train_samples is not None:
|
||||
# The number of samples might increase during feature creation, so we select only the specified max samples
|
||||
train_dataset = train_dataset.select(range(data_args.max_train_samples))
|
||||
|
||||
# Validation preprocessing
|
||||
def prepare_validation_features(examples):
|
||||
# Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
|
||||
# in one example possibly giving several features when a context is long, each of those features having a
|
||||
# context that overlaps a bit with the context of the previous feature.
|
||||
tokenized_examples = tokenizer(
|
||||
examples[question_column_name if pad_on_right else context_column_name],
|
||||
examples[context_column_name if pad_on_right else question_column_name],
|
||||
truncation="only_second" if pad_on_right else "only_first",
|
||||
max_length=max_seq_length,
|
||||
stride=data_args.doc_stride,
|
||||
return_overflowing_tokens=True,
|
||||
return_offsets_mapping=True,
|
||||
padding="max_length" if data_args.pad_to_max_length else False,
|
||||
)
|
||||
|
||||
# Since one example might give us several features if it has a long context, we need a map from a feature to
|
||||
# its corresponding example. This key gives us just that.
|
||||
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
||||
|
||||
# For evaluation, we will need to convert our predictions to substrings of the context, so we keep the
|
||||
# corresponding example_id and we will store the offset mappings.
|
||||
tokenized_examples["example_id"] = []
|
||||
|
||||
for i in range(len(tokenized_examples["input_ids"])):
|
||||
# Grab the sequence corresponding to that example (to know what is the context and what is the question).
|
||||
sequence_ids = tokenized_examples.sequence_ids(i)
|
||||
context_index = 1 if pad_on_right else 0
|
||||
|
||||
# One example can give several spans, this is the index of the example containing this span of text.
|
||||
sample_index = sample_mapping[i]
|
||||
tokenized_examples["example_id"].append(examples["id"][sample_index])
|
||||
|
||||
# Set to None the offset_mapping that are not part of the context so it's easy to determine if a token
|
||||
# position is part of the context or not.
|
||||
tokenized_examples["offset_mapping"][i] = [
|
||||
(o if sequence_ids[k] == context_index else None)
|
||||
for k, o in enumerate(tokenized_examples["offset_mapping"][i])
|
||||
]
|
||||
|
||||
return tokenized_examples
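# Illustrative result of the masking above (offsets are made up): question tokens end up as None and
# only context tokens keep their (start_char, end_char) pairs, e.g.
#   [None, None, None, (0, 7), (8, 12), (13, 21), ...]
# so the post-processing step can skip positions that fall outside the context.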
|
||||
|
||||
if training_args.do_eval or model_args.save_onnx:
|
||||
if "validation" not in raw_datasets:
|
||||
raise ValueError("--do_eval requires a validation dataset")
|
||||
eval_examples = raw_datasets["validation"]
|
||||
if data_args.max_eval_samples is not None:
|
||||
# We will select samples from the whole data
|
||||
eval_examples = eval_examples.select(range(data_args.max_eval_samples))
|
||||
# Validation Feature Creation
|
||||
with training_args.main_process_first(desc="validation dataset map pre-processing"):
|
||||
eval_dataset = eval_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on validation dataset",
|
||||
)
|
||||
if data_args.max_eval_samples is not None:
|
||||
# During feature creation the number of samples might increase, so we select the required samples again
|
||||
eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
|
||||
|
||||
if training_args.do_predict:
|
||||
if "test" not in raw_datasets:
|
||||
raise ValueError("--do_predict requires a test dataset")
|
||||
predict_examples = raw_datasets["test"]
|
||||
if data_args.max_predict_samples is not None:
|
||||
# We will select samples from the whole data
|
||||
predict_examples = predict_examples.select(range(data_args.max_predict_samples))
|
||||
# Predict Feature Creation
|
||||
with training_args.main_process_first(desc="prediction dataset map pre-processing"):
|
||||
predict_dataset = predict_examples.map(
|
||||
prepare_validation_features,
|
||||
batched=True,
|
||||
num_proc=data_args.preprocessing_num_workers,
|
||||
remove_columns=column_names,
|
||||
load_from_cache_file=not data_args.overwrite_cache,
|
||||
desc="Running tokenizer on prediction dataset",
|
||||
)
|
||||
if data_args.max_predict_samples is not None:
|
||||
# During feature creation the number of samples might increase, so we select the required samples again
|
||||
predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
|
||||
|
||||
# Data collator
|
||||
# We have already padded to max length if the corresponding flag is True, otherwise we need to pad in the data
|
||||
# collator.
|
||||
data_collator = (
|
||||
default_data_collator
|
||||
if data_args.pad_to_max_length
|
||||
else DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
|
||||
)
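A side note on the collator choice above: `pad_to_multiple_of=8` is only used with fp16, where shapes that are multiples of 8 map well onto tensor cores. A minimal sketch, with the tokenizer checkpoint chosen purely for illustration:

from transformers import AutoTokenizer, DataCollatorWithPadding

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
collator = DataCollatorWithPadding(tok, pad_to_multiple_of=8)
features = [tok("a short question"), tok("a slightly longer question about padding behaviour")]
batch = collator(features)
print(batch["input_ids"].shape)  # the sequence dimension is rounded up to the next multiple of 8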
|
||||
|
||||
# Post-processing:
|
||||
def post_processing_function(examples, features, predictions, stage="eval"):
|
||||
# Post-processing: we match the start logits and end logits to answers in the original context.
|
||||
predictions = postprocess_qa_predictions(
|
||||
examples=examples,
|
||||
features=features,
|
||||
predictions=predictions,
|
||||
version_2_with_negative=data_args.version_2_with_negative,
|
||||
n_best_size=data_args.n_best_size,
|
||||
max_answer_length=data_args.max_answer_length,
|
||||
null_score_diff_threshold=data_args.null_score_diff_threshold,
|
||||
output_dir=training_args.output_dir,
|
||||
log_level=log_level,
|
||||
prefix=stage,
|
||||
)
|
||||
# Format the result to the format the metric expects.
|
||||
if data_args.version_2_with_negative:
|
||||
formatted_predictions = [
|
||||
{"id": k, "prediction_text": v, "no_answer_probability": 0.0} for k, v in predictions.items()
|
||||
]
|
||||
else:
|
||||
formatted_predictions = [{"id": k, "prediction_text": v} for k, v in predictions.items()]
|
||||
|
||||
references = [{"id": ex["id"], "answers": ex[answer_column_name]} for ex in examples]
|
||||
return EvalPrediction(predictions=formatted_predictions, label_ids=references)
|
||||
|
||||
metric = load_metric("squad_v2" if data_args.version_2_with_negative else "squad")
|
||||
|
||||
def compute_metrics(p: EvalPrediction):
|
||||
return metric.compute(predictions=p.predictions, references=p.label_ids)
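For reference, a sketch of what the metric receives after post-processing; the id and answer values below are made up:

# One formatted prediction / reference pair in the squad_v2 schema:
example_prediction = {"id": "abc123", "prediction_text": "Denver Broncos", "no_answer_probability": 0.0}
example_reference = {"id": "abc123", "answers": {"text": ["Denver Broncos"], "answer_start": [177]}}
# metric.compute(predictions=[example_prediction], references=[example_reference])
# then reports exact-match and F1 scores (both 100.0 for this matching pair).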
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = QuestionAnsweringTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset if training_args.do_train or model_args.do_calib else None,
|
||||
eval_dataset=eval_dataset if training_args.do_eval or model_args.save_onnx else None,
|
||||
eval_examples=eval_examples if training_args.do_eval or model_args.save_onnx else None,
|
||||
tokenizer=tokenizer,
|
||||
data_collator=data_collator,
|
||||
post_process_function=post_processing_function,
|
||||
compute_metrics=compute_metrics,
|
||||
quant_trainer_args=quant_trainer_args,
|
||||
)
|
||||
|
||||
# Calibration
|
||||
if model_args.do_calib:
|
||||
logger.info("*** Calibrate ***")
|
||||
results = trainer.calibrate()
|
||||
trainer.save_model()
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
checkpoint = None
|
||||
if training_args.resume_from_checkpoint is not None:
|
||||
checkpoint = training_args.resume_from_checkpoint
|
||||
elif last_checkpoint is not None:
|
||||
checkpoint = last_checkpoint
|
||||
|
||||
quant_trainer.configure_model(trainer.model, quant_trainer_args)
|
||||
|
||||
train_result = trainer.train(resume_from_checkpoint=checkpoint)
|
||||
trainer.save_model() # Saves the tokenizer too for easy upload
|
||||
|
||||
metrics = train_result.metrics
|
||||
max_train_samples = (
|
||||
data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
|
||||
)
|
||||
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
|
||||
|
||||
trainer.log_metrics("train", metrics)
|
||||
trainer.save_metrics("train", metrics)
|
||||
trainer.save_state()
|
||||
|
||||
# Evaluation
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
quant_trainer.configure_model(trainer.model, quant_trainer_args, eval=True)
|
||||
metrics = trainer.evaluate()
|
||||
|
||||
max_eval_samples = data_args.max_eval_samples if data_args.max_eval_samples is not None else len(eval_dataset)
|
||||
metrics["eval_samples"] = min(max_eval_samples, len(eval_dataset))
|
||||
|
||||
trainer.log_metrics("eval", metrics)
|
||||
trainer.save_metrics("eval", metrics)
|
||||
|
||||
# Prediction
|
||||
if training_args.do_predict:
|
||||
logger.info("*** Predict ***")
|
||||
results = trainer.predict(predict_dataset, predict_examples)
|
||||
metrics = results.metrics
|
||||
|
||||
max_predict_samples = (
|
||||
data_args.max_predict_samples if data_args.max_predict_samples is not None else len(predict_dataset)
|
||||
)
|
||||
metrics["predict_samples"] = min(max_predict_samples, len(predict_dataset))
|
||||
|
||||
trainer.log_metrics("predict", metrics)
|
||||
trainer.save_metrics("predict", metrics)
|
||||
|
||||
if training_args.push_to_hub:
|
||||
kwargs = {"finetuned_from": model_args.model_name_or_path, "tasks": "question-answering"}
|
||||
if data_args.dataset_name is not None:
|
||||
kwargs["dataset_tags"] = data_args.dataset_name
|
||||
if data_args.dataset_config_name is not None:
|
||||
kwargs["dataset_args"] = data_args.dataset_config_name
|
||||
kwargs["dataset"] = f"{data_args.dataset_name} {data_args.dataset_config_name}"
|
||||
else:
|
||||
kwargs["dataset"] = data_args.dataset_name
|
||||
|
||||
trainer.push_to_hub(**kwargs)
|
||||
|
||||
if model_args.save_onnx:
|
||||
logger.info("Exporting model to onnx")
|
||||
results = trainer.save_onnx(output_dir=training_args.output_dir)
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@@ -0,0 +1,212 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Team All rights reserved.
|
||||
# Copyright 2021 NVIDIA Corporation. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A subclass of `Trainer` specific to Question-Answering tasks
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
import quant_trainer
|
||||
from transformers import Trainer, is_torch_tpu_available
|
||||
from transformers.trainer_utils import PredictionOutput
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
if is_torch_tpu_available():
|
||||
import torch_xla.core.xla_model as xm
|
||||
import torch_xla.debug.metrics as met
|
||||
|
||||
|
||||
class QuestionAnsweringTrainer(Trainer):
|
||||
def __init__(self, *args, eval_examples=None, post_process_function=None, quant_trainer_args=None, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.eval_examples = eval_examples
|
||||
self.post_process_function = post_process_function
|
||||
self.quant_trainer_args = quant_trainer_args
|
||||
self.calib_num = 128 # default number of calibration samples
|
||||
|
||||
def get_calib_dataloader(self, calib_dataset=None):
|
||||
"""
|
||||
Returns the calibration dataloader :class:`~torch.utils.data.DataLoader`.
|
||||
|
||||
Args:
|
||||
calib_dataset (:obj:`torch.utils.data.Dataset`, `optional`): If provided, overrides the dataset used for calibration.
|
||||
"""
|
||||
if calib_dataset is None and self.calib_dataset is None:
|
||||
raise ValueError("Trainer: calibration requires an calib_dataset.")
|
||||
calib_dataset = calib_dataset if calib_dataset is not None else self.calib_dataset
|
||||
|
||||
calib_dataset = self._remove_unused_columns(calib_dataset, description="Calibration")
|
||||
|
||||
return DataLoader(
|
||||
calib_dataset,
|
||||
batch_size=self.args.eval_batch_size,
|
||||
collate_fn=self.data_collator,
|
||||
drop_last=self.args.dataloader_drop_last,
|
||||
num_workers=self.args.dataloader_num_workers,
|
||||
pin_memory=self.args.dataloader_pin_memory,
|
||||
shuffle=True,
|
||||
)
|
||||
|
||||
def calibrate(self, calib_dataset=None):
|
||||
calib_dataset = self.train_dataset if calib_dataset is None else calib_dataset
|
||||
calib_dataloader = self.get_calib_dataloader(calib_dataset)
|
||||
|
||||
model = self.model
|
||||
quant_trainer.configure_model(model, self.quant_trainer_args, calib=True)
|
||||
model.eval()
|
||||
quant_trainer.enable_calibration(model)
|
||||
|
||||
logger.info("***** Running calibration *****")
|
||||
logger.info(f" Num examples = {self.calib_num}")
|
||||
logger.info(f" Batch size = {calib_dataloader.batch_size}")
|
||||
|
||||
for step, inputs in enumerate(calib_dataloader):
|
||||
# Prediction step
|
||||
loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only=True)
|
||||
if (step + 1) * calib_dataloader.batch_size >= self.calib_num:
|
||||
break
|
||||
|
||||
quant_trainer.finish_calibration(model, self.quant_trainer_args)
|
||||
self.model = model
|
||||
|
||||
def evaluate(self, eval_dataset=None, eval_examples=None, ignore_keys=None, metric_key_prefix: str = "eval"):
|
||||
eval_dataset = self.eval_dataset if eval_dataset is None else eval_dataset
|
||||
eval_dataloader = self.get_eval_dataloader(eval_dataset)
|
||||
eval_examples = self.eval_examples if eval_examples is None else eval_examples
|
||||
|
||||
# Temporarily disable metric computation, we will do it in the loop here.
|
||||
compute_metrics = self.compute_metrics
|
||||
self.compute_metrics = None
|
||||
eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
|
||||
try:
|
||||
output = eval_loop(
|
||||
eval_dataloader,
|
||||
description="Evaluation",
|
||||
# No point gathering the predictions if there are no metrics, otherwise we defer to
|
||||
# self.args.prediction_loss_only
|
||||
prediction_loss_only=True if compute_metrics is None else None,
|
||||
ignore_keys=ignore_keys,
|
||||
)
|
||||
finally:
|
||||
self.compute_metrics = compute_metrics
|
||||
|
||||
if self.post_process_function is not None and self.compute_metrics is not None:
|
||||
eval_preds = self.post_process_function(eval_examples, eval_dataset, output.predictions)
|
||||
metrics = self.compute_metrics(eval_preds)
|
||||
|
||||
# Prefix all keys with metric_key_prefix + '_'
|
||||
for key in list(metrics.keys()):
|
||||
if not key.startswith(f"{metric_key_prefix}_"):
|
||||
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
|
||||
|
||||
self.log(metrics)
|
||||
else:
|
||||
metrics = {}
|
||||
|
||||
if self.args.tpu_metrics_debug or self.args.debug:
|
||||
# tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.)
|
||||
xm.master_print(met.metrics_report())
|
||||
|
||||
self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, metrics)
|
||||
return metrics
|
||||
|
||||
def predict(self, predict_dataset, predict_examples, ignore_keys=None, metric_key_prefix: str = "test"):
|
||||
predict_dataloader = self.get_test_dataloader(predict_dataset)
|
||||
|
||||
# Temporarily disable metric computation, we will do it in the loop here.
|
||||
compute_metrics = self.compute_metrics
|
||||
self.compute_metrics = None
|
||||
eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
|
||||
try:
|
||||
output = eval_loop(
|
||||
predict_dataloader,
|
||||
description="Prediction",
|
||||
# No point gathering the predictions if there are no metrics, otherwise we defer to
|
||||
# self.args.prediction_loss_only
|
||||
prediction_loss_only=True if compute_metrics is None else None,
|
||||
ignore_keys=ignore_keys,
|
||||
)
|
||||
finally:
|
||||
self.compute_metrics = compute_metrics
|
||||
|
||||
if self.post_process_function is None or self.compute_metrics is None:
|
||||
return output
|
||||
|
||||
predictions = self.post_process_function(predict_examples, predict_dataset, output.predictions, "predict")
|
||||
metrics = self.compute_metrics(predictions)
|
||||
|
||||
# Prefix all keys with metric_key_prefix + '_'
|
||||
for key in list(metrics.keys()):
|
||||
if not key.startswith(f"{metric_key_prefix}_"):
|
||||
metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key)
|
||||
|
||||
return PredictionOutput(predictions=predictions.predictions, label_ids=predictions.label_ids, metrics=metrics)
|
||||
|
||||
def save_onnx(self, output_dir="./"):
|
||||
eval_dataset = self.eval_dataset
|
||||
eval_dataloader = self.get_eval_dataloader(eval_dataset)
|
||||
|
||||
batch = next(iter(eval_dataloader))
|
||||
|
||||
# pick one device for the export so the model and the example inputs are consistent
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
# convert to tuple
|
||||
input_tuple = tuple(v.to(device) for k, v in batch.items())
|
||||
|
||||
logger.info("Converting model to be onnx compatible")
|
||||
from pytorch_quantization.nn import TensorQuantizer
|
||||
|
||||
TensorQuantizer.use_fb_fake_quant = True
|
||||
|
||||
model = self.model.to(device)
|
||||
|
||||
model.eval()
|
||||
model.float()
|
||||
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
quant_trainer.configure_model(model_to_save, self.quant_trainer_args)
|
||||
|
||||
output_model_file = os.path.join(output_dir, "model.onnx")
|
||||
logger.info(f"exporting model to {output_model_file}")
|
||||
|
||||
axes = {0: "batch_size", 1: "seq_len"}
|
||||
|
||||
torch.onnx.export(
|
||||
model_to_save,
|
||||
input_tuple,
|
||||
output_model_file,
|
||||
export_params=True,
|
||||
opset_version=13,
|
||||
do_constant_folding=True,
|
||||
input_names=["input_ids", "attention_mask", "token_type_ids"],
|
||||
output_names=["output_start_logits", "output_end_logits"],
|
||||
dynamic_axes={
|
||||
"input_ids": axes,
|
||||
"attention_mask": axes,
|
||||
"token_type_ids": axes,
|
||||
"output_start_logits": axes,
|
||||
"output_end_logits": axes,
|
||||
},
|
||||
verbose=True,
|
||||
)
|
||||
logger.info("onnx export finished")
|
427
examples/research_projects/quantization-qdqbert/utils_qa.py
Normal file
@@ -0,0 +1,427 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2020 The HuggingFace Team All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Post-processing utilities for question answering.
|
||||
"""
|
||||
import collections
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import numpy as np
|
||||
from tqdm.auto import tqdm
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def postprocess_qa_predictions(
|
||||
examples,
|
||||
features,
|
||||
predictions: Tuple[np.ndarray, np.ndarray],
|
||||
version_2_with_negative: bool = False,
|
||||
n_best_size: int = 20,
|
||||
max_answer_length: int = 30,
|
||||
null_score_diff_threshold: float = 0.0,
|
||||
output_dir: Optional[str] = None,
|
||||
prefix: Optional[str] = None,
|
||||
log_level: Optional[int] = logging.WARNING,
|
||||
):
|
||||
"""
|
||||
Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
|
||||
original contexts. This is the base postprocessing function for models that only return start and end logits.
|
||||
|
||||
Args:
|
||||
examples: The non-preprocessed dataset (see the main script for more information).
|
||||
features: The processed dataset (see the main script for more information).
|
||||
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
|
||||
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
|
||||
first dimension must match the number of elements of :obj:`features`.
|
||||
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the underlying dataset contains examples with no answers.
|
||||
n_best_size (:obj:`int`, `optional`, defaults to 20):
|
||||
The total number of n-best predictions to generate when looking for an answer.
|
||||
max_answer_length (:obj:`int`, `optional`, defaults to 30):
|
||||
The maximum length of an answer that can be generated. This is needed because the start and end predictions
|
||||
are not conditioned on one another.
|
||||
null_score_diff_threshold (:obj:`float`, `optional`, defaults to 0):
|
||||
The threshold used to select the null answer: if the best answer has a score that is less than the score of
|
||||
the null answer minus this threshold, the null answer is selected for this example (note that the score of
|
||||
the null answer for an example giving several features is the minimum of the scores for the null answer on
|
||||
each feature: all features must be aligned on the fact they `want` to predict a null answer).
|
||||
|
||||
Only useful when :obj:`version_2_with_negative` is :obj:`True`.
|
||||
output_dir (:obj:`str`, `optional`):
|
||||
If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
|
||||
:obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
|
||||
answers, are saved in `output_dir`.
|
||||
prefix (:obj:`str`, `optional`):
|
||||
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
|
||||
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
|
||||
``logging`` log level (e.g., ``logging.WARNING``)
|
||||
"""
|
||||
assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
|
||||
all_start_logits, all_end_logits = predictions
|
||||
|
||||
assert len(predictions[0]) == len(features), f"Got {len(predictions[0])} predictions and {len(features)} features."
|
||||
|
||||
# Build a map example to its corresponding features.
|
||||
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
|
||||
features_per_example = collections.defaultdict(list)
|
||||
for i, feature in enumerate(features):
|
||||
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
|
||||
|
||||
# The dictionaries we have to fill.
|
||||
all_predictions = collections.OrderedDict()
|
||||
all_nbest_json = collections.OrderedDict()
|
||||
if version_2_with_negative:
|
||||
scores_diff_json = collections.OrderedDict()
|
||||
|
||||
# Logging.
|
||||
logger.setLevel(log_level)
|
||||
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
|
||||
|
||||
# Let's loop over all the examples!
|
||||
for example_index, example in enumerate(tqdm(examples)):
|
||||
# Those are the indices of the features associated to the current example.
|
||||
feature_indices = features_per_example[example_index]
|
||||
|
||||
min_null_prediction = None
|
||||
prelim_predictions = []
|
||||
|
||||
# Looping through all the features associated to the current example.
|
||||
for feature_index in feature_indices:
|
||||
# We grab the predictions of the model for this feature.
|
||||
start_logits = all_start_logits[feature_index]
|
||||
end_logits = all_end_logits[feature_index]
|
||||
# This is what will allow us to map some of the positions in our logits to spans of text in the original
|
||||
# context.
|
||||
offset_mapping = features[feature_index]["offset_mapping"]
|
||||
# Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
|
||||
# available in the current feature.
|
||||
token_is_max_context = features[feature_index].get("token_is_max_context", None)
|
||||
|
||||
# Update minimum null prediction.
|
||||
feature_null_score = start_logits[0] + end_logits[0]
|
||||
if min_null_prediction is None or min_null_prediction["score"] > feature_null_score:
|
||||
min_null_prediction = {
|
||||
"offsets": (0, 0),
|
||||
"score": feature_null_score,
|
||||
"start_logit": start_logits[0],
|
||||
"end_logit": end_logits[0],
|
||||
}
|
||||
|
||||
# Go through all possibilities for the `n_best_size` greatest start and end logits.
|
||||
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
|
||||
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
|
||||
for start_index in start_indexes:
|
||||
for end_index in end_indexes:
|
||||
# Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
|
||||
# to part of the input_ids that are not in the context.
|
||||
if (
|
||||
start_index >= len(offset_mapping)
|
||||
or end_index >= len(offset_mapping)
|
||||
or offset_mapping[start_index] is None
|
||||
or offset_mapping[end_index] is None
|
||||
):
|
||||
continue
|
||||
# Don't consider answers with a length that is either < 0 or > max_answer_length.
|
||||
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
|
||||
continue
|
||||
# Don't consider answers that don't have the maximum context available (if such information is
|
||||
# provided).
|
||||
if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
|
||||
continue
|
||||
prelim_predictions.append(
|
||||
{
|
||||
"offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
|
||||
"score": start_logits[start_index] + end_logits[end_index],
|
||||
"start_logit": start_logits[start_index],
|
||||
"end_logit": end_logits[end_index],
|
||||
}
|
||||
)
|
||||
if version_2_with_negative:
|
||||
# Add the minimum null prediction
|
||||
prelim_predictions.append(min_null_prediction)
|
||||
null_score = min_null_prediction["score"]
|
||||
|
||||
# Only keep the best `n_best_size` predictions.
|
||||
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
|
||||
|
||||
# Add back the minimum null prediction if it was removed because of its low score.
|
||||
if version_2_with_negative and not any(p["offsets"] == (0, 0) for p in predictions):
|
||||
predictions.append(min_null_prediction)
|
||||
|
||||
# Use the offsets to gather the answer text in the original context.
|
||||
context = example["context"]
|
||||
for pred in predictions:
|
||||
offsets = pred.pop("offsets")
|
||||
pred["text"] = context[offsets[0] : offsets[1]]
|
||||
|
||||
# In the very rare edge case where we don't have a single non-null prediction, we create a fake prediction to avoid
|
||||
# failure.
|
||||
if len(predictions) == 0 or (len(predictions) == 1 and predictions[0]["text"] == ""):
|
||||
predictions.insert(0, {"text": "empty", "start_logit": 0.0, "end_logit": 0.0, "score": 0.0})
|
||||
|
||||
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
|
||||
# the LogSumExp trick).
|
||||
scores = np.array([pred.pop("score") for pred in predictions])
|
||||
exp_scores = np.exp(scores - np.max(scores))
|
||||
probs = exp_scores / exp_scores.sum()
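# Tiny numeric check of the shift-by-max trick above (values are illustrative):
#   scores = [3.2, 1.1, -0.5] -> exp(scores - 3.2) = [1.0, 0.122, 0.025]
#   probs ~ [0.872, 0.107, 0.021], which sums to 1; subtracting the max only prevents overflow.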
|
||||
|
||||
# Include the probabilities in our predictions.
|
||||
for prob, pred in zip(probs, predictions):
|
||||
pred["probability"] = prob
|
||||
|
||||
# Pick the best prediction. If the null answer is not possible, this is easy.
|
||||
if not version_2_with_negative:
|
||||
all_predictions[example["id"]] = predictions[0]["text"]
|
||||
else:
|
||||
# Otherwise we first need to find the best non-empty prediction.
|
||||
i = 0
|
||||
while predictions[i]["text"] == "":
|
||||
i += 1
|
||||
best_non_null_pred = predictions[i]
|
||||
|
||||
# Then we compare to the null prediction using the threshold.
|
||||
score_diff = null_score - best_non_null_pred["start_logit"] - best_non_null_pred["end_logit"]
|
||||
scores_diff_json[example["id"]] = float(score_diff) # To be JSON-serializable.
|
||||
if score_diff > null_score_diff_threshold:
|
||||
all_predictions[example["id"]] = ""
|
||||
else:
|
||||
all_predictions[example["id"]] = best_non_null_pred["text"]
|
||||
|
||||
# Make `predictions` JSON-serializable by casting np.float back to float.
|
||||
all_nbest_json[example["id"]] = [
|
||||
{k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
|
||||
for pred in predictions
|
||||
]
|
||||
|
||||
# If we have an output_dir, let's save all those dicts.
|
||||
if output_dir is not None:
|
||||
assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
|
||||
|
||||
prediction_file = os.path.join(
|
||||
output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
|
||||
)
|
||||
nbest_file = os.path.join(
|
||||
output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
|
||||
)
|
||||
if version_2_with_negative:
|
||||
null_odds_file = os.path.join(
|
||||
output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
|
||||
)
|
||||
|
||||
logger.info(f"Saving predictions to {prediction_file}.")
|
||||
with open(prediction_file, "w") as writer:
|
||||
writer.write(json.dumps(all_predictions, indent=4) + "\n")
|
||||
logger.info(f"Saving nbest_preds to {nbest_file}.")
|
||||
with open(nbest_file, "w") as writer:
|
||||
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
|
||||
if version_2_with_negative:
|
||||
logger.info(f"Saving null_odds to {null_odds_file}.")
|
||||
with open(null_odds_file, "w") as writer:
|
||||
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
|
||||
|
||||
return all_predictions
|
||||
|
||||
|
||||
def postprocess_qa_predictions_with_beam_search(
|
||||
examples,
|
||||
features,
|
||||
predictions: Tuple[np.ndarray, np.ndarray],
|
||||
version_2_with_negative: bool = False,
|
||||
n_best_size: int = 20,
|
||||
max_answer_length: int = 30,
|
||||
start_n_top: int = 5,
|
||||
end_n_top: int = 5,
|
||||
output_dir: Optional[str] = None,
|
||||
prefix: Optional[str] = None,
|
||||
log_level: Optional[int] = logging.WARNING,
|
||||
):
|
||||
"""
|
||||
Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
|
||||
original contexts. This is the postprocessing function for models that return start and end logits, indices, as well as
|
||||
cls token predictions.
|
||||
|
||||
Args:
|
||||
examples: The non-preprocessed dataset (see the main script for more information).
|
||||
features: The processed dataset (see the main script for more information).
|
||||
predictions (:obj:`Tuple[np.ndarray, np.ndarray]`):
|
||||
The predictions of the model: two arrays containing the start logits and the end logits respectively. Its
|
||||
first dimension must match the number of elements of :obj:`features`.
|
||||
version_2_with_negative (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether or not the underlying dataset contains examples with no answers.
|
||||
n_best_size (:obj:`int`, `optional`, defaults to 20):
|
||||
The total number of n-best predictions to generate when looking for an answer.
|
||||
max_answer_length (:obj:`int`, `optional`, defaults to 30):
|
||||
The maximum length of an answer that can be generated. This is needed because the start and end predictions
|
||||
are not conditioned on one another.
|
||||
start_n_top (:obj:`int`, `optional`, defaults to 5):
|
||||
The number of top start logits to keep when searching for the :obj:`n_best_size` predictions.
|
||||
end_n_top (:obj:`int`, `optional`, defaults to 5):
|
||||
The number of top end logits to keep when searching for the :obj:`n_best_size` predictions.
|
||||
output_dir (:obj:`str`, `optional`):
|
||||
If provided, the dictionaries of predictions, n_best predictions (with their scores and logits) and, if
|
||||
:obj:`version_2_with_negative=True`, the dictionary of the scores differences between best and null
|
||||
answers, are saved in `output_dir`.
|
||||
prefix (:obj:`str`, `optional`):
|
||||
If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
|
||||
log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
|
||||
``logging`` log level (e.g., ``logging.WARNING``)
|
||||
"""
|
||||
assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
|
||||
start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
|
||||
|
||||
assert len(predictions[0]) == len(
|
||||
features
|
||||
), f"Got {len(predictions[0])} predicitions and {len(features)} features."
|
||||
|
||||
# Build a map example to its corresponding features.
|
||||
example_id_to_index = {k: i for i, k in enumerate(examples["id"])}
|
||||
features_per_example = collections.defaultdict(list)
|
||||
for i, feature in enumerate(features):
|
||||
features_per_example[example_id_to_index[feature["example_id"]]].append(i)
|
||||
|
||||
# The dictionaries we have to fill.
|
||||
all_predictions = collections.OrderedDict()
|
||||
all_nbest_json = collections.OrderedDict()
|
||||
scores_diff_json = collections.OrderedDict() if version_2_with_negative else None
|
||||
|
||||
# Logging.
|
||||
logger.setLevel(log_level)
|
||||
logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")
|
||||
|
||||
# Let's loop over all the examples!
|
||||
for example_index, example in enumerate(tqdm(examples)):
|
||||
# Those are the indices of the features associated to the current example.
|
||||
feature_indices = features_per_example[example_index]
|
||||
|
||||
min_null_score = None
|
||||
prelim_predictions = []
|
||||
|
||||
# Looping through all the features associated to the current example.
|
||||
for feature_index in feature_indices:
|
||||
# We grab the predictions of the model for this feature.
|
||||
start_log_prob = start_top_log_probs[feature_index]
|
||||
start_indexes = start_top_index[feature_index]
|
||||
end_log_prob = end_top_log_probs[feature_index]
|
||||
end_indexes = end_top_index[feature_index]
|
||||
feature_null_score = cls_logits[feature_index]
|
||||
# This is what will allow us to map some of the positions in our logits to spans of text in the original
|
||||
# context.
|
||||
offset_mapping = features[feature_index]["offset_mapping"]
|
||||
# Optional `token_is_max_context`, if provided we will remove answers that do not have the maximum context
|
||||
# available in the current feature.
|
||||
token_is_max_context = features[feature_index].get("token_is_max_context", None)
|
||||
|
||||
# Update minimum null prediction
|
||||
if min_null_score is None or feature_null_score < min_null_score:
|
||||
min_null_score = feature_null_score
|
||||
|
||||
# Go through all possibilities for the `start_n_top`/`end_n_top` greatest start and end logits.
|
||||
for i in range(start_n_top):
|
||||
for j in range(end_n_top):
|
||||
start_index = int(start_indexes[i])
|
||||
j_index = i * end_n_top + j
|
||||
end_index = int(end_indexes[j_index])
|
||||
# Don't consider out-of-scope answers (last part of the test should be unnecessary because of the
|
||||
# p_mask but let's not take any risk)
|
||||
if (
|
||||
start_index >= len(offset_mapping)
|
||||
or end_index >= len(offset_mapping)
|
||||
or offset_mapping[start_index] is None
|
||||
or offset_mapping[end_index] is None
|
||||
):
|
||||
continue
|
||||
# Don't consider answers with a length that is either negative or > max_answer_length.
|
||||
if end_index < start_index or end_index - start_index + 1 > max_answer_length:
|
||||
continue
|
||||
# Don't consider answers that don't have the maximum context available (if such information is
|
||||
# provided).
|
||||
if token_is_max_context is not None and not token_is_max_context.get(str(start_index), False):
|
||||
continue
|
||||
prelim_predictions.append(
|
||||
{
|
||||
"offsets": (offset_mapping[start_index][0], offset_mapping[end_index][1]),
|
||||
"score": start_log_prob[i] + end_log_prob[j_index],
|
||||
"start_log_prob": start_log_prob[i],
|
||||
"end_log_prob": end_log_prob[j_index],
|
||||
}
|
||||
)
|
||||
|
||||
# Only keep the best `n_best_size` predictions.
|
||||
predictions = sorted(prelim_predictions, key=lambda x: x["score"], reverse=True)[:n_best_size]
|
||||
|
||||
# Use the offsets to gather the answer text in the original context.
|
||||
context = example["context"]
|
||||
for pred in predictions:
|
||||
offsets = pred.pop("offsets")
|
||||
pred["text"] = context[offsets[0] : offsets[1]]
|
||||
|
||||
# In the very rare edge case where we don't have a single non-null prediction, we create a fake prediction to avoid
|
||||
# failure.
|
||||
if len(predictions) == 0:
|
||||
predictions.insert(0, {"text": "", "start_logit": -1e-6, "end_logit": -1e-6, "score": -2e-6})
|
||||
|
||||
# Compute the softmax of all scores (we do it with numpy to stay independent from torch/tf in this file, using
|
||||
# the LogSumExp trick).
|
||||
scores = np.array([pred.pop("score") for pred in predictions])
|
||||
exp_scores = np.exp(scores - np.max(scores))
|
||||
probs = exp_scores / exp_scores.sum()
|
||||
|
||||
# Include the probabilities in our predictions.
|
||||
for prob, pred in zip(probs, predictions):
|
||||
pred["probability"] = prob
|
||||
|
||||
# Pick the best prediction and set the probability for the null answer.
|
||||
all_predictions[example["id"]] = predictions[0]["text"]
|
||||
if version_2_with_negative:
|
||||
scores_diff_json[example["id"]] = float(min_null_score)
|
||||
|
||||
# Make `predictions` JSON-serializable by casting np.float back to float.
|
||||
all_nbest_json[example["id"]] = [
|
||||
{k: (float(v) if isinstance(v, (np.float16, np.float32, np.float64)) else v) for k, v in pred.items()}
|
||||
for pred in predictions
|
||||
]
|
||||
|
||||
# If we have an output_dir, let's save all those dicts.
|
||||
if output_dir is not None:
|
||||
assert os.path.isdir(output_dir), f"{output_dir} is not a directory."
|
||||
|
||||
prediction_file = os.path.join(
|
||||
output_dir, "predictions.json" if prefix is None else f"{prefix}_predictions.json"
|
||||
)
|
||||
nbest_file = os.path.join(
|
||||
output_dir, "nbest_predictions.json" if prefix is None else f"{prefix}_nbest_predictions.json"
|
||||
)
|
||||
if version_2_with_negative:
|
||||
null_odds_file = os.path.join(
|
||||
output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
|
||||
)
|
||||
|
||||
logger.info(f"Saving predictions to {prediction_file}.")
|
||||
with open(prediction_file, "w") as writer:
|
||||
writer.write(json.dumps(all_predictions, indent=4) + "\n")
|
||||
logger.info(f"Saving nbest_preds to {nbest_file}.")
|
||||
with open(nbest_file, "w") as writer:
|
||||
writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
|
||||
if version_2_with_negative:
|
||||
logger.info(f"Saving null_odds to {null_odds_file}.")
|
||||
with open(null_odds_file, "w") as writer:
|
||||
writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
|
||||
|
||||
return all_predictions, scores_diff_json
|
@@ -14,7 +14,7 @@ import lightning_base
|
||||
from convert_pl_checkpoint_to_hf import convert_pl_to_hf
|
||||
from distillation import distill_main
|
||||
from finetune import SummarizationModule, main
|
||||
from huggingface_hub.hf_api import HfApi
|
||||
from huggingface_hub import list_models
|
||||
from parameterized import parameterized
|
||||
from run_eval import generate_summaries_or_translations
|
||||
from transformers import AutoConfig, AutoModelForSeq2SeqLM
|
||||
@@ -130,7 +130,7 @@ class TestSummarizationDistiller(TestCasePlus):
|
||||
def test_hub_configs(self):
|
||||
"""I put require_torch_gpu cause I only want this to run with self-scheduled."""
|
||||
|
||||
model_list = HfApi().list_models()
|
||||
model_list = list_models()
|
||||
org = "sshleifer"
|
||||
model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
|
||||
allowed_to_be_broken = ["sshleifer/blenderbot-3B", "sshleifer/blenderbot-90M"]
|
||||
|
@@ -54,12 +54,6 @@ class ModelArguments:
|
||||
freeze_feature_extractor: Optional[bool] = field(
|
||||
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
||||
)
|
||||
gradient_checkpointing: Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
|
||||
},
|
||||
)
|
||||
verbose_logging: Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to log verbose messages or not."},
|
||||
@@ -352,7 +346,7 @@ def main():
|
||||
model = Wav2Vec2ForCTC.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
gradient_checkpointing=model_args.gradient_checkpointing,
|
||||
gradient_checkpointing=training_args.gradient_checkpointing,
|
||||
vocab_size=len(processor.tokenizer),
|
||||
)
|
||||
|
||||
|
@@ -84,12 +84,6 @@ class ModelArguments:
|
||||
"vectors will be masked along the time axis. This is only relevant if ``apply_spec_augment is True``."
|
||||
},
|
||||
)
|
||||
gradient_checkpointing: Optional[bool] = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
|
||||
},
|
||||
)
|
||||
layerdrop: Optional[float] = field(default=0.0, metadata={"help": "The LayerDrop probability."})
|
||||
|
||||
|
||||
@@ -373,7 +367,7 @@ def main():
|
||||
hidden_dropout=model_args.hidden_dropout,
|
||||
feat_proj_dropout=model_args.feat_proj_dropout,
|
||||
mask_time_prob=model_args.mask_time_prob,
|
||||
gradient_checkpointing=model_args.gradient_checkpointing,
|
||||
gradient_checkpointing=training_args.gradient_checkpointing,
|
||||
layerdrop=model_args.layerdrop,
|
||||
ctc_loss_reduction="mean",
|
||||
pad_token_id=processor.tokenizer.pad_token_id,
|
||||
|
@@ -50,12 +50,6 @@ class ModelArguments:
|
||||
freeze_feature_extractor: Optional[bool] = field(
|
||||
default=True, metadata={"help": "Whether to freeze the feature extractor layers of the model."}
|
||||
)
|
||||
gradient_checkpointing: Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."
|
||||
},
|
||||
)
|
||||
verbose_logging: Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to log verbose messages or not."},
|
||||
@@ -370,7 +364,7 @@ def main():
|
||||
config = Wav2Vec2Config.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
gradient_checkpointing=model_args.gradient_checkpointing,
|
||||
gradient_checkpointing=training_args.gradient_checkpointing,
|
||||
)
|
||||
|
||||
if not config.do_stable_layer_norm or config.feat_extract_norm != "layer":
|
||||
|
2
examples/tensorflow/language-modeling/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
datasets >= 1.8.0
|
||||
sentencepiece != 0.1.92
|
@@ -30,6 +30,7 @@ import random
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from functools import partial
|
||||
from itertools import chain
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
@@ -43,8 +44,8 @@ import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
CONFIG_NAME,
|
||||
MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||
TF2_WEIGHTS_NAME,
|
||||
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
HfArgumentParser,
|
||||
@@ -57,8 +58,8 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_CAUSAL_LM_MAPPING.keys())
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/language-modeling/requirements.txt")
|
||||
MODEL_CONFIG_CLASSES = list(TF_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
# endregion
|
||||
|
||||
@@ -406,7 +407,7 @@ def main():
|
||||
# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.
|
||||
def group_texts(examples):
|
||||
# Concatenate all texts.
|
||||
concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
|
||||
concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}
|
||||
total_length = len(concatenated_examples[list(examples.keys())[0]])
|
||||
# We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
|
||||
# customize this part to your needs.
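A quick check (not part of the diff) that the itertools.chain rewrite above is behaviour-preserving; the sample lists are arbitrary:

from itertools import chain

batches = [[1, 2], [3], [4, 5]]
assert sum(batches, []) == list(chain(*batches)) == [1, 2, 3, 4, 5]
# chain flattens in a single pass, avoiding the repeated list copies that sum(..., []) performs.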
|
||||
|
Some files were not shown because too many files have changed in this diff.