Compare commits

...

651 Commits

Author SHA1 Message Date
d7503f5068 Add docs README 2022-03-08 11:55:07 -05:00
e82782cc10 Update for doc-builder -> hf-doc-utils 2022-03-08 11:52:43 -05:00
70203b5937 TF generate refactor - past without encoder outputs (#15944)
* Remove packed past from generation_tf_utils

* update models with the new past format

* update template accordingly
2022-03-08 14:46:44 +00:00
62d847602a Update TF multiple choice example (#15868) 2022-03-08 13:16:34 +00:00
ab2f8d12a7 add hf hub to env version command (#15981) 2022-03-08 14:03:03 +01:00
72983303c5 Fix TFEncoderDecoderModelTest - Pytorch device (#15979)
* fix device

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-03-08 13:37:20 +01:00
f5a080dd10 Do a pull in case docs were updated during build (#15922) 2022-03-08 07:19:41 -05:00
91fb62d01c Speedup training by using numpy instead of jnp for batch shuffling (#15963)
Speedup training by using numpy instead of jnp for batch shuffling

Co-authored-by: Yeb Havinga <y.t.havinga@mgrid.net>
2022-03-08 12:18:38 +01:00
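A minimal sketch of the idea behind #15963 (function names here are hypothetical): generate shuffled batch indices on the host with NumPy instead of on-device with `jax.numpy`, avoiding an accelerator round-trip for plain index shuffling.

```python
import numpy as np
import jax

def batch_indices_jnp(rng, num_samples):
    # on-device permutation: runs on the accelerator
    return jax.random.permutation(rng, num_samples)

def batch_indices_np(seed, num_samples):
    # host-side permutation: plain NumPy, cheaper for simple index shuffling
    return np.random.default_rng(seed).permutation(num_samples)

rng = jax.random.PRNGKey(0)
print(batch_indices_jnp(rng, 8))
print(batch_indices_np(0, 8))
```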
ea07064a5c Returning outputs only when asked for, for MaskFormer. (#15936)
* Returning outputs only when asked for, for MaskFormer.

* Adding `output_auxiliary_logits` to the config.
2022-03-08 11:17:57 +01:00
b19f3e69a0 [Tests] Fix ViTMAE integration test (#15949)
* Fix test across both cpu and gpu

* Fix typo
2022-03-08 10:49:44 +01:00
9879a1d5f0 Fix LayoutLMv2 test (#15939)
* Fix LayoutLMv2 test

* Update black
2022-03-08 10:49:30 +01:00
8b9ae45549 Set scale_embedding to False in some TF tests (#15952)
* set scale_embedding to False to avoid large (> 1e-5) output differences between PT/TF

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-03-07 22:14:33 +01:00
38cc35069c Update training scripts docs (#15931)
* 📝 first draft

* 🖍 apply feedback

* 🖍 remove examples from toctree

* 🗑 remove examples from docs/source
2022-03-07 13:29:14 -06:00
c87cfd653c Better error message when inputs are empty 2022-03-07 13:29:16 -05:00
e9fa7cd5d7 Make is_thing_map in Feature Extractor post_process_panoptic_segmentation default to all instances (#15954)
* is_thing_map defaults to all instances

* better naming

* control flow

* resolving conversations
2022-03-07 19:10:32 +01:00
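A hedged illustration of the new default from #15954 (the checkpoint name is an assumption based on the "change weights to facebook" note elsewhere in this log): when `is_thing_map` is omitted, post-processing now treats every class as an instance.

```python
from transformers import MaskFormerFeatureExtractor, MaskFormerForInstanceSegmentation

# checkpoint name is an assumption
feature_extractor = MaskFormerFeatureExtractor.from_pretrained("facebook/maskformer-swin-base-coco")
model = MaskFormerForInstanceSegmentation.from_pretrained("facebook/maskformer-swin-base-coco")

# inputs = feature_extractor(images=image, return_tensors="pt")  # `image` is a PIL image
# outputs = model(**inputs)
# With no is_thing_map passed, every class now defaults to being an instance ("thing"):
# result = feature_extractor.post_process_panoptic_segmentation(outputs)
```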
2596f95e84 Fix Embedding Module Bug in Flax Models (#15920) 2022-03-07 18:17:45 +01:00
1a62b25caf Backprop Test for Freeze FlaxWav2Vec2 Feature Encoder (#15938)
* Backprop Test for Freeze FlaxWav2Vec2 Feature Encoder

* remove jnp.ndarray type suggestion

* assert frozen grads are precisely zero
2022-03-07 18:10:15 +01:00
544fd9876b Support modern list type hints in HfArgumentParser (#15951)
* Support modern list type hint in HfArgumentParser

* Fix formatting with black
2022-03-07 10:22:48 -05:00
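A minimal sketch of what #15951 enables (the dataclass and its fields are made up): PEP 585 built-in generics like `list[int]` in an `HfArgumentParser` dataclass, on Python 3.9+.

```python
from dataclasses import dataclass, field
from transformers import HfArgumentParser

@dataclass
class TrainingConfig:
    # illustrative fields; built-in generics (PEP 585) now work alongside typing.List
    seeds: list[int] = field(default_factory=lambda: [42])
    learning_rates: list[float] = field(default_factory=lambda: [1e-4])

parser = HfArgumentParser(TrainingConfig)
(config,) = parser.parse_args_into_dataclasses(args=["--seeds", "1", "2", "3"])
print(config.seeds)  # [1, 2, 3]
```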
60b81dfa6f remove re-definition of FlaxWav2Vec2ForCTCModule (#15965) 2022-03-07 14:58:44 +01:00
ef9c3ca348 [Bug Fix] Beam search example in docs fails & a fix (integrating max_length in BeamScorer.finalize()) (#15555)
* added the test and fix

* had left out a comment
2022-03-07 09:10:18 +01:00
9932ee4b4b made MaskFormerModelTest faster (#15942) 2022-03-04 19:11:48 +01:00
e8efaecb87 Move dependency to call method (#15941) 2022-03-04 18:53:54 +01:00
5c6f57ee75 Constrained Beam Search [*With* Disjunctive Decoding] (#15761)
* added classes to get started with constrained beam search

* in progress, think i can directly force tokens now but not yet with the round robin

* think now i have total control, now need to code the bank selection

* technically works as desired, need to optimize and fix design choices leading to undesirable outputs

* complete PR #1 without disjunctive decoding

* removed incorrect tests

* Delete k.txt

* Delete test.py

* Delete test.sh

* revert changes to test scripts

* genutils

* full implementation with testing, no disjunctive yet

* shifted docs

* passing all tests realistically ran locally

* removing accidentally included print statements

* fixed source of error in initial PR test

* fixing the get_device() vs device trap

* fixed documentation docstrings about constrained_beam_search

* fixed tests failing for Speech2TextModel's floating point inputs

* fix cuda long tensor

* added examples and testing for them and found & fixed a bug in beam_search and constrained_beam_search

* deleted accidentally added test halting code with assert False

* code reformat

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

* fixing based on comments on PR

* took out the testing code that should work but fails without the beam search modification; style changes

* fixing comments issues

* docstrings for ConstraintListState

* typo in PhrasalConstraint docstring

* docstrings improvements

* finished adding what is sort of an opinionated implementation of disjunctive generation, but it revealed errors in inner beam search logic during testing.

* fixed bug found in constrained beam search that used beam_idx that were not global across all the batches

* disjunctive constraint working 100% correctly

* passing all tests

* Accidentally included mlruns

* Update src/transformers/generation_beam_constraints.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/generation_beam_constraints.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* complete overhaul of type complexities and other nits

* strict type checks in generate()

* fixing second round of feedback by narsil

* fixed failing generation test because of type check overhaul

* generation test fail fix

* fixing test fails

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-03-04 18:18:34 +01:00
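A hedged sketch of the disjunctive decoding added in #15761, as exposed through `generate` (the model and word choices here are arbitrary): a nested `force_words_ids` entry lists interchangeable forms, and constrained beam search must emit at least one of them.

```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# One disjunctive constraint: the generation must contain at least one variant.
flexible_words = ["scream", "screams", "screaming", "screamed"]
force_words_ids = [tokenizer(flexible_words, add_prefix_space=True, add_special_tokens=False).input_ids]

input_ids = tokenizer("The child", return_tensors="pt").input_ids
out = model.generate(input_ids, force_words_ids=force_words_ids, num_beams=10, no_repeat_ngram_size=1)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```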
040c11f6da Tests for MaskFormerFeatureExtractor's post_process*** methods (#15929)
* proper tests for post_process*** methods in feature extractor

* mask th == 0

* Update tests/maskformer/test_feature_extraction_maskformer.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* make style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-03-04 18:04:19 +01:00
f0aacc140b Do not change the output from tuple to list - to match PT's version (#15918)
* Do not change the output from tuple to list - to match PT's version

* Fix the same issues for 5 other models and the template

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-03-04 17:50:24 +01:00
10b76987fc [FlaxT5 Example] fix flax t5 example pretraining (#15835) 2022-03-04 17:04:43 +01:00
01485ceec3 Add missing support for Flax XLM-RoBERTa (#15900)
* Adding Flax XLM-RoBERTa

* Add Flax to __init__

* Adding doc and dummy objects

* Add tests

* Add Flax XLM-R models autodoc

* Fix tests

* Add Flax XLM-RoBERTa to TEST_FILES_WITH_NO_COMMON_TESTS

* Update src/transformers/models/xlm_roberta/modeling_flax_xlm_roberta.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update tests/xlm_roberta/test_modeling_flax_xlm_roberta.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Update tests/xlm_roberta/test_modeling_flax_xlm_roberta.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Remove test on large Flax XLM-RoBERTa

* Add tokenizer to the test

Co-authored-by: Suraj Patil <surajp815@gmail.com>
2022-03-04 14:36:28 +01:00
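Hedged usage sketch for the Flax XLM-RoBERTa classes added in #15900; whether Flax weights are published for `xlm-roberta-base` is an assumption, hence the `from_pt` note.

```python
from transformers import AutoTokenizer, FlaxXLMRobertaModel

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# add from_pt=True if the checkpoint only ships PyTorch weights
model = FlaxXLMRobertaModel.from_pretrained("xlm-roberta-base")

inputs = tokenizer("Hello world", return_tensors="np")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```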
89c7d9cfba Making MaskFormerForInstanceSegmentation. (#15934)
Small adjustments.

Adding in type hint.

Last fix ?

Only include the default dict thing, not the pipelines.
2022-03-04 13:56:15 +01:00
7ade7c1794 Updating the slow tests: (#15893)
Linked to https://github.com/huggingface/transformers/pull/15826
2022-03-04 12:32:19 +01:00
6b104c5bb0 Support CLIPTokenizerFast for CLIPProcessor (#15913)
* Fix to support fast tokenizer with `CLIPProcessor`

* Update CLIPProcessor test for fast tokenizer

* Fix Docstring Style

* Rename into meaningful Variable name in test code
2022-03-04 11:57:09 +01:00
b71474895d Update README.md 2022-03-04 09:58:45 +01:00
a6e3b17981 Re-enabling all fast pipeline tests. (#15924) 2022-03-04 09:53:00 +01:00
a7df656f03 Update README.md (#15926) 2022-03-04 00:22:38 +01:00
c0281feb50 Fix #15898 (#15928) 2022-03-03 14:41:03 -05:00
9251427c38 Add vision models to doc tests (#15905)
* Add vision models to doc tests

* Apply suggestions from code review

* Add more models

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-03-03 19:46:31 +01:00
742273a52a fix for the output from post_process_panoptic_segmentation (#15916) 2022-03-03 19:35:48 +01:00
7c45fe747f Mark slow tests as slow 2022-03-03 11:03:24 -05:00
3822e4a563 Enabling MaskFormer in pipelines (#15917)
* Enabling MaskFormer in pipelines

No AutoModel though :(

* Oops, local file.
2022-03-03 16:31:41 +01:00
79d28e80b6 v4.18.0.dev.0 2022-03-03 10:19:58 -05:00
6cbfa7bf4c [Doctests] Fix ignore bug and add more doc tests (#15911)
* finish speech doc tests

* finish

* boom

* Update src/transformers/models/speech_to_text/modeling_speech_to_text.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-03-03 16:01:56 +01:00
b693cbf99c The tests were not updated after the addition of torch.diag (#15890)
in the scoring (which is more correct)
2022-03-03 15:33:49 +01:00
3c4fbc616f Freeze FlaxWav2Vec2 Feature Encoder (#15873)
* Freeze FlaxWav2Vec2 Feature Encoder

* add to all module apply

* add backprop test
2022-03-03 14:17:13 +01:00
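A hedged sketch of the freezing added in #15873: the FlaxWav2Vec2 call accepts a `freeze_feature_encoder` flag that stops gradients from flowing into the convolutional feature encoder. The checkpoint name is an assumption.

```python
import jax.numpy as jnp
from transformers import FlaxWav2Vec2Model

# from_pt=True may be needed if no Flax weights are published for this checkpoint
model = FlaxWav2Vec2Model.from_pretrained("facebook/wav2vec2-base")

input_values = jnp.zeros((1, 16000))  # one second of dummy 16 kHz audio
# with the flag set, gradients do not flow into the feature encoder during training
outputs = model(input_values, freeze_feature_encoder=True)
print(outputs.last_hidden_state.shape)
```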
7b3bd1f21a Fix and improve REALM fine-tuning (#15297)
* Draft

* Add test

* Update src/transformers/models/realm/modeling_realm.py

* Apply suggestion

* Add block_mask

* Update

* Update

* Add block_embedding_to

* Remove no_grad

* Use AutoTokenizer

* Remove model.to overridding
2022-03-03 14:10:15 +01:00
439de3f7f9 [Fix link in pipeline doc] (#15906) 2022-03-03 07:43:13 -05:00
4cd7ed4b3b Fix a TF Vision Encoder Decoder test (#15896)
* send PyTorch inputs to the correct device

* Fix: TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-03-03 13:21:31 +01:00
39249c9589 Fix doc links in release utils (#15903) 2022-03-02 18:06:31 -05:00
3d2242869d Update delete-dev-doc job to match build-dev-doc (#15891)
* Update delete-dev-doc job to match build-dev-doc

* More debug info

* More debug info

* Stash if needed

* Remove the comment update

* Fix paths

* Wtf is going on..

* Fix git status test

* Try another way

* I don't understand what's happening

* Bash shell

* What's happening now...

* What's happening now...

* Try like this

* Back to trying to use bash

* And like that?

* Refine tests

* Stash after adding new files

* Stash after adding new files

* Proper commit sha and PR number

* Address review comments
2022-03-02 16:18:54 -05:00
89be34c36c Fix SegformerForImageClassification (#15895)
* Fix reshape

* Apply suggestion from code review

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-03-02 21:57:39 +01:00
130b987880 [XGLM] run sampling test on CPU to be deterministic (#15892)
* run sampling test on CPU to be deterministic

* input_ids on CPU
2022-03-02 17:55:49 +01:00
baab5e7cdf TF generate refactor - Sample (#15793)
* Add TF logits wrappers 

* Add sample method

* add tests for TF logit wrappers

* TF generate sample tests now run on CPU

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2022-03-02 16:13:54 +00:00
96ae92be8c [SegFormer] Add deprecation warning (#15889)
* Add deprecation warning

* Remove from docs and hide in kwargs

* Improve implementation

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-03-02 16:20:47 +01:00
8fd4731072 Fix Bug in FlaxWav2Vec2 Slow Test (#15887) 2022-03-02 16:02:26 +01:00
d83d22f578 Maskformer (#15682)
* maskformer

* conflicts

* conflicts

* minor fixes

* feature extractor test fix

refactor MaskFormerLoss following conversation

MaskFormer-related types should not trigger an error at module import time

missed one

removed all the types that are not used

update config mapping

minor updates in the doc

resolved conversation that doesn't need a discussion

minor changes

resolved conversations

fixed DetrDecoder

* minor changes

minor changes

fixed mdx file

test feature_extractor return types

functional losses -> classes

removed the return type test for the feature extractor

minor changes + style + quality

* conflicts?

* rebase master

* readme

* added missing files

* deleted poolformer tests that were in the wrong place

* CI

* minor changes

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* resolved conversations

* minor changes

* conversations

* minor changes

* doc fix in feature extractor

* doc

* typos

* removed detr logic from config

* removed detr logic from config

* removed num_labels

* small fix in the config

* auxilary -> auxiliary

* make style

* some test is failing

* fix a weird char in config preventing doc-builder

* retry to fix the doc-builder issue

* make style

* new try to fix the doc builder

* CI

* change weights to facebook

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: ariG23498 <aritra.born2fly@gmail.com>
Co-authored-by: Joao Gante <joao@huggingface.co>
Co-authored-by: Sylvain Gugger <Sylvain.gugger@gmail.com>
2022-03-02 15:48:20 +01:00
e535c389aa Fix tiny typo (#15884) 2022-03-02 15:37:05 +01:00
2eb7bb15e7 Updates in Trainer to support new features in SM Model Parallel library (#15877)
* Create optimizer after model creation for SMP

* update dp_rank to rdp_rank for opt_state_dict

* update world_size and process_index for smp

* Address comments

* Lint fix

Co-authored-by: Cavdar <dcavdar@a07817b12d7e.ant.amazon.com>
2022-03-02 07:55:14 -05:00
05c237ea94 Update TF QA example (#15870) 2022-03-02 10:38:13 +00:00
6e57a56987 Adding timestamps for CTC with LM in ASR pipeline. (#15863)
* Adding timestamps for CTC with LM in ASR pipeline.

* Remove print.

* Nit change.
2022-03-02 10:49:05 +01:00
8a133490bf Add TF generate sample tests with all logit processors (#15852)
* Add GPT2 TF generate sample test with all logits processor

* Add T5 generate sample test
2022-03-02 09:48:11 +00:00
40040727ab [Bart] Fix implementation note doc (#15879) 2022-03-02 10:24:32 +01:00
4bfe75bd08 M2M100 support for ONNX export (#15193)
* Add M2M100 support for ONNX export

* Delete useless imports

* Add M2M100 to tests

* Fix protobuf issue
2022-03-02 10:03:14 +01:00
d1a29078c0 Remove stash for now (#15882) 2022-03-01 22:36:19 -05:00
b842d7277a fix deepspeed tests (#15881)
* fix deepspeed tests

* style

* more fixes
2022-03-01 19:27:28 -08:00
6ccfa2170c Inference for multilingual models (#15836)
* 📝 first draft for multilingual models

* 🖍 make style
2022-03-01 15:10:31 -06:00
26426923b7 No self-hosted runner for dev documentation (#15710) 2022-03-01 14:05:54 -05:00
00eaffc81f Bump up doc node version to 16 (#15874) 2022-03-01 18:37:57 +01:00
afca0d5192 use python 3.7 for flax self-push tests (#15865)
* set python 3.7 for flax tests

* setup-python@v2

* python-dev

* install -y

* python3-dev

* install kenlm from source

* install cython

* cd to kenlm

* kenlm install

* don't install kenlm

* change flax pretrained to run flax tests

* cleanup

* remove python-dev
2022-03-01 18:26:30 +01:00
286fdc6b3c [vision] Add problem_type support (#15851)
* Add problem_type to missing models

* Fix deit test

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-03-01 18:09:52 +01:00
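A hedged sketch of `problem_type` on a vision classifier (assuming ViT is among the models #15851 covers): with `"multi_label_classification"` the head uses BCEWithLogitsLoss and expects float multi-hot labels.

```python
import torch
from transformers import ViTConfig, ViTForImageClassification

config = ViTConfig(num_labels=3, problem_type="multi_label_classification")
model = ViTForImageClassification(config)  # randomly initialized, for illustration

pixel_values = torch.randn(1, 3, 224, 224)
labels = torch.tensor([[1.0, 0.0, 1.0]])  # multi-hot float labels for BCEWithLogitsLoss
outputs = model(pixel_values=pixel_values, labels=labels)
print(outputs.loss)
```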
7ff9d450cd Scatter should run on CUDA (#15872) 2022-03-01 11:47:17 -05:00
c008afea3c Add link to notebooks (#15791)
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-03-01 17:44:20 +01:00
e064f08150 Add time stamps for wav2vec2 with lm (#15854)
* [Wav2Vec2 With LM] add timestamps

* correct

* correct

* Apply suggestions from code review

* correct

* Update src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py

* make style

* Update src/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* make style

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-03-01 17:03:05 +01:00
3f2e636850 Update TF LM examples (#15855) 2022-03-01 14:12:58 +00:00
54f0db4066 Add PT + TF automatic builds (#15860)
* Add PT + TF automatic builds

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Wrap up

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-03-01 08:55:11 -05:00
9863f7d228 [Benchmark tools] Deprecate all (#15848)
* [Benchmark tools] Deprecate all

* up
2022-03-01 11:26:20 +01:00
df5a4094a6 Add Data2Vec (#15507)
* Add data2vec model cloned from roberta

* Add checkpoint conversion script

* Fix copies

* Update docs

* Add checkpoint conversion script

* Remove fairseq data2vec_text script and fix format

* Add comment on where to get data2vec_text.py

* Remove mock implementation cheat.py and fix style

* Fix copies

* Remove TF and Flax classes from init

* Add back copy from fairseq data2vec_text.py and fix style

* Update model name in docs/source/index.mdx to be CamelCase

* Revert model name in table to lower-case to get check_table test to pass

* Update src/transformers/models/data2vec/__init__.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update docs/source/model_doc/data2vec.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/model_doc/data2vec.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/auto/configuration_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/configuration_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/test_modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/configuration_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update documentation

* Copy-paste Data2VecConfig from BertConfig

* Update config checkpoint to point to edugp/data2vec-nlp-base. Fix style and repo-consistency

* Update config special tokens to match RoBERTa

* Split multiple assertions and add individual error messages

* Rename Data2VecModel to Data2VecForTextModel

* Add Data2Vec to _toctree.yml

* Rename Data2VecEmbeddings to Data2VecForTextEmbeddings

* Add initial Data2VecForAudio model (unfinished). Only matching fairseq's implementation up to the feature encoder (before positional encoding).

* finish audio model

* finish audio file

* Update names and fix style, quality and repo consistency

* Remove Data2VecAudioForPretraining. Add tests for Data2VecAudio, mimicking the Wav2Vec2 test suite. Fix bias initialization in positional conv layers. Move back configurations for audio and text to separate files.

* add inputs to logits to data2vec

* correct audio models

* correct config auto

* correct tok auto

* Update utils/tests_fetcher.py

* delete unnecessary files

* delete unnecessary files

* further renaming

* make all tests pass

* finish

* remove useless test file

* Update tests/test_modeling_common.py

* Update utils/check_repo.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec_text.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Fix copies

* Update docs

* Remove fairseq data2vec_text script and fix format

* Add comment on where to get data2vec_text.py

* Remove mock implementation cheat.py and fix style

* Fix copies

* Remove TF and Flax classes from init

* Add back copy from fairseq data2vec_text.py and fix style

* Update model name in docs/source/index.mdx to be CamelCase

* Revert model name in table to lower-case to get check_table test to pass

* Update documentation

* Update src/transformers/models/data2vec/__init__.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/convert_data2vec_original_pytorch_checkpoint_to_pytorch.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/auto/configuration_auto.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/configuration_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/test_modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/configuration_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/models/data2vec/modeling_data2vec.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Copy-paste Data2VecConfig from BertConfig

* Update config checkpoint to point to edugp/data2vec-nlp-base. Fix style and repo-consistency

* Update config special tokens to match RoBERTa

* Split multiple assertions and add individual error messages

* Rename Data2VecModel to Data2VecForTextModel

* Add Data2Vec to _toctree.yml

* Rename Data2VecEmbeddings to Data2VecForTextEmbeddings

* Add initial Data2VecForAudio model (unfinished). Only matching fairseq's implementation up to the feature encoder (before positional encoding).

* finish audio model

* finish audio file

* add inputs to logits to data2vec

* Update names and fix style, quality and repo consistency

* Remove Data2VecAudioForPretraining. Add tests for Data2VecAudio, mimicking the Wav2Vec2 test suite. Fix bias initialization in positional conv layers. Move back configurations for audio and text to separate files.

* correct audio models

* correct config auto

* correct tok auto

* delete unnecessary files

* delete unnecessary files

* Update utils/tests_fetcher.py

* further renaming

* make all tests pass

* finish

* remove useless test file

* Update tests/test_modeling_common.py

* Update utils/check_repo.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/models/data2vec/modeling_data2vec_text.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Move data2vec tests to new structure

* Fix test imports for text tests

* Remove fairseq files

* Change paper link to arxiv

* Modify Data2Vec documentation to reflect that the encoder is not shared across the audio and text models in the current implementation.

* Update text model checkpoint to be facebook/data2vec-text-base

* Add 'Copy from' statements and update paper links and docs

* fix copy from statements

* improve copied from

* correct more copied from statements

* finish copied from stuff

* make style

* add model to README

* add to master

Co-authored-by: Eduardo Gonzalez Ponferrada <eduardo@ferrumhealth.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-03-01 11:09:20 +01:00
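Hedged usage sketch for the Data2Vec text model added in #15507, using the `facebook/data2vec-text-base` checkpoint named in the commit message.

```python
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
model = AutoModel.from_pretrained("facebook/data2vec-text-base")

inputs = tokenizer("data2vec learns from text, audio and images.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```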
ddbb485c41 [TF-PT-Tests] Fix PyTorch - TF tests for different GPU devices (#15846) 2022-02-28 15:46:46 -05:00
97f9b8a27b Fixing the timestamps with chunking. (#15843)
* Fixing the timestamps with chunking.

* The changes modified (and fixed) the striding tests.

* Adding a tokenizer test.

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Defense -> comment.

* Update src/transformers/models/wav2vec2/tokenization_wav2vec2.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-28 21:00:21 +01:00
410e26c7ad Fix (deprecated) ONNX exporter to account for new tf2onnx API (#15856)
* Fix (deprecated) ONNX exporter to account for new tf2onnx API
2022-02-28 20:17:44 +01:00
e3342edc4e Flax Speech-Encoder-Decoder Model (#15613)
* rebase

* Delete shift tokens func

* downsample decoder input seq len for init

* correct attention mask

* add tests

* pt flax cross test

* make fixup

* init file for import

* change pt-flax cross test threshold

* pt-flax test logits only

* move tests

* make repo-consistency

* consistent indentation

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-28 12:22:36 +01:00
935a76d90d [UniSpeechSat] correct unispeech sat (#15847) 2022-02-28 11:23:13 +01:00
84eaa6acf5 Add TFConvNextModel (#15750)
* feat: initial implementation of convnext in tensorflow.

* fix: sample code for the classification model.

* chore: added check for  from the classification model.

* chore: set bias initializer in the classification head.

* chore: updated license terms.

* chore: removed unused imports

* feat: enabled  argument when using drop_path.

* chore: replaced tf.identity with layers.Activation(linear).

* chore: edited default checkpoint.

* fix: minor bugs in the initializations.

* partial-fix: tf model errors for loading pretrained pt weights.

* partial-fix: call method updated

* partial-fix: cross loading of weights (4x3 variables to be matched)

* chore: removed unneeded comment.

* removed playground.py

* rebasing

* rebasing and removing playground.py.

* fix: renaming TFConvNextStage conv and layer norm layers

* chore: added initializers and other minor additions.

* chore: added initializers and other minor additions.

* add: tests for convnext.

* fix: integration tester class.

* fix: issues mentioned in pr feedback (round 1).

* fix: how output_hidden_states arg is propagated inside the network.

* feat: handling of  arg for pure cnn models.

* chore: added a note on equal contribution in model docs.

* rebasing

* rebasing and removing playground.py.

* feat: encapsulation for the convnext trunk.

* Fix variable naming; Test-related corrections; Run make fixup

* chore: added Joao as a contributor to convnext.

* rebasing

* rebasing and removing playground.py.

* rebasing

* rebasing and removing playground.py.

* chore: corrected copyright year and added comment on NHWC.

* chore: fixed the black version and ran formatting.

* chore: ran make style.

* chore: removed from_pt argument from test, ran make style.

* rebasing

* rebasing and removing playground.py.

* rebasing

* rebasing and removing playground.py.

* fix: tests in the convnext subclass, ran make style.

* rebasing

* rebasing and removing playground.py.

* rebasing

* rebasing and removing playground.py.

* chore: moved convnext test to the correct location

* fix: locations for the test file of convnext.

* fix: convnext tests.

* chore: applied  sgugger's suggestion for dealing w/ output_attentions.

* chore: added comments.

* chore: applied updated quality environment style.

* chore: applied formatting with quality environment.

* chore: revert to the previous tests/test_modeling_common.py.

* chore: revert to the original test_modeling_common.py

* chore: revert to previous states for test_modeling_tf_common.py and modeling_tf_utils.py

* fix: tests for convnext.

* chore: removed output_attentions argument from convnext config.

* chore: revert to the earlier tf utils.

* fix: output shapes of the hidden states

* chore: removed unnecessary comment

* chore: reverting to the right test_modeling_tf_common.py.

* Styling nits

Co-authored-by: ariG23498 <aritra.born2fly@gmail.com>
Co-authored-by: Joao Gante <joao@huggingface.co>
Co-authored-by: Sylvain Gugger <Sylvain.gugger@gmail.com>
2022-02-25 18:19:16 +01:00
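Hedged usage sketch for the TF ConvNeXt port added in #15750; the checkpoint name is an assumption based on the PyTorch ConvNeXt releases.

```python
import tensorflow as tf
from transformers import TFConvNextModel

# from_pt=True may be needed if TF weights are not yet published for this checkpoint
model = TFConvNextModel.from_pretrained("facebook/convnext-tiny-224")

pixel_values = tf.random.normal((1, 3, 224, 224))  # channels-first, handled internally (see NHWC note)
outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)
```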
0b5bf6abef Framework split model report (#15825) 2022-02-25 12:00:00 -05:00
0118c4f6a8 Re-enable doctests for the quicktour (#15828)
* Re-enable doctests for the quicktour

* Re-enable doctests for task_summary (#15830)

* Remove &
2022-02-25 17:46:38 +01:00
fd5b05eb81 Add ONNX Runtime quantization for text classification notebook (#15817) 2022-02-25 11:29:35 -05:00
bf1fe32824 [examples/summarization and translation] fix readme (#15833) 2022-02-25 17:28:16 +01:00
8635407bc7 Fix tf.concatenate + test past_key_values for TF models (#15774)
* fix wrong method name tf.concatenate

* add tests related to causal LM / decoder

* make style and quality

* clean-up

* Fix TFBertModel's extended_attention_mask when past_key_values is provided

* Fix tests

* fix copies

* More tf.int8 -> tf.int32 in TF test template

* clean-up

* Update TF test template

* revert the previous commit + update the TF test template

* Fix TF template extended_attention_mask when past_key_values is provided

* Fix some styles manually

* clean-up

* Fix ValueError: too many values to unpack in the test

* Fix more: too many values to unpack in the test

* Add a comment for extended_attention_mask when there is past_key_values

* Fix TFElectra extended_attention_mask when past_key_values is provided

* Add tests to other TF models

* Fix for TF Electra test: add prepare_config_and_inputs_for_decoder

* Fix not passing training arg to lm_head in TFRobertaForCausalLM

* Fix tests (with past) for TF Roberta

* add testing for past_key_values for TFElectra model

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-25 17:11:46 +01:00
4818bf7aed HFTracer.trace should use/return self.graph to be compatible with torch.fx.Tracer (#15824) 2022-02-25 15:54:45 +01:00
ad0d7d1745 Adding the option to return_timestamps on pure CTC ASR models. (#15792)
* Adding the option to return_timestamps on pure CTC ASR models.

* Remove `math.prod` which was introduced in Python 3.8

* int are not floats.

* Reworking the PR to support "char" vs "word" output.

* Fixup!

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update src/transformers/pipelines/automatic_speech_recognition.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Quality.

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-25 14:06:45 +01:00
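A hedged sketch of the feature from #15792: asking a pure-CTC ASR pipeline for timestamps alongside the transcription; per the commit, "char" and "word" granularities are supported. The audio path is hypothetical.

```python
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

# "word" groups characters into words; "char" returns per-character offsets
result = asr("sample.flac", return_timestamps="word")  # hypothetical audio file
print(result["text"])
for chunk in result.get("chunks", []):
    print(chunk["text"], chunk["timestamp"])
```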
7566734d6f Add model specific output classes to PoolFormer model docs (#15746)
* Added model specific output classes to poolformer docs

* Fixed Segformer typo in Poolformer docs
2022-02-25 13:43:56 +01:00
7963578fc5 Fix dummy_inputs() to dummy_inputs in symbolic_trace doc (#15776) 2022-02-25 11:32:23 +01:00
074645e32a Fix semantic segmentation pipeline test (#15826) 2022-02-25 09:21:29 +01:00
b7e292aebd Fix the push run (#15807) 2022-02-24 19:30:17 +01:00
cbf4391177 [TFXLNet] Correct tf xlnet generate (#15822)
* [TFXLNet] Correct tf xlnet

* adapt test comment
2022-02-24 19:23:34 +01:00
2f0f9038e2 [Barthez Tokenizer] Fix saving (#15815) 2022-02-24 19:09:09 +01:00
ca57b45071 [Unispeech] Fix slow tests (#15818)
* remove soundfile old way of loading audio

* Adapt slow test
2022-02-24 19:08:54 +01:00
35ecf99cc4 Revert changes in logit size for semantic segmentation models (#15722)
* Revert changes in logit size for semantic segmentation models

* Address review comments
2022-02-24 15:52:52 +01:00
d1fcc90abf Fix from_pretrained with default base_model_prefix (#15814) 2022-02-24 11:43:51 +01:00
7f921bcf47 Fix add-new-model-like when old model checkpoint is not found (#15805)
* Fix add-new-model-like command when old checkpoint can't be recovered

* Style
2022-02-24 08:58:18 +01:00
bb7949b35a Fix model templates (#15806)
* Fix model templates

* Update paths
2022-02-23 18:27:29 -05:00
309e87e25e Docker images should only run on a daily basis 2022-02-23 18:01:44 -05:00
c475f3ce2d Scheduled tests should only run on a daily basis 2022-02-23 17:52:22 -05:00
6336017c15 Fix build_documentation CI (#15803) 2022-02-23 21:53:51 +01:00
a0e3480699 [Test refactor 5/5] Build docker images (#15729) 2022-02-23 15:48:19 -05:00
4c737f0e40 [Test refactor 4/5] Improve the scheduled tests (#15728) 2022-02-23 15:48:05 -05:00
d3ae2bd3cf [Test refactor 3/5] Notification service improvement (#15727)
* Per-folder tests reorganization

* Review comments

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
Co-authored-by: Stas Bekman <stas@stason.org>
2022-02-23 15:46:59 -05:00
0400b2263d [Test refactor 2/5] Tests fetcher (#15726)
* Tests fetcher

* Review comments

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
Review comments
2022-02-23 15:46:37 -05:00
29c10a41d0 [Test refactor 1/5] Per-folder tests reorganization (#15725)
* Per-folder tests reorganization

Co-authored-by: sgugger <sylvain.gugger@gmail.com>
Co-authored-by: Stas Bekman <stas@stason.org>
2022-02-23 15:46:28 -05:00
fecb08c2b8 🧼 NLP task guides (#15731)
* clean commit of changes to NLP tasks

* 🖍 apply feedback

* 📝 move tf data collator in multiple choice

Co-authored-by: Steven <stevhliu@gmail.com>
2022-02-23 13:58:33 -06:00
86636f52a9 Fix indent in doc-builder CI (#15798) 2022-02-23 20:01:33 +01:00
a1efc82362 HTML dev docs (#15678)
Co-authored-by: Pierric Cistac <Pierrci@users.noreply.github.com>
2022-02-23 19:43:22 +01:00
3f76bf54ff Align documentation with code defaults (#15468)
In the code, `do_normalize` defaults to True
2022-02-23 18:39:41 +01:00
32f5de10a0 [doc] custom_models: mention security features of the Hub (#15768)
* custom_models: tiny doc addition

* mention security feature earlier in the section

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2022-02-23 11:40:06 -05:00
9e71d46455 Enable image-segmentation on AutoModelForSemanticSegmentation (#15647)
* Enabling Beit and SegFormer in `image-segmentation`.

* Fixing the score.

* Fix import ?

* Missing in type hint.

* Multiple test fixes:

- Add `raw_image` support. It should be the default IMHO since in Python
  world it doesn't make any sense to base64 encode the image (Sorry
  @mishig, didn't catch that in my review). I really think we should
  consider breaking BC here.
- Add support for Segformer tiny test (needed
  `SegformerModelTester.get_config` to enable TinyConfig
  @NielsRogge)
- Add the check that `batch_size` works correctly on that pipeline.
  Uncovered that it doesn't for Detr, which IMO is OK since images
  after `feature_extractor` don't have the same size. Comment should
  explain.

* Type hint as a string.

* Make fixup + update black.

* torch+vision protections.

* Don't use torchvision, use F.interpolate instead (no new dep).

* Last fixes for Segformer.

* Update test to reflect new image (which was broken)

* Update tests.

* Major BC modification:

- Removed the string compressed PNG string, that's a job for users
`transformers` stays in python land.
- Removed the `score` for semantic segmentation. It has hardly a meaning
  on its own in this context.
- Don't include the grayscale with logits for now (which could enable
  users to get a sense of confidence). Might be done later.
- Don't include the surface of the mask (could be used for sorting by
  users, to filter out small masks). It's already calculable, and
  it's easier to add later, than to add now and break later if we need.

* `make fixup`.

* Small changes.

* Rebase + doc fixup.
2022-02-23 17:20:26 +01:00
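A hedged sketch of the `image-segmentation` pipeline with a SegFormer checkpoint, as enabled in #15647; the checkpoint and image path are assumptions.

```python
from transformers import pipeline

segmenter = pipeline("image-segmentation", model="nvidia/segformer-b0-finetuned-ade-512-512")
results = segmenter("scene.jpg")  # hypothetical local image path
for prediction in results:
    # after this PR each prediction carries a class label plus a binary mask
    print(prediction["label"])
```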
1b23979736 [ViLT] Fix checkpoint url in config (#15790)
* [ViLT] Fix checkpoint url in config

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2022-02-23 14:51:40 +01:00
de737866f2 [CLIP] fix grad ckpt (#15789) 2022-02-23 14:30:05 +01:00
a3e607d19e Supporting Merges.txt files that contain an endline. (#15782)
(`hf-internal-testing/tiny-clip` for instance)
2022-02-23 11:51:48 +01:00
24588c6731 [M2M100, XGLM] fix create_position_ids_from_inputs_embeds (#15751) 2022-02-23 10:46:42 +01:00
f9582c205a Adding ZeroShotImageClassificationPipeline (#12119)
* [Proposal] Adding ZeroShotImageClassificationPipeline

- Based on CLIP

* WIP, Resurrection in progress.

* Resurrection... achieved.

* Reword handling different `padding_value` for `feature_extractor` and
`tokenizer`.

* Thanks doc-builder !

* Adding docs + global namespace `ZeroShotImageClassificationPipeline`.

* Fixing templates.

* Make the test pass and be robust to floating error.

* Addressing suraj's comments on docs mostly.

* Tf support start.

* TF support.

* Update src/transformers/pipelines/zero_shot_image_classification.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

Co-authored-by: Suraj Patil <surajp815@gmail.com>
2022-02-23 09:41:42 +01:00
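Hedged usage sketch for the CLIP-based zero-shot image classification pipeline added in #12119; the image path is hypothetical.

```python
from transformers import pipeline

classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
predictions = classifier(
    "cat.jpg",  # hypothetical local image path
    candidate_labels=["a photo of a cat", "a photo of a dog"],
)
print(predictions)  # list of {"label": ..., "score": ...} dicts
```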
05a12a090d Fix HfArgumentParser when passing a generator (#15758)
* Fix `HfArgumentParser` when passing a generator

* Add missing import

* Always convert `dataclass_types` into a list
2022-02-23 00:16:38 +01:00
db57bb2b71 Cleanup transformers-cli (#15767) 2022-02-22 15:58:05 -05:00
3db2e8f92b Fix typo on examples/pytorch/question-answering (#15644)
cna -> can
2022-02-22 13:51:07 -05:00
2cdb6dbee5 fixed pipeline code (#15607)
Co-authored-by: Boumadane Abdelmoumene <moumene.boumadane@gmail.com>
2022-02-22 13:46:21 -05:00
c44d3675c2 Time stamps for CTC models (#15687)
* [Wav2Vec2 Time Stamps]

* Add first version

* add word time stamps

* Fix

* save intermediate space

* improve

* [Finish CTC Tokenizer]

* remove @

* remove @

* push

* continue with phonemes

* up

* finish PR

* up

* add example

* rename

* finish

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* correct split

* finalize

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-22 19:26:44 +01:00
32295b15a1 Gelu10 (#15676)
* Add GeLU10 (clipped version of GeLU) to transformers to improve quantization performance.

* Add unittests.

* Import tensorflow after `is_tf_available` check.

* Fix tensorflow wrong function `tf.tensor` to `tf.constant`

* style.

* use `tf.math.max`

* Fix tf tests.

* style.

* style style style style style style

* style style style style style style

* Address @sgugger comments.

* Fix wrong operator for raising ValueError for ClippedGELUActivation.
2022-02-22 18:21:16 +01:00
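A minimal PyTorch sketch of the idea in #15676, not the library's exact implementation: a standard GELU whose output is clamped to a fixed range (here [-10, 10]) so that activations quantize better.

```python
import torch
import torch.nn.functional as F

def gelu_10(x: torch.Tensor) -> torch.Tensor:
    # standard GELU with outputs clamped to [-10, 10]
    return torch.clamp(F.gelu(x), min=-10.0, max=10.0)

print(gelu_10(torch.tensor([-20.0, 0.0, 20.0])))  # approximately tensor([0., 0., 10.])
```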
2c3fcc647a TF train_step docstring (#15755)
* TF train_step docstring
2022-02-22 11:18:35 +00:00
38bed912e3 added link to our writing-doc document (#15756) 2022-02-22 09:57:28 +01:00
0187c6f0ad revert temporary addition to test next version of CLIPTokenizerFast (#15717) 2022-02-21 18:30:11 +01:00
3956b133b6 TF text classification examples (#15704)
* Working example with to_tf_dataset

* updated text_classification

* more comments
2022-02-21 17:17:59 +00:00
142b69f24b Add layer_idx to CrossAttention of GPT2 model (#15730)
* Add layer_idx to CrossAttention

* Add layer_idx to crossattention of ImageGPT model
2022-02-21 17:31:39 +01:00
86119c1154 add VisionTextDualEncoder and CLIP fine-tuning script (#15701)
* begin script

* update script

* fix features and data args

* main

* add requirements

* add column name args

* fix captions

* don't jit transforms

* fix caption

* fix labels, handle attention mask

* convert pixel values to numpy

* labels => input_ids

* transform images on the fly

* use AutoModel class, create the hybrid model outside of the script

* fix version message

* add readme

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* address review comments

* add more comments

* allow freezing vision and text models

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-21 16:10:59 +01:00
5444687f0f Fix minor comment typos (#15740) 2022-02-21 12:41:27 +01:00
a63bd3675f Remove input and target reset after preprocessing (#15741)
Remove input and target reset after preprocessing
2022-02-21 11:10:15 +01:00
2c2a31ffbc Add missing PLBart entry in README (#15721)
* Add missing PLBart entry in index

* Fix README

* Fix README

* Fix style

* Change to master model doc
2022-02-18 21:11:42 +01:00
60ba48205e fix bug in PT speech-encoder-decoder (#15699)
* fix bug in PT speech-encoder-decoder

* add pt test for `inputs is not None`

* fix test

* new pt test

* Update tests/test_modeling_speech_encoder_decoder.py

* make fixup

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-18 18:20:24 +01:00
3de12906c8 fix: hfdeepspeed config argument (#15711)
`HfDeepSpeedConfig` accepts a dictionary or path to `.json` file containing DS configurations, not `TrainingArguments`.
2022-02-18 12:00:02 -05:00
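A hedged sketch of the corrected usage described in #15711: pass `HfDeepSpeedConfig` a dict (or a path to a `.json` file), not a `TrainingArguments` object. The config values are illustrative.

```python
from transformers.deepspeed import HfDeepSpeedConfig  # requires the deepspeed package

# a plain dict works; a path to a .json file with the same content works too
ds_config = {"train_micro_batch_size_per_gpu": 1, "zero_optimization": {"stage": 3}}
dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive before instantiating the model
```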
83f45cd656 Fix auto (#15706) 2022-02-18 08:50:23 -05:00
d5083c333f style_doc handles decorators in examples (#15719) 2022-02-18 14:49:53 +01:00
ae1f835028 Add PLBart (#13269)
* Init PLBART

* Add missing configuration file

* Add conversion script and configuration file

* Fix style

* Update modeling and conversion scripts

* Fix scale embedding in config

* Add comment

* Fix conversion script

* Add classification option to conversion script

* Fix vocab size in config doc

* Add tokenizer files from MBart50

* Allow no lang code in regular tokenizer

* Add PLBart Tokenizer Converters

* Remove mask from multi tokenizer

* Remove mask from multi tokenizer

* Change from MBart-50 to MBart tokenizer

* Fix names and modify src/tgt behavior

* Fix imports for tokenizer

* Remove <mask> from multi tokenizer

* Fix style

* Change tokenizer_class to processor_class

* Add attribute map to config class

* Update modeling file to modified MBart code

* Update configuration file to MBart style configuration

* Fix tokenizer

* Separate tokenizers

* Fix error in tokenization auto

* Copy MBart tests

* Replace with MBart tokenization tests

* Fix style

* Fix language code in multi tokenizer

* Fix configuration docs

* Add entry for plbart_multi in transformers init

* Add dummy objects and fix imports

* Fix modeling tests

* Add TODO in config

* Fix copyright year

* Fix modeling docs and test

* Fix some tokenization tests and style

* Add changes from review

* Fix copies

* Fix docs

* Fix docs

* Fix style

* Fix year

* Add changes from review

* Remove extra changes

* Fix base tokenizer and doc

* Fix style

* Fix modeling and slow tokenizer tests

* Remove Multi-tokenizer Converter and Tests

* Delete QA model and Multi Tokenizer dummy objects

* Fix repo consistency and code quality issues

* Fix example documentation

* Fix style

* Remove PLBartTokenizer from type checking in init

* Fix consistency issue

* Add changes from review

* Fix style

* Remove PLBartTokenizerFast

* Remove FastTokenizer converter

* Fix AutoTokenzier mapping

* Add plbart to toctree and fix consistency issues

* Add language codes tokenizer test

* Fix styling and doc issues

* Add fixes for failing tests

* Fix copies

* Fix failing modeling test

* Change assert to assertTrue in modeling tests
2022-02-18 14:17:09 +01:00
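Hedged usage sketch for PLBart (#13269); the checkpoint name is an assumption based on the released PLBart weights, and the input snippet is arbitrary.

```python
from transformers import PLBartForConditionalGeneration, PLBartTokenizer

tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base")
model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")

inputs = tokenizer("def add(a, b): return a + b", return_tensors="pt")
generated = model.generate(**inputs, max_length=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))
```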
2f2fefd6af Fix LongformerModel hidden states (#15537)
* add undo padding

* fix

* fix tuple issue

* make style and quality

* move unpad logic to LongformerEncoder + unpad attentions + update tests

* move unpad logic to TFLongformerEncoder

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-18 13:56:53 +01:00
68dec6bffd Fix DETR model deprecation warnings for int div (#15702) 2022-02-18 15:14:44 +03:00
f8ff3fad87 TF: add initializer_std with a small value in TFFunnelModelTester (#15684) 2022-02-18 11:20:07 +00:00
416dff736c Fix SiluActivation (#15718) 2022-02-18 11:57:39 +01:00
e93763d420 fix CLIP fast tokenizer and change some properties of the slow version (#15067)
Very big changes to the CLIP fast tokenizer, which did not match the CLIP slow tokenizer

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-18 10:21:30 +01:00
240cc6cbdc Adding a model, more doc for pushing to the hub (#15690)
* doc for adding a model to the hub

* run make style

* resolved conversation

* removed a line

* removed )

* Update docs/source/add_new_model.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/add_new_model.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* make style

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-18 09:11:18 +01:00
57882177be Add SimMIM (#15586)
* Add first draft

* Make model importable

* Make SwinForMaskedImageModeling importable

* Fix imports

* Add missing inits

* Add support for Swin

* Fix bug

* Fix bug

* Fix another bug

* Fix Swin MIM implementation

* Fix default encoder stride

* Fix Swin

* Add print statements for debugging

* Add image_size data argument

* Fix Swin

* Fix image_size

* Add print statements for debugging

* Fix print statement

* Remove print statements

* Improve reshaping of bool_masked_pos

* Add support for DeiT, fix tests

* Improve docstrings

* Apply new black version

* Improve script

* Fix bug

* Improve README

* Apply suggestions from code review

* Remove DS_Store and add to gitignore

* Apply suggestions from code review + fix BEiT Flax

* Revert BEiT changes

* Improve README

* Fix code quality

* Improve README

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MBP.localdomain>
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-02-17 19:44:55 +01:00
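A hedged sketch of `SwinForMaskedImageModeling` from #15586: the model takes pixel values plus a boolean patch mask (`bool_masked_pos`) and reconstructs the masked patches, SimMIM-style. A randomly initialized config is used for illustration.

```python
import torch
from transformers import SwinConfig, SwinForMaskedImageModeling

config = SwinConfig()  # image_size=224, patch_size=4 by default
model = SwinForMaskedImageModeling(config)

pixel_values = torch.randn(1, 3, 224, 224)
num_patches = (config.image_size // config.patch_size) ** 2
bool_masked_pos = torch.randint(0, 2, (1, num_patches)).bool()  # which patches to mask

outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
print(outputs.loss)  # reconstruction loss over the masked patches
```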
426b96230a Fix shapes in model docstrings (#15696) 2022-02-17 08:42:14 -05:00
92a537d938 Minor fix on README.md (#15688)
* fix README

* fix more arxiv links

* make fix-copies

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-17 08:38:32 -05:00
f84e0dbd2a Add PoolFormer (#15531)
* Added all files, PoolFormerFeatureExtractor still failing tests

* Fixed PoolFormerFeatureExtractor not being able to import

* Completed Poolformer doc

* Applied Suggested fixes

* Fixed errors in modeling_auto.py

* Fix feature extractor, convert docs to Markdown, styling of code

* Remove PoolFormer from check_repo and fix integration test

* Remove Poolformer from check_repo

* Fixed configuration_poolformer.py docs and removed inference.py from poolformer

* Ran with black v22

* Added PoolFormer to _toctree.yml

* Updated poolformer doc

* Applied suggested fixes and added on README.md

* Did make fixup and make fix-copies, tests should pass now

* Changed PoolFormer weights conversion script name and fixed README

* Applied fixes in test_modeling_poolformer.py and modeling_poolformer.py

* Added PoolFormerFeatureExtractor to AutoFeatureExtractor API

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MBP.localdomain>
2022-02-17 13:16:37 +01:00
0e91f885c3 Add image classification notebook (#15667)
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MacBook-Pro.local>
2022-02-17 13:14:01 +01:00
f65fe3663a Implementation of activations as pytorch modules (#15616)
* Implement activations as pytorch modules

* Apply fixup

* Add missing tests for activations

* Update docstring

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-16 14:37:52 -05:00
66828a19b1 Fix Funnel configuration doc (#15686)
* fix doc

* make style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-16 11:50:36 -05:00
3a4376d008 [Wav2Vec2ProcessorWithLM] Fix auto processor with lm (#15683) 2022-02-16 17:33:33 +01:00
cdc51ffd27 Add register method to AutoProcessor (#15669)
* Add push_to_hub method to processors

* Fix test

* The other one too!

* Add register method to AutoProcessor

* Update src/transformers/models/auto/processing_auto.py

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2022-02-16 09:13:33 -05:00
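
A minimal sketch of the register API added in #15669, assuming a hypothetical `CustomConfig`/`CustomProcessor` pair (the class attributes shown are illustrative, not part of the PR text):

```
from transformers import AutoConfig, AutoProcessor, PretrainedConfig, ProcessorMixin

class CustomConfig(PretrainedConfig):
    # hypothetical config class; `model_type` keys the auto-mapping
    model_type = "custom"

class CustomProcessor(ProcessorMixin):
    # hypothetical processor bundling a feature extractor and a tokenizer
    feature_extractor_class = "AutoFeatureExtractor"
    tokenizer_class = "AutoTokenizer"

# Register the pair so AutoProcessor.from_pretrained can resolve it
AutoConfig.register("custom", CustomConfig)
AutoProcessor.register(CustomConfig, CustomProcessor)
```
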
bc3379e12c 🔥 Remove build_doc_test github action (#15680) 2022-02-16 14:06:26 +01:00
d4692ad161 Fix dec_attn_mask in TFTransfoXLMainLayer (#15665)
* fix attn

* clean-up

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-16 11:53:26 +00:00
b87c044c79 Usage examples for logger (#15657)
* logger

* Update docs/source/main_classes/logging.mdx

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Update docs/source/main_classes/logging.mdx

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
2022-02-16 10:15:13 +01:00
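
The usage pattern documented in #15657 looks roughly like this (verbosity levels follow the standard `logging` hierarchy):

```
from transformers.utils import logging

logging.set_verbosity_info()  # show INFO and above for all transformers modules
logger = logging.get_logger("transformers")
logger.info("This INFO message is now visible")
logger.warning("Warnings are shown at this verbosity too")
```
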
2d02f7b29b Add push_to_hub method to processors (#15668)
* Add push_to_hub method to processors

* Fix test

* The other one too!
2022-02-15 21:14:04 -05:00
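
In practice the new method mirrors the one on models and tokenizers; a sketch (the repo name is illustrative):

```
from transformers import Wav2Vec2Processor

processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# Uploads the feature extractor and tokenizer files to the Hub
processor.push_to_hub("my-username/my-wav2vec2")
```
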
bee361c6f1 [t5/t0/mt5 models] faster/leaner custom layer norm (#14656)
* [t5] faster/leaner custom layer norm

* wip

* apex.normalization.FusedRMSNorm

* cleanup

* cleanup

* add doc

* add catch all

* Trigger CI

* expand
2022-02-15 16:49:57 -08:00
e3d1a8dabc Add a missing space in a deprecation message (#15651) 2022-02-15 19:12:30 -05:00
1ddf3c2b74 Fix vit test (#15671) 2022-02-15 18:55:38 -05:00
943e2aa036 Fix model equivalence tests (#15670)
* Fix model equivalence tests

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-15 18:55:22 -05:00
1690319217 Fix TFSequenceSummary's activation (#15643)
* fix TFSequenceSummary

* fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-15 19:15:42 +00:00
faf4ff5974 [pipeline doc] fix api (#15660)
* [pipeline doc] fix api

* remove duplicate
2022-02-15 10:13:08 -08:00
2e12b907ae TF generate refactor - Greedy Search (#15562)
* TF generate start refactor

* Add tf tests for sample generate

* re-organize

* boom boom

* Apply suggestions from code review

* re-add

* add all code

* make random greedy pass

* make encoder-decoder random work

* further improvements

* delete bogus file

* make gpt2 and t5 tests work

* finish logits tests

* correct logits processors

* correct past / encoder_outputs drama

* refactor some methods

* another fix

* refactor shape_list

* fix more shape list

* import shape_list

* finish docs

* fix imports

* make style

* correct tf utils

* Fix TFRag as well

* Apply Lysandre's and Sylvais suggestions

* Update tests/test_generation_tf_logits_process.py

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

* Update src/transformers/tf_utils.py

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

* remove cpu according to gante

* correct logit processor

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2022-02-15 17:54:43 +01:00
a3dbbc3467 Add decoder_kwargs to send to LM on asr pipeline. (#15646)
Co-authored-by: Giuseppe Attanasio <giuseppeattanasio6@gmail.com>

Co-authored-by: Giuseppe Attanasio <giuseppeattanasio6@gmail.com>
2022-02-15 17:53:24 +01:00
cdf19c501d Re-export KeyDataset. (#15645)
* Re-export `KeyDataset`.

* Update the docs locations.
2022-02-15 17:49:38 +01:00
28e6155d8a add a network debug script and document it (#15652)
* add a network debug script and document it

* doc
2022-02-15 08:48:00 -08:00
5d8be090e0 Fix quality 2022-02-15 11:32:26 -05:00
f45ac11fb3 Add section about doc testing (#15659)
* Add doctesting section

* Improve

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-15 16:56:31 +01:00
80f1a59168 updated with latest PL and Ray (#15653) 2022-02-15 16:53:05 +01:00
7bc4a01cb5 Update bad_words_ids usage (#15641)
* Improve the `bad_words_ids` parameter usage

* Update the bad_words_ids strategy
2022-02-15 16:44:34 +01:00
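
The recommended pattern after this change, sketched below; `add_prefix_space=True` matters for BPE tokenizers such as GPT-2's, so the banned ids also match mid-sentence occurrences (the banned words here are just placeholders):

```
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Encode banned words without special tokens
bad_words_ids = tokenizer(
    ["offensive", "banned"], add_prefix_space=True, add_special_tokens=False
).input_ids

inputs = tokenizer("The review said the movie was", return_tensors="pt")
outputs = model.generate(**inputs, bad_words_ids=bad_words_ids, max_length=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
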
67047b86ce add scores to Wav2Vec2WithLMOutput (#15413)
* add scores to Wav2Vec2WithLMOutput

* style fixup
2022-02-15 16:40:50 +01:00
45f56580a7 Allow custom code for Processors (#15649)
* Allow custom code for Processors

* Add more test

* Test all auto_map configs are properly set
2022-02-15 09:44:35 -05:00
86a7845c0c Fix typo in speech2text2 doc (#15617)
Forward looks for inputs, not input_ids
2022-02-15 13:54:34 +01:00
9eb7e9ba1d Fix ASR pipelines from local directories with wav2vec models that have language models attached (#15590)
* Fix loading pipelines with wav2vec models with lm when in local paths

* Adding tests

* Fix test

* Adding tests

* Flake8 fixes

* Removing conflict files :(

* Adding task type to test

* Remove unnecessary test and imports
2022-02-15 13:45:08 +01:00
e1cbc073bf Require tokenizers>=0.11.1 (#15266)
Requires a `tokenizers` version that supports choosing the truncation direction
2022-02-15 11:46:12 +01:00
05a8580964 Revert "logger doc"
This reverts commit 41168a49ce61685ac5c9c38cd5b88fd883c0d811.
2022-02-15 10:46:45 +01:00
41168a49ce logger doc 2022-02-15 10:03:28 +01:00
041fdc4a7e [SpeechEncoderDecoder] Make sure no EOS is generated in test (#15655) 2022-02-15 09:13:55 +01:00
e314c19a3f fix bug for the log of RNG states are not properly loaded exception. (#15638)
Co-authored-by: muz <muzhi1991@limuzhideMBP-2.lan>
2022-02-14 20:30:55 -05:00
2e11a04337 Register feature extractor (#15634)
* Rework AutoFeatureExtractor.from_pretrained internal

* Custom feature extractor

* Add more tests

* Add support for custom feature extractor code

* Clean up

* Add register API to AutoFeatureExtractor
2022-02-14 13:35:16 -05:00
0f71c29053 Remove redundant error logging in from_pretrained() method (#15631)
* Remove error logging in from_pretrained() method
2022-02-14 18:03:07 +01:00
b090b79022 Make Swin work with VisionEncoderDecoderModel (#15527)
* Add attribute_map

* Add mention in docs

* Set hidden_size attribute correctly

* Add note about Transformer-based models only

Co-authored-by: Niels Rogge <nielsrogge@Nielss-MBP.localdomain>
2022-02-14 17:33:35 +01:00
ec15da2445 Report only the failed imports in requires_backends (#15636) 2022-02-14 10:35:20 -05:00
2b8599b2df Fix a bug that ignores max_seq_len in preprocess (#15238) 2022-02-14 13:18:40 +01:00
f52746d004 [Fix doc example] FlaxVisionEncoderDecoder (#15626)
* Fix wrong checkpoint name: vit

* Fix missing import

* Fix more missing import

* make style

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2022-02-14 12:48:23 +01:00
52d2e6f6e9 Add push to hub to feature extractor (#15632)
* Add push to hub to feature extractor

* Quality

* Clean up
2022-02-11 17:14:01 -05:00
4f403ea899 Fix grammar in tokenizer_summary (#15614)
"to make ensure" is redundant.
2022-02-11 16:51:30 -05:00
7a32e4722f Custom feature extractor (#15630)
* Rework AutoFeatureExtractor.from_pretrained internal

* Custom feature extractor

* Add more tests

* Add support for custom feature extractor code

* Clean up
2022-02-11 16:43:54 -05:00
fcb0f74397 [research_projects] deal with security alerts (#15594)
* [research_projects] deal with security alerts

* add a note of the original PL ver and warning
2022-02-11 14:31:09 -05:00
f15c99fabf [deepspeed docs] misc additions (#15585)
* [deepspeed docs] round_robin_gradients

* training and/or eval/predict loss is

* Update docs/source/main_classes/deepspeed.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-11 10:54:04 -08:00
2dce350b33 Fix _configuration_file argument getting passed to model (#15629) 2022-02-11 13:46:08 -05:00
85aee09e9a 🖍 remove broken link (#15615) 2022-02-11 12:33:55 -06:00
2f40c728c9 TF MT5 embeddings resize (#15567)
* Fix TF MT5 vocab resize

* more assertive testing
2022-02-11 17:35:10 +00:00
8c03df1010 Rebase (#15606) 2022-02-11 12:02:02 -05:00
3fae83d23a TF: Add informative warning for inexistent CPU backprop ops (#15612)
* Add informative warning
2022-02-11 16:16:26 +00:00
7e4844fc2a Enable ONNX export when PyTorch and TensorFlow installed in the same environment (#15625) 2022-02-11 16:25:06 +01:00
6cf06d198c Mark "code in the Hub" API as experimental (#15624) 2022-02-11 09:55:31 -05:00
45c7b5b1c7 [Generate] Small refactor (#15611) 2022-02-10 18:29:27 +01:00
c0864d98ba Correct JSON format (#15600) 2022-02-10 09:02:03 -08:00
2e8b85f72e Add local and TensorFlow ONNX export examples to docs (#15604)
* Add local and TensorFlow ONNX export examples to docs

* Use PyTorch - TensorFlow split
2022-02-10 16:31:00 +01:00
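
For reference, the local-export flow documented there: export with `python -m transformers.onnx --model=distilbert-base-uncased onnx/`, then run the result with ONNX Runtime, roughly as follows (the output name `last_hidden_state` is assumed for this checkpoint):

```
from onnxruntime import InferenceSession
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
session = InferenceSession("onnx/model.onnx")

# Tokenize to NumPy tensors, which ONNX Runtime consumes directly
inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
print(outputs[0].shape)
```
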
3a2ed96714 Fix Seq2SeqTrainer (#15603)
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MBP.localdomain>
2022-02-10 16:26:14 +01:00
724e51c6e6 Compute loss independent from decoder for TF EncDec models (as #14139) (#15175)
* Compute loss independent from decoder (as 14139)

* fix expected seq_len + style

* Apply the same change to TFVisionEncoderDecoderModel

* fix style

* Add case with labels in equivalence test

* uncomment

* Add case with labels in equivalence test

* add decoder_token_labels

* use hf_compute_loss

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Add copied from

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2022-02-10 15:47:02 +01:00
3d5dea9bf0 Add example batch size to all commands (#15596) 2022-02-10 08:52:07 -05:00
cb7ed6e083 Add Tensorflow handling of ONNX conversion (#13831)
* Add TensorFlow support for ONNX export

* Change documentation to mention conversion with Tensorflow

* Refactor export into export_pytorch and export_tensorflow

* Check model's type instead of framework installation to choose between TF and Pytorch

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Alberto Bégué <alberto.begue@della.ai>
Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
2022-02-10 11:18:41 +01:00
e923917cd9 Reformat tokenization_fnet 2022-02-09 22:23:32 -05:00
644ec05233 Make slow tests slow 2022-02-09 19:10:22 -05:00
c722753afd Expand tutorial for custom models (#15587)
* Expand tutorial for custom models

* Style

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>

Co-authored-by: Lysandre Debut <lysandre.debut@reseau.eseo.fr>
2022-02-09 17:44:28 -05:00
a86ee2261e Add link (#15588)
Co-authored-by: Niels Rogge <nielsrogge@Nielss-MBP.localdomain>
2022-02-09 23:33:39 +01:00
dee17d5676 [trainer docs] document how to select specific gpus (#15551)
* [trainer docs] document how to select specific gpus

* expand

* add urls

* add accelerate launcher
2022-02-09 10:12:29 -08:00
258480864d update serving_output for some TF models (#15568)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-09 18:32:51 +01:00
315e67404d Fix tests hub failure (#15580)
* Expose hub test problem

* Fix tests
2022-02-09 12:27:59 -05:00
b1ba03e082 Fix quality 2022-02-09 12:06:59 -05:00
eed3186b79 Trigger doc build 2022-02-09 11:57:59 -05:00
2b5603f6ac Constrained Beam Search [without disjunctive decoding] (#15416)
* added classes to get started with constrained beam search

* in progress, think i can directly force tokens now but not yet with the round robin

* think now i have total control, now need to code the bank selection

* technically works as desired, need to optimize and fix design choices leading to undesirable outputs

* complete PR #1 without disjunctive decoding

* removed incorrect tests

* Delete k.txt

* Delete test.py

* Delete test.sh

* revert changes to test scripts

* genutils

* full implementation with testing, no disjunctive yet

* shifted docs

* passing all tests realistically ran locally

* removing accidentally included print statements

* fixed source of error in initial PR test

* fixing the get_device() vs device trap

* fixed documentation docstrings about constrained_beam_search

* fixed tests failing for Speech2TextModel's floating point inputs

* fix cuda long tensor

* added examples and testing for them and found & fixed a bug in beam_search and constrained_beam_search

* deleted accidentally added test halting code with assert False

* code reformat

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Update tests/test_generation_utils.py

* fixing based on comments on PR

* took out the testing code that should work but fails without the beam search modification; style changes

* fixing comments issues

* docstrings for ConstraintListState

* typo in PhrasalConstraint docstring

* docstrings improvements

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-09 16:59:26 +01:00
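
A minimal sketch of the `constraints` argument to `generate()` that this PR describes (the checkpoint and forced phrase are illustrative; `PhrasalConstraint` takes a plain list of token ids):

```
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, PhrasalConstraint

tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")

# Force the token sequence for "Sie" to appear somewhere in the output
constraint = PhrasalConstraint(tokenizer("Sie", add_special_tokens=False).input_ids)

inputs = tokenizer("translate English to German: How old are you?", return_tensors="pt")
outputs = model.generate(**inputs, constraints=[constraint], num_beams=5, max_length=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
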
0113aae5b7 Add implementation of typical sampling (#15504)
* typical decoding

* changing arg name

* add test config params

* forgotten arg rename

* fix edge case where scores are same

* test for typical logits warper

* code quality fixes
2022-02-09 16:48:41 +01:00
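
After the argument rename mentioned in the commit body, usage is presumably a single extra flag on `generate()`; a sketch assuming the final name `typical_p`:

```
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Today is a beautiful day, and", return_tensors="pt")
# typical_p keeps only tokens whose information content is close to the expected value
outputs = model.generate(**inputs, do_sample=True, typical_p=0.2, max_length=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
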
f588cf4050 [Flax tests/FlaxBert] make from_pretrained test faster (#15561) 2022-02-09 16:48:08 +01:00
7029240927 Upgrade click version (#15579) 2022-02-09 10:28:43 -05:00
9e00566b9b Add Wav2Vec2 Adapter Weights to Flax (#15566)
* Add Wav2Vec2 Adapter Weights to Flax

* Suggested changes
2022-02-09 10:24:40 -05:00
1f60bc46f3 Make sure custom configs work with Transformers (#15569)
* Make sure custom configs work with Transformers

* Apply code review suggestions
2022-02-09 10:04:44 -05:00
7732d0fe7a Upgrade black to version ~=22.0 (#15565)
* Upgrade black to version ~=22.0

* Check copies

* Fix code
2022-02-09 09:28:57 -05:00
d923f76203 add model scaling section (#15119)
* add model scaling section

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* integrate reviewer feedback

* initialize GPU properly

* add note about BnB optimizer

* move doc from `scaling.mdx` to `performance.mdx`

* integrate reviewer feedback

* revert section levels

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-09 15:27:30 +01:00
b5c6fdecf0 PoC for a ProcessorMixin class (#15549)
* PoC for a ProcessorMixin class

* Documentation

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Roll out to other processors

* Add base feature extractor class in init

* Use args and kwargs

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-09 09:24:49 -05:00
ba3f9a71a1 logger.warn --> logger.warning (#15572)
* change logger.warn to logger.warning

* make style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-09 08:20:05 -05:00
a6885db912 [Flax tests] fix test_model_outputs_equivalence (#15571)
* fix test_model_outputs_equivalence

* fix tuple outputs for blenderbot
2022-02-09 12:26:48 +01:00
fcb4f11c92 📝 Add codecarbon callback to docs (#15563) 2022-02-08 14:10:53 -05:00
077c00c0b2 feat(flax): allow encoder_outputs in generate (#15554)
* feat(flax): allow encoder_outputs in generate

* doc(flax): encoder_outputs in generate

* fix: style

* fix: style
2022-02-08 17:53:22 +01:00
8406fa6dd5 Add TFSpeech2Text (#15113)
* Add wrapper classes

* convert inner layers to tf

* Add TF Encoder and Decoder layers

* TFSpeech2Text models

* Loadable model

* TF model with same outputs as PT model

* test skeleton

* correct tests and run the fixup

* correct attention expansion

* TFSpeech2Text past_key_values with TF format
2022-02-08 16:27:23 +00:00
6a5472a8e1 Force use_cache to be False in PyTorch (#15385)
* use_cache = False for PT models if labels is passed

* Fix for BigBirdPegasusForConditionalGeneration

* add warning if users specify use_cache=True

* Use logger.warning instead of warnings.warn

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-08 16:20:53 +01:00
0acd84f7cb [GPTJ] fix docs (#15558) 2022-02-08 15:54:19 +01:00
87d08afb16 electra is added to onnx supported model (#15084)
* electra is added to onnx supported model

* add google/electra-base-generator for test onnx module

Co-authored-by: Lewis Tunstall <lewis.c.tunstall@gmail.com>
2022-02-08 15:47:49 +01:00
0fe17f375a FX tracing improvement (#14321)
* Change the way tracing happens, enabling dynamic axes out of the box

* Update the tests and modeling xlnet

* Add the non recoding of leaf modules to avoid recording more values for the methods to record than what will be seen at tracing time (which would otherwise desynchronize the recorded values and the values that need to be given to the proxies during tracing, causing errors).

* Comments and making tracing work for gpt-j and xlnet

* Refactor things related to num_choices (and batch_size, sequence_length)

* Update fx to work on PyTorch 1.10

* Postpone autowrap_function feature usage for later

* Add copyrights

* Remove unnecessary file

* Fix issue with add_new_model_like

* Apply suggestions
2022-02-07 22:25:33 +01:00
552f8d3091 Create a custom model guide (#15489)
* 📝 add config section

* 📝 finish first draft

* 📝 add feature extractor and processor

* 🖍 apply feedback from review

* 📝 minor edits

* last review
2022-02-07 12:34:56 -06:00
ad1d3c4d4b Make TF Wav2Vec2 outputs the same as PT's version (#15530)
* fix outputs

* fix for CTC

* fix doc

* make style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-07 18:09:57 +01:00
131e258411 Fix TF T5/LED missing cross attn in retrun values (#15511)
* add cross attn to outputs

* add cross attn to outputs for TFLED

* add undo padding

* remove unused import

* fix style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-07 17:41:48 +01:00
6775b211b6 Remove Longformers from ONNX-supported models (#15273) 2022-02-07 17:32:13 +01:00
7a1412e12b Wav2Vec2 models must either throw or deal with add_adapter (#15409)
* Wav2Vec2 models must either throw or deal with add_adapter

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Add pre-add_adapter backwards compatibility

* Add pre-add_adapter backwards compatibility

* Fix issue in tests/test_modeling_wav2vec2.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-07 17:03:12 +01:00
a459f7f97d Add ASR CTC streaming example (#15309)
* Single-epoch run

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Infinite dataset

* Trainer fix + distributed benchmark

* Benchmark fix

* unused import

* interleaved splits

* interleaved splits

* has_length util

* Move to research projects

* Leftover Sized checks

* Bump min version

* Unused import

* Revert trainer changes

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-02-07 18:35:37 +03:00
75b13f82e9 [Trainer] Deeper length checks for IterableDatasetShard (#15539)
* Unused import

* Make `has_length()` torch-independent to use in callbacks

* Update src/transformers/trainer_utils.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-07 18:34:56 +03:00
84eec9e6ba Add ConvNeXT (#15277)
* First draft

* Add conversion script

* Improve conversion script

* Improve docs and implement tests

* Define model output class

* Fix tests

* Fix more tests

* Add model to README

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply more suggestions from code review

* Apply suggestions from code review

* Rename dims to hidden_sizes

* Fix equivalence test

* Rename gamma to gamma_parameter

* Clean up conversion script

* Add ConvNextFeatureExtractor

* Add corresponding tests

* Implement feature extractor correctly

* Make implementation cleaner

* Add ConvNextStem class

* Improve design

* Update design to also include encoder

* Fix gamma parameter

* Use sample docstrings

* Finish conversion, add center cropping

* Replace nielsr by facebook, make feature extractor tests smaller

* Fix integration test

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-07 16:11:37 +01:00
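
Typical inference with the new classes, sketched under the assumption that the `facebook/convnext-tiny-224` checkpoint mentioned in the PR bullets is used:

```
import requests
import torch
from PIL import Image
from transformers import ConvNextFeatureExtractor, ConvNextForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

feature_extractor = ConvNextFeatureExtractor.from_pretrained("facebook/convnext-tiny-224")
model = ConvNextForImageClassification.from_pretrained("facebook/convnext-tiny-224")

inputs = feature_extractor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```
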
c47d259241 [torch_int_div] Correct true division in generation (#15498)
* [torch_int_div] Correct true division in generation

* up

* up
2022-02-07 16:04:18 +01:00
5f1918a4a8 [ASR pipeline] correct asr pipeline for seq2seq models (#15541) 2022-02-07 15:35:44 +01:00
e02bdce791 Revert "Handle PyTorch to Flax conversion of 1D convolutions (#15519)" (#15540)
This reverts commit 854a0d526c7a3b958a790e92272ac798ca3831f5.
2022-02-07 12:33:49 +01:00
8ce1330631 [deepspeed docs] DeepSpeed ZeRO Inference (#15486)
* [deepspeed docs] DeepSpeed ZeRO Inference

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* tweak

* deal with black

* extra cleanup, better comments

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-02-04 13:51:02 -08:00
ac6aa10f23 Standardize semantic segmentation models outputs (#15469)
* Standardize instance segmentation models outputs

* Rename output

* Update src/transformers/modeling_outputs.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Add legacy argument to the config and model forward

* Update src/transformers/models/beit/modeling_beit.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Copy fix in Segformer

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2022-02-04 14:52:07 -05:00
31be2f45a9 [deepspeed docs] Megatron-Deepspeed info (#15488) 2022-02-04 11:15:13 -08:00
bbe9c6981b Fix TFRemBertEncoder all_hidden_states (#15510)
* fix

* fix test

* remove expected_num_hidden_layers

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-04 16:32:14 +00:00
854a0d526c Handle PyTorch to Flax conversion of 1D convolutions (#15519) 2022-02-04 17:08:03 +01:00
486260c68e use kwargs (#15509)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-04 15:25:37 +00:00
525dbbf84a Remove loss from some flax models docs & examples (#15492)
* Remove return_loss from Flax models

* fix more

* fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-03 21:39:46 +01:00
21dcaec5d5 [deepspeed docs] memory requirements (#15506) 2022-02-03 10:55:14 -08:00
f1a4c4ead5 [WIP] Add preprocess_logits_for_metrics Trainer param (#15473)
* Add preprocess_logits_for_metrics Trainer param

* Compute accuracy in LM examples

* Improve comments
2022-02-03 12:07:20 -05:00
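
The point of the new parameter is to shrink logits before they are accumulated for `compute_metrics`; a sketch (`model` and `eval_dataset` are placeholders for objects set up elsewhere):

```
from transformers import Trainer, TrainingArguments

def preprocess_logits_for_metrics(logits, labels):
    # Keep only argmax ids instead of full vocab-sized logits, saving memory
    return logits.argmax(dim=-1)

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    return {"accuracy": float((preds == labels).mean())}

trainer = Trainer(
    model=model,  # placeholder: any model/dataset pair prepared earlier
    args=TrainingArguments(output_dir="out"),
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)
```
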
4f5faaf044 [deepspeed] fix a bug in a test (#15493)
* [deepspeed] fix a bug in a test

* consistency
2022-02-03 08:55:45 -08:00
90166121ee Add general vision docstrings (#15501)
* Add general docstrings

* Remove legacy docstrings

* Add BEiT

* Add DEiT

* Add SegFormer

* Fix beit output class

* Fix missing return_dict
2022-02-03 17:47:22 +01:00
e2b6e73fa2 [Flax tests] Disable scheduled GPU tests (#15503) 2022-02-03 17:12:14 +01:00
f5d98da29e fix load_weight_prefix (#15101)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-03 15:11:53 +00:00
71dccd0774 fix (#15494)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-03 12:57:28 +01:00
5ec368d79e Correct eos_token_id settings in generate (#15403)
* Correct eos_token_id set in generate

* Set eos_token_id in test

* Correct eos_token_id set in generate

* Set eos_token_id in test
2022-02-03 00:24:40 +01:00
39b5d1a63a fix set truncation attribute in __init__ of PreTrainedTokenizerBase (#15456)
* change truncation_side in init of `PreTrainedTokenizerBase`

Co-authored-by: LSinev <LSinev@users.noreply.github.com>

* add test

* Revert "replace assert with exception for `padding_side` arg in `PreTrainedTokenizerBase` `__init__`"

This reverts commit 7a98b87962d2635c7e4d4f00db3948b694624843.

* fix kwargs

* Revert "fix kwargs"

This reverts commit 67b0a5270e8cf1dbf70e6b0232e94c0452b6946f.

* Update tests/test_tokenization_common.py

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>

* delete truncation_side variable

* reorganize test

* format

* complete doc

* Revert "Revert "replace assert with exception for `padding_side` arg in `PreTrainedTokenizerBase` `__init__`""

This reverts commit d5a10a7e2680539e5d9e98ae5d896c893d224b80.

* fix typo

* fix typos to render documentation

* Revert "Revert "Revert "replace assert with exception for `padding_side` arg in `PreTrainedTokenizerBase` `__init__`"""

This reverts commit 16cf58811943a08f43409a7c83eaa330686591d0.

* format

Co-authored-by: LSinev <LSinev@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2022-02-02 23:18:09 +01:00
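
With this change the truncation direction can be set at construction time (relying on the `tokenizers>=0.11.1` requirement bumped elsewhere in this range); a sketch:

```
from transformers import AutoTokenizer

# Truncate from the left so the *end* of long inputs is kept
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
enc = tokenizer("word " * 100, truncation=True, max_length=16)
print(len(enc.input_ids))  # 16
```
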
45cac3fade Fix labels stored in model config for token classification examples (#15482)
* Playing

* Properly set labels in model config for token classification example

* Port to run_ner_no_trainer

* Quality
2022-02-02 14:23:43 -05:00
c74f3d4c48 Add W&B backend for hyperparameter sweep (#14582)
# Add support for W&B hyperparameter sweep
This PR:
* allows using wandb for running hyperparameter search.
* The runs are visualized on W&B sweeps dashboard
* This supports running sweeps on parallel devices, all reporting to the same central dashboard.

### Usage
**To run a new hyperparameter search:**
```
trainer.hyperparameter_search(
    backend="wandb", 
    project="transformers_sweep", # name of the project
    n_trials=5,
    metric="eval/loss", # metric to be optimized, default 'eval/loss'. A warning is raised if the passed metric is not found
)
```
This outputs a sweep id, e.g. `my_project/sweep_id`

**To run sweeps on parallel devices:**
Just pass the sweep id which you want to run in parallel
```
trainer.hyperparameter_search(
    backend="wandb", 
    sweep_id = "my_project/sweep_id"
)
```
2022-02-02 14:06:14 -05:00
13297ac71c Fix docstring of ASR pipeline (#15481) 2022-02-02 12:12:22 -05:00
dd360d58d9 fix error posted in issue #15448 (#15480)
* fix error posted in issue #15448

Signed-off-by: bugface <alexgre@ufl.edu>

* clean up - remove commented line

Signed-off-by: bugface <alexgre@ufl.edu>
2022-02-02 10:45:51 -05:00
44b21f117b Save code of registered custom models (#15379)
* Allow dynamic modules to use relative imports

* Work for configs

* Fix last merge conflict

* Save code of registered custom objects

* Map strings to strings

* Fix test

* Add tokenizer

* Rework tests

* Tests

* Ignore fixtures py files for tests

* Tokenizer test + fix collection

* With full path

* Rework integration

* Fix typo

* Remove changes in conftest

* Test for tokenizers

* Add documentation

* Update docs/source/custom_models.mdx

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Add file structure and file content

* Add more doc

* Style

* Update docs/source/custom_models.mdx

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Address review comments

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
2022-02-02 10:44:37 -05:00
623d8cb475 Adding support for microphone streaming within pipeline. (#15046)
* Adding support for `microphone` streaming within pipeline.

- Uses `ffmpeg` to get microphone data.
- Makes sure alignment is made to `size_of_sample`.
- Works by sending `{"raw": ..data.., "stride": (n, left, right),
"partial": bool}`
directly to the pipeline enabling to stream partial results and still
get inference.
- Let's `partial` information flow through the pipeline to enable caller
  to get it back and choose to display text or not.

- The striding reconstitution is bound to have errors since CTC does not
keep previous state. Currently most of the errors stem from not knowing whether
there's a space between two chunks.
Since we have some left striding info, we could use that during decoding
to choose what to do with those spaces and even extra letters maybe (if
the stride is long enough, it's bound to cover at least a few symbols)

Fixing tests.

Protecting with `require_torch`.

`raw_ctc` support for nicer demo.

Post rebase fixes.

Revamp to split raw_mic_data from its live chunking.

- Requires a refactor to make everything a bit cleaner.

Automatic resampling.

Small fix.

Small fix.

* Post rebase fix (need to let super handle more logic, reorder args.)

* Update docstrings

* Docstring format.

* Remove print.

* Prevent flow of `input_values`.

* Fixing `stride` too.

* Fixing the PR by removing `raw_ctc`.

* Better docstrings.

* Fixing init.

* Update src/transformers/pipelines/audio_utils.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Update tests/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* Quality.

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
2022-02-02 15:12:12 +01:00
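
A sketch of how the streaming helper described above might be consumed (the helper name `ffmpeg_microphone_live` and the `partial` flag follow this PR's description; exact item keys may differ):

```
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_microphone_live

pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
sampling_rate = pipe.feature_extractor.sampling_rate

# Yields {"raw": ..., "stride": (n, left, right), "partial": bool} chunks from the mic
mic = ffmpeg_microphone_live(sampling_rate=sampling_rate, chunk_length_s=5.0, stream_chunk_s=1.0)
for item in pipe(mic):
    end = "\r" if item.get("partial") else "\n"  # overwrite partial results in place
    print(item["text"], end=end)
```
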
d718c0c3a8 [Wav2Vec2ProcessorWithLM] add alpha & beta to batch decode & decode (#15465) 2022-02-02 12:59:40 +01:00
1d94d57546 Add option to resize like torchvision's Resize (#15419)
* Add torchvision's resize

* Rename torch_resize to default_to_square

* Apply suggestions from code review

* Add support for default_to_square and tuple of length 1
2022-02-02 09:44:22 +01:00
b9418a1d97 Update tutorial docs (#15165)
* first draft of pipeline, autoclass, preprocess tutorials

* apply review feedback

* 🖍 apply feedback from patrick/niels

* 📝add output image to preprocessed image

* 🖍 apply feedback from patrick
2022-02-01 18:31:35 -06:00
c157c7e3fd Update fine-tune docs (#15259)
* add fine-tune tutorial

* make edits, fix style

* 📝 make edits

* 🖍 fix code format links to external libraries

* 🔄revert code formatting

* 🖍 use DefaultDataCollator instead of DataCollatorWithPadding
2022-02-01 18:28:12 -06:00
d0b5ed110a Harder check for IndexErrors in QA scripts (#15438)
* Harder check for IndexErrors in QA scripts

* Make test stronger
2022-02-01 15:49:13 -05:00
8e5d4e4906 Trainer.push_to_hub always tries to push to the Hub (#15463) 2022-02-01 15:49:04 -05:00
37800f1365 [BartTokenizer] remove inheritance on RobertaTokenizer (#15461)
* refactor bart tokenizers

* doc

* replace assert with ValueError
2022-02-01 20:59:24 +01:00
f427e75049 use mean instead of elementwise_mean in XLMPredLayer (#15436)
* use mean instead of elementwise_mean

* make style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-01 19:08:17 +01:00
7b8bdd8601 fix the tokenizer_config.json file for the slow tokenizer when a fast version is available (#15319)
* add new test

* update test

* remove `tokenizer_file` from `additional_files_names` in `tokenization_utils_base.py`

* add `tokenizer_file` for the fast only tokenizer

* change global variables layoutxlm

* remove `"tokenizer_file"` from DPR tokenizer's Global variables

* remove `tokenizer_file` from herbert slow tokenizer init

* `"tokenizer_file"` from LED tokenizer's Global variables

* remove `tokenizer_file` from mbart slow tokenizer init

* remove `tokenizer_file` from slow tokenizer template

* adapt to versioning

* adapt the `test_tokenizer_mismatch_warning` test

* clean test

* clarify `VOCAB_FILES_NAMES` in tokenization_utils_fast.py

* Revert "remove `tokenizer_file` from mbart slow tokenizer init"

This reverts commit 0dbb723fa9c7599d4640fe30b3647a74eb4a64e1.

* Revert "`"tokenizer_file"` from LED tokenizer's Global variables"

This reverts commit 5a3f879bdd651233f3d74a3d1146c34cde82b0c2.

* Revert "remove `tokenizer_file` from herbert slow tokenizer init"

This reverts commit f5e10007b7b0ec5345e015b9de7ffec72c5407fd.

* Revert "remove `"tokenizer_file"` from DPR tokenizer's Global variables"

This reverts commit da0895330bedfafc81ae3073470a9348c669f032.

* set `tokenizer_file` in super `__init__` of mbart
2022-02-01 16:48:25 +01:00
6d585fe0f0 replace assert with exception for padding_side arg in PreTrainedTokenizerBase __init__ (#15454)
* replace assert with exception for `padding_side` arg in `PreTrainedTokenizerBase` `__init__`

* add test

* fix kwargs

* reformat test

* format

* format

* fix typo to render the documentation
2022-02-01 16:13:58 +01:00
d2749cf72e Update README.md (#15462)
fix typo
2022-02-01 10:04:30 -05:00
1c9648c457 [M2M100, XGLM] fix positional emb resize (#15444) 2022-02-01 14:32:55 +01:00
2ca6268394 fix from_vision_text_pretrained doc example (#15453)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-01 12:20:22 +01:00
dc05dd539f Fix TF Causal LM models' returned logits (#15256)
* Fix TF Causal LM models' returned logits

* Fix expected shape in the tests

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-01 11:04:07 +00:00
af5c3329d7 remove "inputs" in tf common test script (no longer required) (#15262)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-02-01 10:09:49 +00:00
d12ae81664 [generate] fix synced_gpus default (#15446) 2022-01-31 13:58:27 -08:00
d4f201b860 skip test for XGLM (#15445) 2022-01-31 16:53:16 -05:00
0c17e766cb Error when group_by_length is used with an IterableDataset (#15437) 2022-01-31 15:33:16 -05:00
125a2882b4 Update modeling_wav2vec2.py (#15423)
* Update modeling_wav2vec2.py

With very tiny sound files (less than 0.1 seconds) the num_masked_span can be too long. The issue is described in issue #15366 and discussed with @patrickvonplaten.

* correct errors with mask time indices

* remove bogus file

* make fix-copies

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-31 21:22:11 +01:00
d984b10335 Add 'with torch.no_grad()' to BEiT integration test forward passes (#14961)
* Add 'with torch.no_grad()' to BEiT integration test forward pass

* Fix inconsistent use of tabs and spaces in indentation
2022-01-31 15:12:10 -05:00
09f9d07271 Misfiring tf warnings (#15442)
* Fix spurious warning in TF TokenClassification models

* Fixing one last spurious warning

* Removing outdated warning altogether
2022-01-31 19:17:59 +00:00
6915174e68 [RobertaTokenizer] remove inheritance on GPT2Tokenizer (#15429)
* refactor roberta tokenizer

* refactor fast tokenizer

* remove old comment
2022-01-31 19:50:25 +01:00
a5ecbf7348 correct positional emb size (#15441) 2022-01-31 19:47:49 +01:00
5a70987301 Fix TFLEDModel (#15356)
* fix tf led

* fix

* fix

* Add test_pt_tf_model_equivalence_extra for TFLED

* add a (temporary) test

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-31 19:35:54 +01:00
87918d3221 [examples/Flax] add a section about GPUs (#15198)
* add a section about GPUs

* Apply suggestions from code review

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-31 19:20:53 +01:00
b8810847d0 [Trainer] suppress warning for length-related columns (#15421)
* [Trainer] suppress warning for length-related columns

* improve message

* Update src/transformers/trainer.py
2022-01-31 18:51:29 +01:00
3385ca2582 Change REALM checkpoint to new ones (#15439)
* Change REALM checkpoint to new ones

* Last checkpoint missing
2022-01-31 12:50:20 -05:00
7e56ba2864 Fix spurious warning in TF TokenClassification models (#15435) 2022-01-31 17:09:16 +00:00
554d333ece Fix loss calculation in TFXXXForTokenClassification models (#15294)
* Fix loss calculation in TFFunnelForTokenClassification

* revert the change in TFFunnelForTokenClassification

* fix FunnelForTokenClassification loss

* fix other TokenClassification loss

* fix more

* fix more

* add num_labels to ElectraForTokenClassification

* revert the change to research projects

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-31 11:43:08 -05:00
44c7857b87 [deepspeed doc] fix import, extra notes (#15400)
* [deepspeed doc] fix import, extra notes

* typo
2022-01-31 08:28:10 -08:00
47df0f2234 Add header (#15434) 2022-01-31 11:15:54 -05:00
7fc6f41d91 Add doc for add-new-model-like command (#15433) 2022-01-31 11:10:45 -05:00
282ae123e2 add t5 ner finetuning (#15432) 2022-01-31 17:03:06 +01:00
d4b3e56d64 [Hotfix] Fix Swin model outputs (#15414)
* Fix Swin model outputs

* Rename pooler
2022-01-31 16:32:14 +01:00
38dfb40ae3 import torch.utils.checkpoint (#15427) 2022-01-31 15:51:50 +01:00
f624249d8b [Robust Speech Challenge] Add missing LR parameter (#15428) 2022-01-31 15:50:56 +01:00
3254080d45 Update README.md (#15430)
fix typo
2022-01-31 09:48:20 -05:00
aa19f478ac Add (M)Luke model training for Token Classification in the examples (#14880)
* Add Luke training

* Fix true label tags

* Fix true label tags

* Fix true label tags

* Update the data collator for Luke

* Some training refactor for Luke

* Improve data collator for Luke

* Fix import

* Fix datasets concatenation

* Add the --max_entity_length argument for Luke models

* Remove unused code

* Fix style issues

* Fix style issues

* Move the Luke training into a separate folder

* Fix style

* Fix naming

* Fix filtering

* Fix filtering

* Fix filter

* Update some preprocessing

* Move luke to research_projects

* Checkstyle

* Address comments

* Fix style
2022-01-31 07:58:18 -05:00
0094eba363 Fix additional DataTrainingArguments documentation (#15408)
(This is an editorial change only)
2022-01-31 07:45:11 -05:00
ee5de66349 Add SegformerFeatureExtractor to Auto API (#15410) 2022-01-31 11:38:08 +01:00
0f69b924fb [XGLMTokenizer] fix init and add in AutoTokenizer (#15406) 2022-01-30 15:35:53 +01:00
f380bf2b61 Fix the inconsistency of loss calculation between PT/TF XLNetLMHeadModel (#15298)
* Fix the inconsistency of loss calculation between PT/TF XLNetLMHeadModel

* overwrite test_loss_computation

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-29 15:08:35 +00:00
e09473a817 Add support for XLM-R XL and XXL models by modeling_xlm_roberta_xl.py (#13727)
* add xlm roberta xl

* add convert xlm xl fairseq checkpoint to pytorch

* fix init and documents for xlm-roberta-xl

* fix indention

* add test for XLM-R xl,xxl

* fix model hub name

* fix some stuff

* up

* correct init

* fix more

* fix as suggestions

* add torch_device

* fix default values of doc strings

* fix leftovers

* merge to master

* up

* correct hub names

* fix docs

* fix model

* up

* finalize

* last fix

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* add copied from

* make style

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-29 13:42:37 +01:00
16d4acbfdb Get started docs (#15098)
* clean commit of changes

* apply review feedback, make edits

* fix backticks, minor formatting

* 🖍 make fixup and minor edits

* 🖍 fix # in header

* 📝 update code sample without from_pt

* 📝 final review
2022-01-28 19:01:37 -06:00
cabd6d26a2 Update model share tutorial (#15288)
* add model sharing tutorial

* 🖍 apply feedback from review

* 📝 make edits

* 🖍 fix formatting

* 📝 convert from pt checkpoint to flax

* 📝 final review
2022-01-28 18:49:26 -06:00
c98a6ac211 Use argument for preprocessing workers in run_summairzation (#15394) 2022-01-28 18:34:10 -05:00
db07956740 Fix missing eps arg for LayerNorm in ElectraGeneratorPredictions (#15332)
* fix missing eps

* Same fix for ConvBertGeneratorPredictions

* Same fix for AlbertMLMHead

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-28 18:32:26 -05:00
297602c7f4 [deepspeed] saving checkpoint fallback when fp16 weights aren't saved (#14948)
* [deepspeed] saving checkpoint fallback when fp16 weights aren't saved

* Bump required deepspeed version to match usage when saving checkpoints

* update version

Co-authored-by: Mihai Balint <balint.mihai@gmail.com>
2022-01-28 11:05:47 -08:00
d25e25ee2b Add XGLM models (#14876)
* add xglm

* update vocab size

* fix model name

* style and tokenizer

* typo

* no mask token

* fix pos embed compute

* fix args

* fix tokenizer

* fix positions

* fix tokenization

* style and dic fixes

* fix imports

* add fast tokenizer

* update names

* add pt tests

* fix tokenizer

* fix typo

* fix tokenizer import

* fix fast tokenizer

* fix tokenizer

* fix converter

* add tokenizer test

* update checkpoint names

* fix tokenizer tests

* fix slow tests

* add copied from comments

* rst -> mdx

* flax model

* update flax tests

* quality

* style

* doc

* update index and readme

* fix copies

* fix doc

* update toctree

* fix indent

* minor fixes

* fix config doc

* don't save embed_pos weights

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* address Sylvain's comments, a few doc fixes

* fix check_repo

* align order of arguments

* fix copies

* fix labels

* remove unnecessary mapping

* fix saving tokenizer

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-28 18:55:23 +01:00
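
Basic causal-LM usage with the new model, assuming the `facebook/xglm-564M` checkpoint referenced in the PR:

```
from transformers import AutoTokenizer, XGLMForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
model = XGLMForCausalLM.from_pretrained("facebook/xglm-564M")

inputs = tokenizer("I wanted to conserve energy.", return_tensors="pt")
outputs = model.generate(**inputs, max_length=30)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
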
b6b79faa7e Make links explicit (#15395)
* Make links explicit

* Removing reference to compute_metrics() since it's kind of PyTorch-specific
2022-01-28 17:31:22 +00:00
6df29ba5e6 fix wrong tokenizer checkpoint name in flax marian (#15391)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-28 16:53:25 +01:00
507601a5cf Prepare deprecated ONNX exporter for torch v1.11 (#15388)
* Prepare deprecated ONNX exporter for PyTorch v1.11

* Add deprecation warning
2022-01-28 16:32:47 +01:00
4996922b6d [docs] fix wrong file name in pr_check (#15380) 2022-01-28 07:52:01 -05:00
8f5d62fdb1 Fix bad_words_ids not working with sentencepiece-based tokenizers (#15343)
* Fix `bad_word_ids` not working with sentencepiece-based tokenizers

* make style

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-28 12:39:55 +01:00
06107541d3 Fixing support for batch_size and num_return_sequences in text-generation pipeline (#15318)
* Fixing support for `batch_size` and `num_return_sequences` in the `text-generation` pipeline

And `text2text-generation` too.

The bug was caused by the batch_size containing both the incoming batch
**and** the generated `num_sequences`.

The fix simply consists into splitting both of these again into
different dimensions.

* TF support.

* Odd backward compatibility script in the way.
2022-01-28 12:15:30 +01:00
c4d1fd77fa Set syncfree AdamW as the default optimizer for xla:gpu device in amp mode (#15361)
* Use syncfree AdamW for xla:gpu device by default

* Make syncfree AdamW optional
2022-01-27 20:05:31 -05:00
2e4559fa37 Add init to BORT (#15378)
* Add init to BORT

* BORT should be in init
2022-01-27 15:16:54 -05:00
f5db6ce76a Fix code format for Accelerate doc (#15335)
* 🖍 fix code syntax to external libraries and replace image

* 🔄revert code formatting, replace image with code block

* 🖍 apply feedback
2022-01-27 13:49:04 -06:00
0b07230409 Allow relative imports in dynamic code (#15352)
* Allow dynamic modules to use relative imports

* Add tests

* Add one last test

* Changes
2022-01-27 14:47:59 -05:00
628b59e51d Bump numpy from 1.19.2 to 1.21.0 in /examples/research_projects/lxmert (#15369)
Bumps [numpy](https://github.com/numpy/numpy) from 1.19.2 to 1.21.0.
- [Release notes](https://github.com/numpy/numpy/releases)
- [Changelog](https://github.com/numpy/numpy/blob/main/doc/HOWTO_RELEASE.rst.txt)
- [Commits](https://github.com/numpy/numpy/compare/v1.19.2...v1.21.0)

---
updated-dependencies:
- dependency-name: numpy
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-01-27 14:46:15 -05:00
ca0848b2ff Bump notebook in /examples/research_projects/visual_bert (#15368)
Bumps [notebook](http://jupyter.org) from 6.1.5 to 6.4.1.

---
updated-dependencies:
- dependency-name: notebook
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2022-01-27 14:45:58 -05:00
7d45a2e81c Bump numpy in /examples/research_projects/visual_bert (#15367)
Bumps [numpy](https://github.com/numpy/numpy) from 1.19.2 to 1.21.0.
- [Release notes](https://github.com/numpy/numpy/releases)
- [Changelog](https://github.com/numpy/numpy/blob/main/doc/HOWTO_RELEASE.rst.txt)
- [Commits](https://github.com/numpy/numpy/compare/v1.19.2...v1.21.0)

---
updated-dependencies:
- dependency-name: numpy
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2022-01-27 14:45:18 -05:00
a81fd35524 Fix tests_fetcher (#15376) 2022-01-27 14:17:48 -05:00
eab338104d Docs for version v4.16.0 2022-01-27 13:11:51 -05:00
f87db5e412 Release: v4.16.0 2022-01-27 13:06:33 -05:00
c43749289d Example script for PushToHubCallback (#15375)
* Example script for PushToHubCallback

* Expanding description slightly
2022-01-27 16:16:24 +00:00
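
For context, wiring the callback into a Keras training loop looks roughly like this (`model`, `tokenizer`, and `train_dataset` are placeholders; the repo name is illustrative):

```
from transformers.keras_callbacks import PushToHubCallback

push_to_hub_callback = PushToHubCallback(
    output_dir="./model_save",
    tokenizer=tokenizer,                  # placeholder tokenizer from earlier setup
    hub_model_id="my-username/my-model",  # illustrative Hub repo name
)
model.fit(train_dataset, epochs=3, callbacks=[push_to_hub_callback])
```
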
8f6454bfac Add proper documentation for Keras callbacks (#15374)
* Add proper documentation for Keras callbacks

* Add dummies
2022-01-27 10:51:38 -05:00
2de90beeeb Super-small fix stops us confusing Keras console logging by modifying its logs (#15373) 2022-01-27 15:43:43 +00:00
fa6dce250f Implement fixes for TrainingArguments doc (#15370)
Co-authored-by: osanseviero <osanseviero@gmail.com>

Co-authored-by: osanseviero <osanseviero@gmail.com>
2022-01-27 10:25:43 -05:00
ade7371a41 improve saving strategy of sentencepiece tokenizer (#15328)
* add new test

* add a feature to save the sentencepiece tokenizer model when the init file was deleted

* update marian

* update m2m_100

* fix marian

* update speech to text

* override test for layoutxlm

* fix saving bartpho

* remove harcoded values bartpho

* special token string version

* finish bartpho

* override layoutxml test

* add mbart

* move special tokens list

* format

* Revert "format"

This reverts commit 37a40df37903a932c2f951cbd33acb684246bae7.

* simplify list of string of special tokens

* Re-write `self.fairseq_tokens_to_ids ` initialization logic with special tokens

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
2022-01-27 16:24:51 +01:00
196cce6e9b Add a device argument to the eval script (#15371)
* Device argument for the eval script

* Default to none

* isort
2022-01-27 15:58:55 +01:00
6beae766ee Fix KerasMetricCallback prediction with generate() and inference of column names (#15351)
* Fix prediction with generate() and the inference of column names
Should now have very few differences with the PyTorch implementation

* Minor edit to parent class

* Update src/transformers/keras_callbacks.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Explaining the dict conversion

* Putting main_input_name back

* Fixes to main_input_name

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-27 14:13:23 +00:00
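
A sketch of the callback this fix targets — it runs `metric_fn` over predictions on `eval_dataset` at each epoch end (the datasets and `model` here are placeholders):

```
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback

def metric_fn(eval_predictions):
    predictions, labels = eval_predictions
    return {"accuracy": float((np.argmax(predictions, axis=-1) == labels).mean())}

metric_callback = KerasMetricCallback(metric_fn=metric_fn, eval_dataset=tf_eval_dataset)
model.fit(tf_train_dataset, epochs=3, callbacks=[metric_callback])  # placeholders
```
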
da5ef25db9 Push to hub save (#15327)
* Adapt doc and push at every save

* style
2022-01-27 09:00:54 -05:00
9f831bdeaf [DocTests Speech] Add doc tests for all speech models (#15031)
* fix_torch_device_generate_test

* remove @

* doc tests

* up

* up

* fix doctests

* adapt files

* finish refactor

* up

* save intermediate

* add more logic

* new change

* improve

* next try

* next try

* next try

* next try

* fix final spaces

* fix final spaces

* improve

* renaming

* correct more bugs

* finish wavlm

* add comment

* run on test runner

* finish all speech models

* adapt

* finish
2022-01-27 14:29:31 +01:00
4df69506a8 Fix YosoConfig doc (#15353) 2022-01-26 21:06:27 +01:00
fc8fc400e3 [docs] post-PR merge fix (#15355)
* [docs] post-PR merge fix

* Update docs/source/main_classes/deepspeed.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-26 11:23:32 -08:00
99a2771189 Add YOSO (#15091)
* Add cookiecutter files

* Add cuda kernels and cpp files

* Update modeling_yoso.py

* Add .h files

* Update configuration_yoso.py

* Updates

* Remove tokenizer

* Code quality

* Update modeling_yoso.py

* Update modeling_yoso.py

* Fix failing test

* Update modeling_yoso.py

* Fix code quality

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply suggestions from code review and fix integration tests

* Update src/transformers/models/yoso/modeling_yoso.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Apply suggestions from code review

* Fix copied from statement

* Fix docstring

* Fix code quality

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply suggestions and fix mask

* Apply suggestions from code review

* Fix code quality

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Fix docstrings

* Fix code quality

* Remove trailing whitespace

* Update yoso.mdx

* Move kernel loading to YosoEncoder

* make style

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/yoso/modeling_yoso.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Add short summary to docs

* Update docs/source/model_doc/yoso.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update yoso.mdx

* Update docs/source/model_doc/yoso.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Remove CausalLM model and add copied from

* Remove autoregressive code

* Remove unused imports

* add copied from for embeddings

* Fix code quality

* Update docs/source/model_doc/yoso.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestion from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-26 19:18:29 +01:00
6292532fd1 Update doc writing guide (#15350) 2022-01-26 12:54:11 -05:00
19732cc07a Fix 'eval_split_name' described as defaulting to 'train' (#15348)
The default is correct (`test`) but the description is not.
2022-01-26 10:19:38 -05:00
5d8b98608c Fix deepspeed docs (#15346) 2022-01-26 07:24:33 -05:00
96161ac408 make table into valid Markdown table syntax (#15337) 2022-01-26 07:10:00 -05:00
24e2fa1590 Fix encoder-decoder models when labels is passed (#15172)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-26 10:14:46 +01:00
e79a0faeae Added missing code in example notebook - custom datasets fine-tuning (#15300)
* Added missing code in example notebook - custom datasets fine-tuning

Added missing code in the tokenize_and_align_labels function in the example notebook on custom datasets - token classification.
The missing code concerns adding labels for all but the first token in a single word.
The added code was taken directly from huggingface official example - this [colab notebook](https://github.com/huggingface/notebooks/blob/master/transformers_doc/custom_datasets.ipynb).

* Changes requested in the review - keep the code as simple as possible
2022-01-25 17:26:17 -05:00
0501beb846 Add 🤗 Accelerate tutorial (#15263)
* add accelerate tutorial

* 🖍 apply feedback from review

* 📝 make edits
2022-01-25 13:46:11 -06:00
637e81752a [Tests] Fix test (#15324)
* Fix Swin device

* Remove print statement
2022-01-25 15:48:25 +01:00
e695470794 Avoid using get_list_of_files (#15287)
* Avoid using get_list_of_files in config

* Wip, change tokenizer file getter

* Remove call in tokenizer files

* Remove last call to get_list_model_files

* Better tests

* Unit tests for new function

* Document bad API
2022-01-25 09:41:21 -05:00
e65bfc0971 Try without bad instruction 2022-01-24 15:55:29 -05:00
81156d20cd Add model like (#14992)
* Add new model like command

* Bad doc-styler

* black and doc-styler, stop fighting!

* black and doc-styler, stop fighting!

* At last

* Clean up

* Typo

* Bad doc-styler

* Bad doc-styler

* All good maybe?

* Use constants

* Add doc and type hints

* More cleaning

* Add doc

* Fix Copied from

* Doc template

* Use typing.Pattern instead

* Framework-specific files

* Fixes

* Select frameworks clean model init

* Deal with frameworks in main init

* fixes

* Last fix

* Prompt user for info

* Delete example config

* Last fixes

* Add test config

* Fix bug with model_type included in each other

* Fixes

* More fixes

* More fixes

* Adapt config

* Remove print statements

* Will fix tokenization later, leave it broken for now

* Add test

* Quality

* Try this way

* Debug

* Maybe by setting the path?

* Let's try another way

* It should go better when actually passing the arg...

* Remove debug statements and style

* Fix config

* Add tests

* Test require the three backends

* intermediate commit

* Revamp pattern replacements and start work on feature extractors

* Adapt model info

* Finalize code for processors

* Fix in main init additions

* Finish questionnaire for processing classes

* Fix file name

* Fix for real

* Fix patterns

* Style

* Remove needless warnings

* Copied from should work now.

* Include Copied form in blocks

* Add test

* More fixes and tests

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Address review comment

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2022-01-24 15:25:10 -05:00
457dd4392b [Examples] Correct run ner label2id for fine-tuned models (#15017)
* up

* up

* make style

* apply Sylvain's suggestions

* apply changes to accelerate as well

* more changes

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-24 21:18:04 +01:00
8d6acc6c29 [Beam Search] Correct returned beam scores (#14654)
* better

* save intermediate

* finish code

* up

* docs

* Apply suggestions from code review

* up

* add compute_transition_beam_scores function to the model and make sure scores are correct with eos

* apply Nico's comments

* Apply suggestions from code review

* another fix
2022-01-24 21:13:21 +01:00
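A sketch of how the per-step scores added here can be recovered; the method and argument names are taken from the PR description above and may differ slightly in detail:

```python
# Sketch: recover per-step transition scores from beam search (#14654).
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

inputs = tokenizer("translate English to German: hello", return_tensors="pt")
out = model.generate(
    **inputs, num_beams=4, return_dict_in_generate=True, output_scores=True
)
# beam_indices tracks which beam each token came from at every step
transition_scores = model.compute_transition_beam_scores(
    out.sequences, out.scores, out.beam_indices
)
```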
e239fc3b0b Replace NystromformerTokenizer with AutoTokenizer (#15312) 2022-01-24 16:33:43 +01:00
dcaa5100c9 [LayoutLMV2 Tests] Make sure input is on GPU (#15314)
* [LayoutLMV2 Tests] Make sure input is on GPU

* correct empty line
2022-01-24 15:54:47 +01:00
c15bb3fe19 [Fix doc example] fix missing import jnp (#15291)
* fix missing import jnp

* Fix missing jax and k=1

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-24 14:54:23 +01:00
eac4aecc3d Remove old debug code leftover. (#15306) 2022-01-24 07:27:45 -05:00
2390b2cf65 Fix a typo in tag addition (#15286)
* Fix a typo in tag addition

* Put it back again
2022-01-24 07:21:42 -05:00
c972433a85 Update CONTRIBUTING.md (#15290)
Fix typo in doc
2022-01-24 07:21:31 -05:00
4bf97415a4 Update eval.py (#15310) 2022-01-24 11:46:38 +01:00
b7cb126ccc [PyTorch-nightly-test] Fix Wav2Vec2 LM & Phoneme tests (#15272)
* [PyTorch-nightly-test] Fix Wav2Vec2 LM & Phoneme tests

* Update .github/workflows/self-nightly-scheduled.yml

* change lines

* Apply suggestions from code review
2022-01-24 10:53:53 +01:00
6ac77534bf Refine errors for pretrained objects (#15261)
* Refine errors for pretrained objects

* PoC to avoid using get_list_of_files

* Adapt tests to use new errors

* Quality + Fix PoC

* Revert "PoC to avoid using get_list_of_files"

This reverts commit cb93b7cae8504ef837c2a7663cb7955e714f323e.

* Revert "Quality + Fix PoC"

This reverts commit 3ba6d0d4ca546708b31d355baa9e68ba9736508f.

* Fix doc

* Revert PoC

* Add feature extractors

* More tests and PT model

* Adapt error message

* Feature extractor tests

* TF model

* Flax model and test

* Merge flax auto tests

* Add tokenization

* Fix test
2022-01-21 15:00:09 -05:00
80af1048cf [Wav2Vec2ProcessorWithLM] improve multi processing (#15247)
* [Wav2Vec2ProcessorWithLM] improve multi processing

* close pool
2022-01-21 18:30:10 +01:00
4cff3fae11 Second failing test 2022-01-21 12:19:28 -05:00
f6253147df Skip failing test 2022-01-21 12:03:21 -05:00
7799b6128f [Fix doc example] TFLayoutLMForTokenClassification: missing import tf (#15268)
* fix import

* remove import torch

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-21 11:18:11 -05:00
11afb709ec [Robust Speech Challenge] Add timeline (#15274) 2022-01-21 17:12:09 +01:00
3c3cf17a49 fix link (#15278) 2022-01-21 09:52:13 -05:00
95a75a715f Specify providers explicitly in ORT session initialization (#15235)
* Specify providers explicitly in ORT session initialization

Co-authored-by: Ubuntu <wy@linux-v100.aidmrjtolptuzevavgwhrapqcd.jx.internal.cloudapp.net>
2022-01-21 15:49:29 +01:00
833635e259 Move BART + ONNX example to research_projects (#15271)
* Move BART + ONNX example to research_projects

* Add author information
2022-01-21 14:47:34 +01:00
183ce067e0 Fix (#15276)
* Fix

* make style

* Remove trailing commas

* make style
2022-01-21 08:46:15 -05:00
b4ce313e6c Prepare ONNX export for torch v1.11 (#15270)
* Prepare ONNX export for torch v1.11
2022-01-21 14:28:19 +01:00
126bddd1ba Add module_spec to new model 2022-01-21 08:12:44 -05:00
c962c2adbf Adds missing module_specs for usages of _LazyModule (#15230)
* Add missing __spec__ for transformers.models.auto

* Moves the __spec__-test to the UnitTest class

* Adds module_spec to all instances of _LazyModule

* Refactors an old test from pytest to unittest
2022-01-21 07:30:12 -05:00
6c7b68d414 [ViTMAE] Add image pretraining script (#15242)
* Add script

* Improve script

* Fix data collator

* Update README

* Add label_names argument

* Apply suggestions from code review

* Add config parameters

* Update script

* Fix bug

* Improve README

* Improve README and add test

* Fix import

* Add image_column_name
2022-01-21 12:11:08 +01:00
d43e308e7f Add Swin Transformer (#15085)
* Add all files

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Updates

* Apply suggestions from review

* Fix failing tests

* Update __init__.py

* Update configuration_swin.py

* Update auto_factory.py

* Fix pytests

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Fix tests and default checkpoint

* Fix Recursion error

* Code quality

* Remove copied from

* Update modeling_swin.py

* Code quality

* Update modeling_swin.py

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

* Fix feature extractor

* Fix code quality

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

* Update configuration_swin.py

* Update default checkpoint

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/model_doc/swin.mdx

Co-authored-by: Mishig Davaadorj <mishig.davaadorj@coloradocollege.edu>

* Update conversion script

* Reformat conversion script

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Mishig Davaadorj <mishig.davaadorj@coloradocollege.edu>
2022-01-21 12:10:41 +01:00
515ed3ad2a Fix doc examples (#15257) 2022-01-20 21:51:51 +01:00
ad7390636d Tentative workflow improvement (#15255) 2022-01-20 13:51:19 -05:00
57820456bd Fix crash when logs are empty because Keras has wiped them out of spite (#15258) 2022-01-20 18:40:48 +00:00
1fc0fa4617 Make sure to raise NotImplementedError with correct method name (#15253) 2022-01-20 10:37:35 -05:00
f00f22a3e2 Fixes tf_default_data_collator sometimes guessing the wrong dtype for labels (#15234)
* Fixes tf_default_data_collator sometimes guessing the wrong dtype for labels

* Add test for numpy scalar inputs
2022-01-20 14:26:51 +00:00
4a6a35bc65 [Fix doc example] missing import (#15240)
* fix import

* fix style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-20 08:47:24 -05:00
08b41b413a Update pipelines.mdx (#15243)
fix few spelling mistakes
2022-01-20 08:46:48 -05:00
85ea462c08 Update README.md (#15246)
Clarify OVH instruction
2022-01-20 13:40:26 +03:00
e57468b8a8 Update README.md (#15239)
Add an OVHcloud tutorial URL for the Robust Speech Challenge
2022-01-20 11:46:50 +03:00
baf1ebe9f0 Fix usage of additional kwargs in from_encoder_decoder_pretrained in encoder-decoder models (#15056)
* [EncoderDecoder] Add test for usage of extra kwargs

* [EncoderDecoder] Fix usage of extra kwargs in from pretrained

* [EncoderDecoder] apply suggested changes (passing **kwargs_encoder)

* [EncoderDecoder] create new test function and make sure it passes

Co-authored-by: jonas <jsnfly@gmx.de>
2022-01-19 23:00:33 +01:00
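A sketch of the behaviour being fixed: kwargs prefixed with `encoder_`/`decoder_` are routed to the respective sub-model (the parameter names here are ordinary BERT config options, chosen for illustration):

```python
# Sketch: prefixed kwargs reach the encoder/decoder configs (#15056).
from transformers import EncoderDecoderModel

model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased",
    "bert-base-uncased",
    encoder_hidden_dropout_prob=0.2,  # forwarded to the encoder
    decoder_attention_probs_dropout_prob=0.2,  # forwarded to the decoder
)
assert model.config.encoder.hidden_dropout_prob == 0.2
```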
3fefee9910 Make smart chunking (long files) work on ASR ctc_with_lm. (#15219)
* [WIP] Make smart chunking (long files) work on ASR ctc_with_lm.

* Slow test with functionality.

* Fixing regular test.

* fix for batch size 1

* Handling batch outside `rescale_Stride`.

- Renamed to `rescale_stride`.

* Disable equality in the test.

* Remove print.

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-19 21:04:26 +01:00
80f7296091 Update Trainer code example (#15070)
* Update code example

* Fix code quality

* Add comment
2022-01-19 20:15:12 +01:00
ac227093e4 Add ViLT (#14895)
* First commit

* Add conversion script

* Make conversion script work for base model

* More improvements

* Update conversion script, works for vqa

* Add indexing argument to meshgrid

* Make conversion script work for ViltForPreTraining

* Add ViltForPreTraining to docs

* Fix device issue

* Add processor

* Add MinMaxResize to feature extractor

* Implement call method of ViltProcessor

* Fix tests

* Add integration test

* Add loss calculation for VQA

* Improve tests

* Improve some more tests

* Debug tests

* Small improvements

* Add support for attention_mask

* Remove mask_it

* Add pixel_mask

* Add tests for ViltFeatureExtractor

* Improve tests

* Add ViltForNaturalLanguageVisualReasoning

* Add ViltForNaturalLanguageVisualReasoning to conversion script

* Minor fixes

* Add support for image_embeds, update docstrings to markdown

* Update docs to markdown

* Improve conversion script

* Rename ViltForPreTraining to ViltForMaskedLM

* Improve conversion script

* Convert docstrings to markdown

* Fix code example of retrieval model

* Properly convert masked language model

* Add integration test for nlvr

* Fix code quality

* Apply suggestions from code review

* Add copied from statements

* Fix pretrained_config_archive_map

* Fix docs

* Add model to README

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Apply more suggestions from code review

* Make code more readable

* Add ViltForNaturalLanguageVisualReasoning to the tests

* Rename ViltForVisualQuestionAnswering to ViltForQuestionAnswering

* Replace pixel_values_2 by single tensor

* Add hidden_states and attentions

* Fix one more test

* Fix all tests

* Update year

* Fix rebase issues

* Fix another rebase issue

* Remove ViltForPreTraining from auto mapping

* Rename ViltForImageRetrievalTextRetrieval to ViltForImageAndTextRetrieval

* Make it possible to use BertTokenizerFast in the processor

* Use BertTokenizerFast by default

* Rename ViltForNaturalLanguageVisualReasoning, define custom model output

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-19 19:51:59 +01:00
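A short usage sketch for the new model; the checkpoint name is the fine-tuned VQA checkpoint on the Hub:

```python
# Sketch: visual question answering with the newly added ViLT.
import requests
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(image, "How many cats are there?", return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```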
691878ee2f Update README.md (#15233) 2022-01-19 18:03:17 +01:00
f4b7420dfe Fix checkpoint for ViT Config 2022-01-19 11:22:54 -05:00
6a3c883c8b Fix PR number (#15231)
* Fix PR number

* Fix PR number
2022-01-19 11:00:16 -05:00
f778edb739 Fix typo in BERT tokenization file (#15228)
* Fix typo

* Fix copies
2022-01-19 10:16:19 -05:00
2a5a384970 fix speech event readme (#15227) 2022-01-19 15:30:03 +01:00
842298f84f [ViTMAE] Various fixes (#15221)
* Add MAE to AutoFeatureExtractor

* Add link to notebook

* Fix relative paths
2022-01-19 15:27:57 +01:00
6d92c429c7 Update README.md (#15226) 2022-01-19 15:23:00 +01:00
19c217b4b7 Update README.md 2022-01-19 15:21:03 +01:00
5439cda7f0 Update README.md 2022-01-19 15:19:57 +01:00
841d979190 Add FastTokenizer to REALM (#15211)
* Remove BertTokenizer abstraction

* Add FastTokenizer to REALM

* Fix config archive map

* Fix copies

* Update realm.mdx

* Apply suggestions from code review
2022-01-19 15:19:36 +01:00
021b52e7a8 fix name 'TFFunnelTokenizer' is not defined (#15225)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-19 09:06:00 -05:00
653379c094 Build dev documentation (#15210)
* Wrap up

* Remove secret

* Fix path

* Typo

Revert image switch

* Specific token for comments

* Cleaner comments

* Correct PR number

* Explicit master install

* Force uninstall
2022-01-19 08:47:34 -05:00
2708bfa127 Rename compute_loss in TF models (#15207)
* Rename compute_loss to hf_compute_loss to avoid conflicts with the new Keras method

* make style

* Adding deprecation warning to `compute_loss`

* Fix sneaky reference to compute_loss

* Replace logger.warning with warnings.warn

* Clarifying warning and deprecation timeline
2022-01-19 13:29:07 +00:00
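A sketch of the renamed helper; per the commits above, `compute_loss` stays around as a deprecated alias for now:

```python
# Sketch: the Transformers loss helper on TF models is now hf_compute_loss.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

inputs = tokenizer("hello world", return_tensors="tf")
logits = model(**inputs).logits
loss = model.hf_compute_loss(tf.constant([1]), logits)  # was compute_loss(...)
```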
d1f5ca1afd [FLAX] glue training example refactor (#13815)
* refactor run_flax_glue.py

* updated readme

* rm unused import and args typo fix

* refactor

* make consistent arg name across task

* has_tensorboard check

* argparse -> argument dataclasses

* refactor according to review

* fix
2022-01-19 12:04:51 +01:00
db3503949d Finish conversion of REALM doc to MDX 2022-01-18 18:00:30 -05:00
fe78fe98ca Enable tqdm toggling (#15167)
* feature: enable tqdm toggle

* test: add tqdm unit test

* style: run linter

* Update tests/test_tqdm_utils.py

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* refactor: use tiny model, run linter

* docs: add tqdm to logging

* docs: add tqdm reference to `http_get`

* style: run linter

* Update docs/source/main_classes/logging.mdx

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* fix: use `AutoConfig` for framework agnostic testing

* chore: mv tqdm test to `test_logging.py`

* feature: implement enable/disable functions

* docs: mv docstring to comment

* chore: mv tqdm functions to `logging.py`

* docs: update docs to reference `enable/disable` funcs

* test: update test to use `enable/disable` func

* chore: update function reference in comment

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
2022-01-18 17:52:35 -05:00
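The enable/disable functions added here live in the logging utilities:

```python
# Toggle tqdm progress bars globally (added in #15167).
from transformers import AutoModel
from transformers.utils import logging

logging.disable_progress_bar()  # e.g. silence download bars
model = AutoModel.from_pretrained("bert-base-uncased")
logging.enable_progress_bar()   # restore them
```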
2c335037bd Trigger doc build 2022-01-18 17:46:29 -05:00
e118e085ea [Robust Speech Event] Add guides (#15155)
* up

* improve readme

* up

* up

* more info

* up

* up

* Apply suggestions from code review

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* add more stuff for eval

* update

* up

* Update README.md

* Update examples/research_projects/xls_r/README.md

Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>

* apply omar's suggestions

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
Co-authored-by: Omar Sanseviero <osanseviero@users.noreply.github.com>
2022-01-18 18:44:48 +01:00
1a354d53c4 Revert previous change - that was meant to be in a branch! 2022-01-18 17:34:26 +00:00
2085f20901 Fix a sneaky reference to compute_loss in the tests 2022-01-18 17:33:38 +00:00
979ca24e39 [Fix doc example] Wrong checkpoint name (#15079)
* fix doc example - MarianForCausalLM example

* try to keep copies

* fix copies

* fix more similar doc examples

* fix more

* fix style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-18 10:43:21 -05:00
7b3d4df47a fix: #14486 do not use BertPooler in DPR (#15068)
* fix: #14486 do not use BertPooler in DPR

* fix tf dpr as well

* finish

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-18 16:36:12 +01:00
74bec9865c Add MAE (#15120)
* First draft

* More improvements

* More improvements

* More improvements

* Fix embeddings

* Add conversion script

* Finish conversion script

* More improvements

* Fix forward pass

* Remove print statements

* Add weights initialization

* Add initialization of decoder weights

* Add support for other models in the conversion script

* Fix patch_size for huge model

* Fix most of the tests

* Fix integration test

* Fix docs

* Fix archive_list

* Apply suggestions from code review

* Improve documentation

* Apply more suggestions

* Skip some tests due to non-deterministic behaviour

* Fix test_initialization

* Remove unneccessary initialization of nn.Embedding

* Improve docs

* Fix dummies

* Remove ViTMAEFeatureExtractor from docs

* Add model to README and table of contents

* Delete inference file
2022-01-18 16:21:32 +01:00
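A usage sketch; the checkpoint name is the base MAE checkpoint on the Hub:

```python
# Sketch: masked-autoencoder pretraining forward pass with ViT-MAE.
import requests
from PIL import Image
from transformers import AutoFeatureExtractor, ViTMAEForPreTraining

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/vit-mae-base")
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = feature_extractor(images=image, return_tensors="pt")
outputs = model(**inputs)  # reconstruction loss over the masked patches
print(outputs.loss)
```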
2ae3be5442 [MBartTokenizer] remove dep on xlm-roberta tokenizer (#15201) 2022-01-18 16:02:56 +01:00
84c60a7b50 Ignore empty subfolders when identifying submodules (#15204)
* Ignore empty subfolders when identifying submodules

* Update utils/check_inits.py
2022-01-18 09:48:46 -05:00
6f0a9b41ef Remove dependency to quiet Dependabot (#15205) 2022-01-18 09:44:35 -05:00
497346d07e [ASR pipeline] correct with lm pipeline (#15200)
* [ASR pipeline] correct with lm pipeline

* improve error
2022-01-18 15:36:22 +01:00
1144d336b6 Copies and docstring styling (#15202)
* Style docstrings when making/checking copies

* Polish
2022-01-18 09:16:55 -05:00
531336bbfd Fix deprecation warnings for int div (#15180)
* Fix deprecation warnings for int div

Co-authored-by: mgoldey <matthew.goldey@gmail.com>

* Fix import

* ensure that tensor output is python scalar

* make backward compatible

* make code more readable

* adapt test functions

Co-authored-by: mgoldey <matthew.goldey@gmail.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-18 07:28:53 -05:00
f6d3fee855 Error when code examples are improperly closed (#15186) 2022-01-18 07:27:34 -05:00
22454ae492 Add REALM (#13292)
* REALM initial commit

* Retriever OK (Update new_gelu).

* Encoder prediction score OK

* Encoder pretrained model OK

* Update retriever comments

* Update docs, tests, and imports

* Prune unused models

* Make embedder as a module `RealmEmbedder`

* Add RealmRetrieverOutput

* Update tokenization

* Pass all tests in test_modeling_realm.py

* Prune RealmModel

* Update docs

* Add training test.

* Remove completed TODO

* Style & Quality

* Prune `RealmModel`

* Fixup

* Changes:
1. Remove RealmTokenizerFast
2. Update docstrings
3. Add a method to RealmTokenizer to handle candidates tokenization.

* Fix up

* Style

* Add tokenization tests

* Update `from_pretrained` tests

* Apply suggestions

* Style & Quality

* Copy BERT model

* Fix comment to avoid docstring copying

* Make RealmBertModel private

* Fix bug

* Style

* Basic QA

* Save

* Complete reader logits

* Add searcher

* Complete searcher & reader

* Move block records init to constructor

* Fix training bug

* Add some outputs to RealmReader

* Add finetuned checkpoint variable names parsing

* Fix bug

* Update REALM config

* Add RealmForOpenQA

* Update convert_tfrecord logits

* Fix bugs

* Complete imports

* Update docs

* Update naming

* Add brute-force searcher

* Pass realm model tests

* Style

* Exclude RealmReader from common tests

* Fix

* Fix

* convert docs

* up

* up

* more make style

* up

* upload

* up

* Fix

* Update src/transformers/__init__.py

* adapt testing

* change modeling code

* fix test

* up

* up

* up

* correct more

* make retriever work

* update

* make style

* finish main structure

* Resolve merge conflict

* Make everything work

* Style

* Fixup

* Fixup

* Update training test

* fix retriever

* remove hardcoded path

* Fix

* Fix modeling test

* Update model links

* Initial retrieval test

* Fix modeling test

* Complete retrieval tests

* Fix

* style

* Fix tests

* Fix docstring example

* Minor fix of retrieval test

* Update license headers and docs

* Apply suggestions from code review

* Style

* Apply suggestions from code review

* Add an example to RealmEmbedder

* Fix

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-18 07:24:13 -05:00
b25067d807 [Fix doc example] TFRagModel (#15187)
* fix doc example - NameError: name 'PATH' is not defined

* fix name 'TFRagModel' is not defined

* correct TFRagRagSequenceForGeneration

* fix name 'tf' is not defined

* fix style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-18 07:16:30 -05:00
dea563c943 `is_ctc` needs to be updated to `self.type == "ctc"` (#15194)
* `is_ctc` needs to be updated to `self.type == "ctc"`.

* Adding fast test for this functionality.
2022-01-18 12:20:10 +01:00
32090c729f [Fix doc example] UniSpeechSatForPreTraining (#15152)
* fix doc example - cannot import name 'UniSpeechSatFeatureEncoder'

* fix ckpt name

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-18 00:34:05 +01:00
6f8e644f09 Mark bad tokenizers version (#15188) 2022-01-17 15:20:58 -05:00
edd3fce2f7 [doc] new MoE paper (#15184)
add new paper
2022-01-17 09:10:51 -08:00
9a2dabae70 Fix dtype issue in TF BART (#15178) 2022-01-17 14:02:55 +00:00
0167edc854 Added forward pass of test_inference_image_classification_head with torch.no_grad() (#14777) 2022-01-17 07:22:41 -05:00
7a787c68c6 [Speech models] Disable non-existing chunking in tests (#15163) 2022-01-16 17:15:19 +01:00
669e3c50c9 [doc] performance: Efficient Software Prebuilds (#15147)
* Efficient Software Prebuilds

* improve
2022-01-14 18:25:20 -08:00
ebc4edfe7a update from keras2onnx to tf2onnx (#15162) 2022-01-14 17:35:39 +00:00
1b730c3d11 Better dummies (#15148)
* Better dummies

* See if this fixes the issue

* Fix quality

* Style

* Add doc for DummyObject
2022-01-14 10:59:41 -05:00
b212ff9f49 Fixing flaky test (hopefully). (#15154)
* Fixing flaky test (hopefully).

* tf compliant.
2022-01-14 16:47:03 +01:00
7d9a33fb5c TF Bert inference - support np.ndarray optional arguments (#15074)
* TF Bert inference - support np.ndarray optional arguments

* apply np input tests to all TF architectures
2022-01-14 15:19:04 +00:00
4663c609b9 Add "open in hf spaces" gradio button issue #73 (#15106)
* update XLMProphetNet link

* update DPR link

* change prophetnet link

* change link MBART

* change link GPT

* update gpt2 link

* ctrl update link

* update Transformer-XL link

* Update Reformer link

* update xlnet link

* bert update link

* udpate albert link

* roberta update link

* update distilbert link

* update convbert link

* update XLM link

* xlm roberta update link

* update Flaubert link

* update electra link

* update funnel transformer and longformer

* bart update link

* pegasus update link

* udpate marianmt link

* t5 update link

* mt5 update link
2022-01-14 10:12:30 -05:00
735d2bb69b Update test_configuration_common.py (#15160) 2022-01-14 08:54:01 -05:00
51d7ebf260 fix BertTokenizerFast tokenize_chinese_chars arg (#15158)
* add new test

* fix in init

* more relevant test
2022-01-14 14:22:03 +01:00
4aa16fce6c fix doc example - object has no attribute 'lm_logits' (#15143)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-14 13:42:13 +01:00
7cbf8429d9 Make sure all submodules are properly registered (#15144)
* Make sure all submodules are properly registered

* Try to fix tests

* Fix tests
2022-01-14 07:37:51 -05:00
c4f7eb124b add TF glu activation function (#15146) 2022-01-14 10:42:08 +00:00
5f3c57fc84 Check the repo consistency in model templates test (#15141)
* Check the repo consistency in model templates test

* Fix doc template

* Fix docstrings

* Fix last docstring
2022-01-14 04:52:38 -05:00
96881729ce Remove assert on optional arg 2022-01-13 17:34:41 -05:00
1eb40338ac [deepspeed tests] fix summarization (#15149) 2022-01-13 13:48:51 -08:00
6e058e84fd Enable AMP for xla:gpu device in trainer class (#15022)
* Multiple fixes of trainer class with XLA GPU

* Make fp16 valid for xla:gpu

* Add mark_step in should_log to reduce compilation overhead
2022-01-13 15:21:00 -05:00
3fc221d077 Update model_sharing.mdx (#15142)
Fix typo
2022-01-13 12:26:02 -05:00
7b83feb50a Deprecates AdamW and adds --optim (#14744)
* Add AdamW deprecation warning

* Add --optim to Trainer

* Update src/transformers/optimization.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/optimization.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/optimization.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/optimization.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

* fix style

* fix

* Regroup adamws together

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Change --adafactor to --optim adafactor

* Use Enum for optimizer values

* fixup! Change --adafactor to --optim adafactor

* fixup! Change --adafactor to --optim adafactor

* fixup! Change --adafactor to --optim adafactor

* fixup! Use Enum for optimizer values

* Improved documentation for --adafactor

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Add mention of no_deprecation_warning

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Rename OptimizerOptions to OptimizerNames

* Use choices for --optim

* Move optimizer selection code to a function and add a unit test

* Change optimizer names

* Rename method

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Rename method

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Remove TODO comment

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Rename variable

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Rename variable

Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>

* Rename function

* Rename variable

* Parameterize the tests for supported optimizers

* Refactor

* Attempt to make tests pass on CircleCI

* Add a test with apex

* rework to add apex to parameterized; add actual train test

* fix import when torch is not available

* fix optim_test_params when torch is not available

* fix optim_test_params when torch is not available

* re-org

* small re-org

* fix test_fused_adam_no_apex

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Remove .value from OptimizerNames

* Rename optimizer strings s|--adam_|--adamw_|

* Also rename Enum options

* small fix

* Fix instantiation of OptimizerNames. Remove redundant test

* Use ExplicitEnum instead of Enum

* Add unit test with string optimizer

* Change optimizer default to string value

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Stas Bekman <stas00@users.noreply.github.com>
Co-authored-by: Stas Bekman <stas@stason.org>
2022-01-13 08:14:51 -08:00
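A sketch of the new flag; the optimizer strings are the renamed values from the commits above ("adamw_hf" being the default):

```python
# Sketch: selecting the optimizer via the new string-valued --optim (#14744).
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",
    optim="adamw_torch",  # or "adamw_hf", "adamw_apex_fused", "adafactor"
)

# The standalone AdamW class now emits a deprecation warning unless silenced:
# AdamW(model.parameters(), lr=5e-5, no_deprecation_warning=True)
```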
762416ffa8 [examples/flax/language-modeling] set loglevel (#15129) 2022-01-13 15:17:28 +01:00
74837171ab fix doc example - AssertionError: has to be configured as a decoder. (#15124)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-13 06:45:30 -05:00
6950ccec1b doc-builder -> doc-build (#15134)
* Updated script

* Commit everything

* Ready for review!

* Update .github/workflows/build_documentation.yml

Co-authored-by: Julien Chaumond <julien@huggingface.co>

Co-authored-by: Julien Chaumond <julien@huggingface.co>
2022-01-13 06:02:24 -05:00
9a94bb8e21 mBART support for run_summarization.py (#15125)
* Update run_summarization.py

* Fixed languages and added missing code

* fixed obj, docs, removed source_lang and target_lang

* make style, run_summarization.py reformatted
2022-01-12 16:39:33 -05:00
97f3beed36 Add with torch.no_grad() to DistilBERT integration test forward pass (#14979)
* refactor: wrap forward pass around no_grad context

* Update tests/test_modeling_distilbert.py

* fix: rm `no_grad` from non-integration tests

* chore: rm whitespace change
2022-01-12 10:42:39 -05:00
021f2ea987 Add ONNX configuration classes to docs (#15121)
* Add ONNX classes to main package

* Remove permalinks from ONNX guide

* Fix ToC entry

* Revert "Add ONNX classes to main package"

This reverts commit eb794a5b00d66b0b4eab234987301676d8357630.

* Add ONNX classes to main doc

* Fix syntax highlighting in doc

* Fix text

* Add FeaturesManager to doc

* Use paths to reference ONNX classes

* Add FeaturesManager to init

* Add missing ONNX paths
2022-01-12 16:33:32 +01:00
c425d60bb9 Fix link to deepspeed config 2022-01-12 09:32:53 -05:00
6820904454 Fix #14357 (#15001)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-12 14:29:09 +00:00
aa0135f2e0 fix: switch from slow to generic tokenizer class (#15122) 2022-01-12 09:12:43 -05:00
27b819b0e3 use block_size instead of max_seq_length in tf run_clm example (#15036)
* use block_size instead of max_seq_length

* fixup

* remove pad_to_block_size

Co-authored-by: Russell Klopfer <russell@kloper.us>
2022-01-12 08:57:00 -05:00
68cc4ccde2 Pipeline ASR with LM. (#15071)
* Pipeline ASR with LM.

* Revamped into `self.decoder`.

* Fixing.

* 2nd fix.

* Update src/transformers/pipelines/__init__.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Fixing.

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-12 09:28:19 +01:00
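A sketch of the new behaviour, assuming pyctcdecode and kenlm are installed and using a with-LM checkpoint from the Hub; the audio path is a placeholder:

```python
# Sketch: the ASR pipeline now uses the checkpoint's LM decoder (#15071).
from transformers import pipeline

asr = pipeline(
    "automatic-speech-recognition",
    model="patrickvonplaten/wav2vec2-base-100h-with-lm",
)
print(asr("sample.flac"))  # {"text": "..."}
```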
1a00863e95 Fix typo in doc template 2022-01-11 15:22:15 -05:00
44eaa2b303 Update TF test_step to match train_step (#15111)
* Update TF test_step to match train_step

* Update compile() warning to be clearer about what to pass
2022-01-11 19:05:39 +00:00
57b980a613 Fix saving FlaubertTokenizer configs (#14991)
All tokenizer-specific config properties must be passed to the base
class (XLMTokenizer) in order to be saved. This was not the case for
the do_lowercase config, so it was not saved by the save_pretrained()
method, and saving and reloading the tokenizer changed its behaviour.

This commit fixes it.
2022-01-11 19:19:33 +01:00
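An illustrative pattern only (the class and option names are generic, not the actual patch): an option survives save_pretrained()/reload only if it is forwarded to the base class:

```python
# Illustrative sketch of the bug class fixed here, with generic names.
from transformers import XLMTokenizer

class MyTokenizer(XLMTokenizer):
    def __init__(self, do_lowercase=False, **kwargs):
        # Forwarding the option records it in the saved tokenizer config;
        # keeping it only on `self` would lose it on save/reload.
        super().__init__(do_lowercase=do_lowercase, **kwargs)
        self.do_lowercase = do_lowercase
```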
16f0b7d72c Update ONNX docs (#14904)
* Remove docs for deprecated ONNX export

* Tidy up the CLI help messages

* Revamp ONNX docs

* Update auto-config table

* Use DistilBERT as example for consistency

* Wrap up first pass at ONNX docs

* Fix table check

* Add tweaks and introduction

* Add cross-ref

* Fix missing import

* Fix style

* Add permalinks to ONNX configs

* Clarify role of OrderedDict

* Update docs/source/serialization.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Add doctest syntax to code blocks

* Remove permalinks

* Revert "Remove permalinks"

This reverts commit 099701daf0db27823457867938efdb2d4f22a7c1.

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-11 18:06:05 +01:00
704d1feca1 Doc styler tip (#15105)
* Add new lines before/after tips

* Check end of lines
2022-01-11 11:45:39 -05:00
68d925195e Merge branch 'master' into master 2022-01-11 11:11:29 -05:00
7480ded658 Fix failing test (#15104) 2022-01-11 15:57:34 +01:00
28e091430e Add Nystromformer (#14659)
* Initial commit

* Config and modelling changes

Added Nystromformer-specific attributes to config and removed all decoder functionality from modelling.

* Modelling and test changes

Added Nystrom approximation and removed decoder tests.

* Code quality fixes

* Modeling changes and conversion script

Initial commits to conversion script, modeling changes.

* Minor modeling changes and conversion script

* Modeling changes

* Correct modeling, add tests and documentation

* Code refactor

* Remove tokenizers

* Code refactor

* Update __init__.py

* Fix bugs

* Update src/transformers/__init__.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/__init__.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/__init__.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/model_doc/nystromformer.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/configuration_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/configuration_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/configuration_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/configuration_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/convert_nystromformer_original_pytorch_checkpoint_to_pytorch.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/nystromformer/configuration_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update modeling and test_modeling

* Code refactor

* .rst to .mdx

* doc changes

* Doc changes

* Update modeling_nystromformer.py

* Doc changes

* Fix copies

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update configuration_nystromformer.py

* Fix copies

* Update tests/test_modeling_nystromformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update test_modeling_nystromformer.py

* Apply suggestions from code review

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Fix code style

* Update modeling_nystromformer.py

* Update modeling_nystromformer.py

* Fix code style

* Reformat modeling file

* Update modeling_nystromformer.py

* Modify NystromformerForMultipleChoice

* Fix code quality

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Code style changes and torch.no_grad()

* make style

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-11 14:25:49 +01:00
444ea95a80 Print out durations of all scheduled tests (#15102) 2022-01-11 08:15:59 -05:00
285131bfb4 change metric_key_prefix in seq2seq_trainer.py (#15099)
It fixes the problem that metric_key_prefix differed from the one used in Trainer.
2022-01-11 07:44:29 -05:00
c4fa908fa9 Adds IBERT to models exportable with ONNX (#14868)
* Add IBertOnnxConfig and tests

* add all the supported features for IBERT and remove outputs in IbertOnnxConfig

* use OnnxConfig

* fix codestyle

* remove serialization.rst

* codestyle
2022-01-11 12:17:08 +01:00
efb35a4107 [Wav2Vec2ProcessorWithLM] improve decoder download (#15040) 2022-01-11 05:59:38 -05:00
6ea6266625 Fix cookiecutter (#15100) 2022-01-11 05:57:26 -05:00
68810aa26c fix doc example - TypeError: forward() got an unexpected keyword argument 'input_ids' (#15092)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-11 04:04:23 -05:00
ca76618d6b Take gradient accumulation into account when defining samplers (#15095)
* Take gradient accumulation into account when defining samplers

* style
2022-01-11 03:16:39 -05:00
9dc8fb2fc7 Add test to check reported training loss (#15096)
* Add test

* Add tests for the reported train loss
2022-01-11 03:14:11 -05:00
5cd7086fdb XLM-ProphetNet Spaces badge 2022-01-11 00:11:31 -05:00
4e3208662e DPR Spaces badge 2022-01-10 13:50:40 -05:00
ac2c06d492 ProphetNet spaces badge 2022-01-10 13:43:34 -05:00
bf0201e184 MBART spaces badge 2022-01-10 13:37:17 -05:00
b67fd797be Add TFVisionEncoderDecoderModel (#14148)
* Start the work on TFVisionEncoderDecoderModel

* Expose TFVisionEncoderDecoderModel

* fix import

* Add modeling_tf_vision_encoder_decoder to _ignore_modules in get_model_modules()

* reorder

* Apply the fix for checkpoint loading as in #14016

* remove attention_mask + fix VISION_DUMMY_INPUTS

* A minimal change to make TF generate() work for vision models as encoder in encoder-decoder setting

* fix wrong condition: shape_list(input_ids) == 2

* add tests

* use personal TFViTModel checkpoint (for now)

* Add equivalence tests + projection layer

* style

* make sure projection layer can run

* Add examples

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Clean comments (need to work on TODOs for PyTorch models)

* Remove TF -> PT in check_pt_tf_equivalence for TFVisionEncoderDecoderModel

* fixes

* Revert changes in PT code.

* Update tests/test_modeling_tf_vision_encoder_decoder.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Add test_inference_coco_en for TF test

* fix quality

* fix name

* build doc

* add main_input_name

* Fix ckpt name in test

* fix diff between master and this PR

* fix doc

* fix style and quality

* fix missing doc

* fix labels handling

* Delete auto.rst

* Add the changes done in #14016

* fix prefix

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* make style

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-10 13:30:14 -05:00
c9504b2f50 MT5 Spaces badge 2022-01-10 12:57:08 -05:00
daec528ca9 T5 Spaces badge 2022-01-10 12:51:39 -05:00
0554e4d5c5 MarianMT Spaces badge 2022-01-10 12:47:12 -05:00
7ec6aad23d Pegasus Spaces badge 2022-01-10 12:39:22 -05:00
03f8b9c9e0 BART Spaces badge 2022-01-10 12:33:59 -05:00
37bc0b4e53 [performance doc] Power and Cooling (#14935)
* [performance doc] Power and Cooling

* more docs

* Update docs/source/performance.mdx

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* reword

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-01-10 09:21:04 -08:00
20f169b523 Longformer Spaces badge 2022-01-10 12:14:18 -05:00
3e9fdcf019 [DOC] fix doc examples for bart-like models (#15093)
* fix doc examples

* remove double colons
2022-01-10 18:13:28 +01:00
4fbc924d0a Funnel Transformer spaces badge 2022-01-10 12:06:05 -05:00
61d18ae035 Happy New Year! (#15094) 2022-01-10 12:05:57 -05:00
222c09a635 ELECTRA Spaces badge 2022-01-10 11:53:23 -05:00
31838d3e11 [doc] normalize HF Transformers string (#15023) 2022-01-10 08:44:33 -08:00
84f360e862 FlauBERT spaces badge 2022-01-10 11:41:10 -05:00
9f33116898 XLM-Roberta Spaces badge 2022-01-10 10:54:18 -05:00
20fa9eb035 XLM Spaces badge 2022-01-10 10:48:06 -05:00
16b6df6fca ConvBERT spaces badge 2022-01-10 10:33:03 -05:00
f21bc4215a Use tqdm.auto in Pipeline docs (#14920)
It's better for e.g. notebooks.
2022-01-10 10:28:34 -05:00
f012c00ada Model summary horizontal banners (#15058) 2022-01-10 10:06:14 -05:00
af9cb94974 Fix style 2022-01-10 09:40:20 -05:00
533624c5a9 fix doc example - AttributeError: type object 'RagModel' has no attribute 'from_question_encoder_generator_pretrained' (#15076)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-10 09:28:39 -05:00
b2c477fc6d support the trocr small models (#14893)
* support the trocr small models

* resolve conflict

* Update docs/source/model_doc/trocr.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/model_doc/trocr.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update docs/source/model_doc/trocr.mdx

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/trocr/processing_trocr.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/trocr/processing_trocr.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/trocr/processing_trocr.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* Update src/transformers/models/trocr/processing_trocr.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* fix unexpected indent in processing_trocr.py

* Update src/transformers/models/trocr/processing_trocr.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* update the docstring of processing_trocr

* remove extra space

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2022-01-10 09:28:03 -05:00
42d57549b8 Change assignee for tokenizers (#15088) 2022-01-10 09:22:48 -05:00
a54961c5f7 Make OpenAIGPTTokenizer work with SpaCy 2.x and 3.x (#15019)
* Make OpenAIGPTTokenizer work with SpaCy 3.x

SpaCy 3.x introduced an API change to creating the tokenizer that
breaks OpenAIGPTTokenizer. The old API for creating the tokenizer in
SpaCy 2.x no longer works under SpaCy 3.x, but the new API for creating
the tokenizer in SpaCy 3.x DOES work under SpaCy 2.x. Switching to the
new API should allow OpenAIGPTTokenizer to work under both SpaCy 2.x and
SpaCy 3.x versions.

* Add is_spacy_available and is_ftfy_available methods to file utils

* Add spacy and ftfy unittest decorator to testing utils

* Add tests for OpenAIGPTTokenizer that require spacy and ftfy

* Modify CircleCI config to run tests that require spacy and ftfy

* Remove unneeded unittest decorators and reuse test code

* Run make fixup
2022-01-10 07:53:20 -05:00
9fbf7c87c3 Update check_repo.py (#15014)
added new line
2022-01-10 06:55:43 -05:00
0a03a86813 fix model table cell text alignment (#14999)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-10 06:44:11 -05:00
d72343d2b8 [Wav2Vec2 Speech Event] Add speech event v2 (#15083)
* up

* up

* up

* up

* up

* up

* improve

* up

* up

* Update src/transformers/trainer.py

* up

* up

* up
2022-01-10 10:46:21 +01:00
768e6c1449 Fix convert for newer megatron-lm bert model (#14082)
* Fix convert for newer megatron-lm models

* Save megatron-bert config in a proper way

* Fix code style
2022-01-08 11:33:55 -08:00
623b4f7c63 [VisionTextDualEncoder] Add token_type_ids param (#15073)
* fix doc example - TypeError: get_text_features() got an unexpected keyword argument 'token_type_ids'

* add token_type_ids param

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-07 20:02:49 +01:00
5be1242ac0 Merge branch 'huggingface:master' into master 2022-01-07 11:48:22 -05:00
484e7a441f Distilbert spaces badge 2022-01-07 11:47:56 -05:00
ac224bb079 [Fix doc examples] Add missing from_pretrained (#15044)
* fix doc example - ValueError: Parameter config should be an instance of class `PretrainedConfig`

* Update src/transformers/models/segformer/modeling_segformer.py

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* update

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2022-01-07 16:55:59 +01:00
f18c6fa94c Resubmit changes after rebase to master (#14982) 2022-01-07 08:34:12 +01:00
1d71227295 Roberta spaces badge 2022-01-06 18:50:19 -05:00
e36a83d3a3 Merge branch 'huggingface:master' into master 2022-01-06 18:44:59 -05:00
cac877425c ALBERT spaces badge 2022-01-06 13:01:23 -05:00
794441c379 BERT spaces badge 2022-01-06 12:22:09 -05:00
f872f18dca XLNet spaces badge 2022-01-06 12:09:50 -05:00
8d187e7feb Reformer Spaces badge 2022-01-06 11:59:21 -05:00
cc406da4de [VisionTextDualEncoder] Fix doc example
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-06 17:59:06 +01:00
59fb636948 Transformer-XL badge 2022-01-06 11:47:41 -05:00
25b8b8a6f2 Merge branch 'huggingface:master' into master 2022-01-06 11:42:14 -05:00
b67f345d00 Update run_speech_recognition_seq2seq.py (#14967) 2022-01-06 19:26:45 +03:00
f71fb5c36e Add 'with torch.no_grad()' to BertGeneration integration test forward passes (#14963) 2022-01-06 10:39:13 -05:00
d2183a46fb Remove old asserts. (#15012) 2022-01-06 09:45:41 -05:00
83c552d390 Add detectron2 to Github actions (#15053) 2022-01-06 08:53:58 -05:00
5ab87cd4da wrapped forward passes in torch.no_grad() (#15037) 2022-01-06 08:48:49 -05:00
5a06118b39 Enabling TF on image-classification pipeline. (#15030) 2022-01-06 14:16:00 +01:00
9f89fa02ed Add Flax image captioning example (#14864)
* add image captioning example

* update README

* fix style & quality

* simplify

* apply review suggestions

* Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Apply suggestions from code review

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* Apply review suggestions

* add comments about using np instead jax array

* remove unused lines

* add model creation script

* only support from_pretrained

* fix style

* fix

* not use cache_dir when creating model

* fix tokenizer creation

* update README

* fix quality

* apply suggestion

* simplify some blocks

* Update examples/flax/image-captioning/README.md


* Update examples/flax/image-captioning/run_image_captioning_flax.py

Co-authored-by: Suraj Patil <surajp815@gmail.com>

* apply suggestion

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Suraj Patil <surajp815@gmail.com>
2022-01-06 14:00:54 +01:00
2e9af29494 [CLIP] Fix TF test (#15042) 2022-01-05 16:58:42 +01:00
443fdaf29f [SpeechEncoderDecoder] Fix from pretrained (#15043) 2022-01-05 16:54:39 +01:00
ae929dcbbd [CLIP] Fix PT test (#15041) 2022-01-05 14:21:04 +01:00
65cb94ff77 Adding QoL for batch_size arg (like others enabled everywhere). (#15027)
* Adding QoL for `batch_size` arg (like others enabled everywhere).

* Typo.
2022-01-05 12:16:23 +01:00
e34dd055e9 Fix doc example: mask_time_indices (numpy) has no attribute 'to' (#15033)
* fix doc example - AttributeError: 'numpy.ndarray' object has no attribute 'to'

* fix more

* Apply suggestions from code review

* Update src/transformers/models/unispeech/modeling_unispeech.py

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-05 11:34:08 +01:00
927f654427 [megatron convert] PYTHONPATH requirements (#14956)
* [megatron convert] PYTHONPATH requirements

* more info
2022-01-05 04:09:52 -05:00
2380136722 add spaces badges 2022-01-04 16:13:57 -05:00
857ab55c01 [doc] Update parallelism.mdx (#15018)
* Update parallelism.mdx

* Update parallelism.mdx
2022-01-04 09:58:27 -08:00
19d37c2dd3 Hotfix chunk_length_s instead of _ms. (#15029)
* Hotfix `chunk_length_s` instead of `_ms`.

* Adding fix of `pad_token`, which should be the last/previous token, for proper CTC decoding

* Fixing ChunkPipeline unwrapping.

* Adding a PackIterator specific test.
2022-01-04 14:07:44 +01:00
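A sketch of the corrected argument, in seconds rather than milliseconds; the audio path is a placeholder:

```python
# Sketch: chunked ASR on long files with the (renamed) chunk_length_s arg.
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
# 10 s chunks, with (left, right) striding at the chunk boundaries
result = asr("long_audio.flac", chunk_length_s=10, stride_length_s=(4, 2))
print(result["text"])
```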
21aecc0971 Add Flax RoFormer (#15005)
* Add FlaxRoFormer

* Clean code + make quality

* Fix output pooling for FlaxRoFormerForMultipleChoiceModule

* Apply suggestions from code review

* add flax model to repos

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2022-01-04 13:23:10 +01:00
9e1775dd23 Fix a little typo (#15002) 2022-01-04 12:59:47 +01:00
774ed4a027 Fix Code block (#14983) 2022-01-04 12:59:20 +01:00
f2ab21833f Update parallelism.mdx (#15013)
* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx

* Update parallelism.mdx
2022-01-03 11:49:27 -08:00
dbac8899fe [Tests] Correct Wav2Vec2 & WavLM tests (#15015)
* up

* up

* up
2022-01-03 20:19:04 +01:00
0b4c3a1a53 fix missing import (#15016)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2022-01-03 19:11:47 +01:00
38f95d1846 Large audio chunking for the existing ASR pipeline (#14896)
* Naive ASR chunking

* Fixing batching for ASR.

Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
2022-01-03 16:54:17 +01:00
d33dc7966a Improve truncation_side (#14947)
* Enabling `truncation_side` for Slow and Fast tokenizer.

Co-Authored-by: Niels Rogge <48327001+NielsRogge@users.noreply.github.com>

* Disable failing tests.

* Layout xlm.

* assert -> assertEqual.

Co-authored-by: Niels Rogge <48327001+NielsRogge@users.noreply.github.com>
2022-01-03 16:18:39 +01:00
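A sketch of the option, which now behaves the same on slow and fast tokenizers:

```python
# Sketch: truncate from the left instead of the right (#14947).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
enc = tokenizer("a very long input " * 100, truncation=True, max_length=8)
# tokens are now dropped from the start of the sequence
```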
8c2618e6aa Fixing t2t pipelines lists outputs. (#15008)
Backward compatibility broken in
https://github.com/huggingface/transformers/pull/14988
2022-01-03 14:49:58 +01:00
8f6373c61c Map model_type and doc pages names (#14944)
* Map model_type and doc pages names

* Add script

* Fix typo

* Quality

* Manual check for Auto

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
2022-01-03 05:08:55 -05:00
e68c3756fe Allow training to resume even if RNG states are not properly loaded (#14994)
* Allow training to resume even if RNG states are not properly loaded

* Proper f-string
2021-12-30 17:03:20 -05:00
08cb5718ec Enabling tokenizers upgrade. (#14941)
* Enabling `tokenizers` upgrade.

* Moved ugly comment.

* Tokenizers==0.11.1 needs an update to keep the borrow checker happy in highly contiguous calls.

* Support both 0.11.1 and 0.11.0
2021-12-30 17:30:58 +01:00
f8a989cfb2 Adding num_return_sequences support for text2text generation. (#14988)
* Adding `num_return_sequences` support for text2text generation.

Co-Authored-By: Enze <pu.miao@foxmail.com>

* Update tests/test_pipelines_text2text_generation.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update tests/test_pipelines_text2text_generation.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Enze <pu.miao@foxmail.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-30 16:17:15 +01:00
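A sketch of the new parameter; multiple generations per input come back as a list of dicts:

```python
# Sketch: num_return_sequences on the text2text pipeline (#14988).
from transformers import pipeline

t2t = pipeline("text2text-generation", model="t5-small")
outputs = t2t(
    "translate English to German: hello",
    num_return_sequences=2,
    num_beams=2,
)
# e.g. [{"generated_text": "..."}, {"generated_text": "..."}]
```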
c043ce6cfd [Generate] correct encoder_outputs are passed without attention_mask (#14980)
* [Generate] correct encoder_outputs are passed without attention_mask

* Apply suggestions from code review

* up
2021-12-30 10:16:03 +01:00
a1392883ce [AutoProcessor] Correct AutoProcessor and automatically add processor… (#14881)
* [AutoProcessor] Correct AutoProcessor and automatically add processor class

* up

* up

* up

* up

* up

* up

* up

* up

* continue tomorrow

* up

* up

* up

* make processor class private

* fix loop
2021-12-30 09:56:43 +01:00
d7d60df0ec Fixing a pathological case for slow tokenizers (#14981)
* Fixing a pathological case for slow tokenizers

* Update src/transformers/tokenization_utils.py
2021-12-30 09:10:34 +01:00
d1ba56d8d8 remove absl workaround as it's no longer needed (#14909)
The absl workaround hasn't been needed since 2019-04 (https://github.com/abseil/abseil-py/issues/99), so it should be safe to remove it.
2021-12-29 17:18:03 -05:00
04cddaf402 refactor: replace assert with ValueError (#14970) 2021-12-29 10:09:54 -05:00
600496fa50 [Wav2Vec2] Rename model's feature extractor to feature encoder (#14959)
* rename classes

* clean up more namings

* remove bogus file

* Apply suggestions from code review

* Apply suggestions from code review

* replace more names

* more regex replace

* make style

* correct

* correct more

* make style

* finish

* correct more in wav2vec2

* make style

* improve freeze_extractor

* add aliases

* add tf aliases
2021-12-28 20:33:23 +01:00
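A sketch of the renamed API; per the commits above, the old name is kept as a deprecated alias:

```python
# Sketch: freezing the (renamed) feature encoder of a Wav2Vec2 model.
from transformers import Wav2Vec2ForCTC

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
model.freeze_feature_encoder()       # new name
# model.freeze_feature_extractor()   # alias, still works but deprecated
```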
1bfa347707 [Tests] Speed up tokenizer tests (#14964)
* speed up canine and mluke

* speed up mbart and mbart50 toks

* upload files
2021-12-28 17:02:50 +01:00
f80775df2b Update README.md (#14965) 2021-12-28 13:41:27 +01:00
1e847b40c0 [WavLM] give model for precision (#14958) 2021-12-28 11:07:05 +01:00
1c121916f3 Add Speech Seq2Seq Training script (#14792)
* start

* add gradient checkpointing and feature extractor freezing

* Apply suggestions from code review

* up

* up

* up

* correct

* up

* more changes

* up

* up

* up

* remove rst
2021-12-28 10:20:51 +01:00
10fd4fa1a6 [doc] :class: hunt (#14955)
* [doc] :class: hunt

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* fix the fix + style

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-27 17:17:38 -08:00
2c5597f6c7 Style 2021-12-27 19:18:08 -05:00
b5e2b183af Doc styler examples (#14953)
* Fix bad examples

* Add black formatting to style_doc

* Use first nonempty line

* Put it at the right place

* Don't add spaces to empty lines

* Better templates

* Deal with triple quotes in docstrings

* Result of style_doc

* Enable mdx treatment and fix code examples in MDXs

* Result of doc styler on doc source files

* Last fixes

* Break copy from
2021-12-27 19:07:46 -05:00
e13f72fbff [doc] :obj: hunt (#14954)
* redo sans examples

* style
2021-12-27 15:49:48 -08:00
133c5e40c4 [doc] consistent True/False/None default format (#14951)
* [doc] consistent True/False/None default format

* Update src/transformers/models/xlnet/modeling_xlnet.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-27 14:31:40 -08:00
b2f500256e Convert last rst file (#14952) 2021-12-27 17:09:37 -05:00
87e6e4fe5c Doc styler v2 (#14950)
* New doc styler

* Fix issue with args at the start

* Code sample fixes

* Style code examples in MDX

* Fix more patterns

* Typo

* Typo

* More patterns

* Do without black for now

* Get more info in error

* Docstring style

* Re-enable check

* Quality

* Fix add_end_docstring decorator

* Fix docstring
2021-12-27 16:31:21 -05:00
c1138273d4 Fix duplicate call to save_checkpoint when using deepspeed (#14946)
* Fix duplicate call to save_checkpoint when using deepspeed / stage3_gather_fp16_weights_on_model_save

* Revert "Fix duplicate call to save_checkpoint when using deepspeed / stage3_gather_fp16_weights_on_model_save"

This reverts commit 6a3dec0397723a8417351dc38fdebf14ab17756c.

* Delete correct duplicate invocation of deepspeed save_checkpoint
2021-12-27 11:25:26 -08:00
03885a3f50 fix to issue #14833 in data_collator - consider no labels (#14930) 2021-12-27 11:48:48 -05:00
501307b58b Add ElectraForCausalLM -> Enable Electra encoder-decoder model (#14729)
* Add ElectraForCausalLM and cover some basic tests & need to fix a few tests

* Fix bugs

* make style

* make fix-copies

* Update doc

* Change docstring to markdown format

* Remove redundant update_keys_to_ignore
2021-12-27 12:37:52 +01:00
b058490ceb ChunkPipeline (batch_size enabled on zero-cls and qa pipelines. (#14225)
* Pipeline chunks.

* Batching for Chunking pipelines ?

* Batching for `question-answering` and `zero-shot-cls`.

* Fixing for FNet.

* Making ASR a chunk pipeline.

* Chunking ASR API.

* doc style.

* Fixing ASR test.

* Fixing QA error (p_mask, padding is 1, not 0).

* Enable both vad and simple chunking.

* Max length for vad.

* remove inference mode, crashing on s2t.

* Revert ChunkPipeline for ASRpipeline.

Too many knobs for simple integration within the pipeline; better to stick
to external convenience functions instead: more control to be had, a
simpler pipeline, and easier to replace with other things later.

* Drop necessity for PT for these.

* Enabling generators.

* Add mic + cleanup.

* Typo.

* Typo2.

* Remove ASR work, it does not belong in this PR anymore.

* Update src/transformers/pipelines/pt_utils.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Update src/transformers/pipelines/zero_shot_classification.py

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

* Adding many comments.

* Doc quality.

* `hidden_states` handling.

* Adding doc.

* Bad rebase.

* Autofixing docs.

* Fixing CRITICAL bug in the new Zerocls pipeline.

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-12-27 11:26:20 +01:00
705ca7f21b Fix Perceiver docs (#14917) 2021-12-24 11:28:47 +01:00
116829900a [WavLM] fix wavlm docs (#14910) 2021-12-23 23:17:20 +01:00
415810664b [doc] install - add jax (#14912)
As `jax` CUDA requires special instructions to be installed correctly, add a link to the jax installation instructions.

Note: the Flax install page only covers CPU jax installation info.
2021-12-23 13:12:59 -08:00
676643c6d6 Better logic for getting tokenizer config in AutoTokenizer (#14906)
* Better logic for getting tokenizer config in AutoTokenizer

* Remove needless import

* Remove debug statement

* Address review comments
2021-12-23 14:18:07 -05:00
f566c6e3b7 Fix failing GPU trainer tests (#14903)
* Fix failing GPU trainer tests

* Remove print statements
2021-12-23 13:59:33 -05:00
fe4197ab11 [Generate] Remove attention_mask and integrate model_main_input_name (#14856)
* up

* save

* correct

* up

* correct more

* up

* up

* up

* up

* up

* correct

* fix tf

* fix

* remove tokenizer
2021-12-23 19:43:37 +01:00
86b40073e9 [doc] post-porting (#14890)
found a few oddities:

1. https://huggingface.co/docs/transformers/main_classes/logging#transformers.utils.logging.enable_explicit_format
has a stray `::` - this PR fixes it

2. this one looks broken too:
https://huggingface.co/docs/transformers/main_classes/logging#transformers.utils.logging.set_verbosity
has a stray `<`

but I'm not sure where this one is coming from
2021-12-23 10:19:34 -08:00
ee55ea692b Update diarization and WavLM tolerances (#14902) 2021-12-23 19:53:56 +03:00
ef47d4f848 [AutoTokenizer] Fix incorrect from pretrained (#14900) 2021-12-23 17:22:33 +01:00
8f2cc1c3ab Add TFCLIPModel (#13967)
* Start the work for TFCLIPModel

* Convert to TF code (TODO: loss + doc)

* Clean up

* Fix pooled_output for TFCLIPTextTransformer - using tf.gather_nd

* assert -> raise error

* Expose TFCLIPModel

* Deal with dummy_inputs

* Add tests

* Fix all tests. TODO: manual check weight loading + add more comments

* Fix pt tf equivalence test

* fixes

* update TFCLIPVisionEmbeddings's Conv2D

* Fix loss + overwrite test_pt_tf_model_equivalence from common

* Add a comment about the change about MainLayer in test_keras_save_load

* Set return_loss=True in TFCLIPModelTester + make tests pass

* overwrite test_pt_tf_model_equivalence from tf common

* fix base_model_prefix

* Fix examples

* remove unused

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply review suggestions

* change self.pre_layrnorm to self.pre_layernorm

* apply more review suggestions

* return attention probs before dropout (to align with PT)

* fix weight init

* fix

* build doc

* fix missing doc

* fix for test

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-23 11:19:44 -05:00
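
A minimal usage sketch for the new TF port; the checkpoint name and the dummy image are illustrative:

```python
import numpy as np
import tensorflow as tf
from PIL import Image
from transformers import CLIPProcessor, TFCLIPModel

model = TFCLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))  # dummy image
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image, return_tensors="tf", padding=True,
)
outputs = model(**inputs)
probs = tf.nn.softmax(outputs.logits_per_image, axis=-1)  # image-text match scores
```
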
2d30443cd3 Set run_name in MLflowCallback (#14894)
* Set run_name in MLflowCallback

* Update the docs for `run_name` argument
2021-12-23 10:53:33 -05:00
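
In practice this means the existing `run_name` training argument is picked up by the MLflow integration; a sketch with placeholder values:

```python
from transformers import TrainingArguments

# run_name was already a TrainingArguments field; per this PR it is now
# forwarded to mlflow.start_run by the MLflowCallback.
args = TrainingArguments(output_dir="out", run_name="my-experiment", report_to=["mlflow"])
```
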
1d651868d6 add custom stopping criteria to human eval script (#14897) 2021-12-23 14:59:11 +01:00
6b655cc63f Add ONNX support for MarianMT models (#14586)
* First commit to add MarianMT to ONNX

* Now MarianModel.forward() automatically generates decoder_input_ids, like BartModel.forward()

* Adjusted MarianOnnxConfig.inputs and outputs to work with seq2seq-lm feature

* Style fix

* Added support for other features for already supported models

* Partial support for causal and seq2seq models

* Partial support for causal and seq2seq models

* Add default task for MarianMT ONNX

* Remove automatic creation of decoder_input_ids

* Extend inputs and outputs for MarianMT ONNX config

* Add MarianMT to ONNX unit tests

* Refactor

* OnnxSeq2SeqConfigWithPast to support seq2seq models

* Parameterized the onnx tests

* Restored run_mlm.py

* Restored run_mlm.py

* [WIP] BART update

* BART and MBART

* Add past_key_values and fix dummy decoder inputs

Using a sequence length of 1 in generate_dummy_outputs() produces large discrepancies, presumably due to some hidden optimisations.

* Refactor MarianOnnxConfig to remove custom past_key_values logic

* Fix quality

* Revert "Revert "Added support for other features for already supported models (#14358)" (#14679)"

This reverts commit 0f4e39c5599523c110cd713f60a3bfa145dad807.

* is_torch_available test to avoid failing imports

* sorting parameterize parameters to solve ERROR gw0 gw1

* tests fix

* tests fix

* GPT2 with past fix

* Fixed stateful class attribute change that was breaking things when converting multiple models sequentially

* Removed onnx file

* Refactor Marian export to account for base changes

* Fix copies

* Implemented suggestions

* Extend support for causal LM

* Revert "Revert "Added support for other features for already supported models (#14358)" (#14679)"

This reverts commit 0f4e39c5599523c110cd713f60a3bfa145dad807.

* is_torch_available test to avoid failing imports

* sorting parameterize parameters to solve ERROR gw0 gw1

* tests fix

* tests fix

* GPT2 with past fix

* Fixed stateful class attribute change that was breaking things when converting multiple models sequentially

* Removed onnx file

* Implemented suggestions

* Fixed __init__ to resolve conflict with master

* Revert "Revert "Added support for other features for already supported models (#14358)" (#14679)"

This reverts commit 0f4e39c5599523c110cd713f60a3bfa145dad807.

* is_torch_available test to avoid failing imports

* sorting parameterize parameters to solve ERROR gw0 gw1

* tests fix

* tests fix

* GPT2 with past fix

* Fixed stateful class attribute change that was breaking things when converting multiple models sequentially

* Removed onnx file

* Implemented suggestions

* Fixed __init__ to resolve conflict with master

* Remove commented import

* Remove ONNX model

* Remove redundant class method

* Tidy up imports

* Fix quality

* Refactor dummy input function

* Add copied from statements to Marian config functions

* Remove false copied from comments

* Fix copy from comment

Co-authored-by: Massimiliano Bruni <massimiliano.bruni@hcl.com>
Co-authored-by: Michael Benayoun <mickbenayoun@gmail.com>
2021-12-23 13:35:56 +01:00
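
A rough sketch of the export path this enables, using the `transformers.onnx` package; the checkpoint, feature name, and output path are assumptions based on the PR description:

```python
from pathlib import Path
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.onnx import FeaturesManager, export

ckpt = "Helsinki-NLP/opus-mt-en-de"  # illustrative Marian checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(ckpt)
tokenizer = AutoTokenizer.from_pretrained(ckpt)

# Look up the seq2seq-lm feature this PR registers for Marian.
model_kind, config_ctor = FeaturesManager.check_supported_model_or_raise(model, feature="seq2seq-lm")
onnx_config = config_ctor(model.config)
export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, Path("marian.onnx"))
```
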
6a7b9da2ae Add 'with torch.no_grad()' to integration test forward pass (#14808) 2021-12-23 04:23:39 -05:00
d8c09c6541 Fix AttributeError from PreTrainedTokenizerFast.decoder (#14691) 2021-12-23 04:19:25 -05:00
4210579522 Fix doc examples: ... takes no keyword arguments (#14701)
* Fix doc examples: ... takes no keyword arguments

* fix copies

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-12-23 04:07:21 -05:00
355dc0ce67 Fix installation instructions for BART ONNX example (#14885) 2021-12-23 04:05:32 -05:00
207594be81 Convert rst files (#14888)
* Convert all tutorials and guides

* Convert all remaining rst to mdx

* Track and fix bad links
2021-12-22 16:14:35 -05:00
b0c7d2ec58 Keras metric callback (#14867)
* Working on splitting out labels

* First working version

* Fixed concatenation of outputs and labels

* val_dataset -> eval_dataset

* Only pass input arrays in tokenizer.model_input_names

* Only pass input arrays in tokenizer.model_input_names

* Only remove unexpected keys when predict_with_generate is True

* Adding proper docstring

* Adding example to docstring

* Add a proper ROUGE metric example

* Add a proper ROUGE metric example

* Add version checking

* Update src/transformers/keras_callbacks.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/keras_callbacks.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/keras_callbacks.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update src/transformers/keras_callbacks.py

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Remove requirement for tokenizer with predict_with_generate

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-22 20:35:39 +00:00
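
A hedged sketch of how the callback is meant to be wired up; `model`, `train_dataset`, and `eval_dataset` are assumed to exist (a compiled Keras transformers model and batched `tf.data.Dataset`s yielding `(features, labels)`):

```python
import numpy as np
from transformers.keras_callbacks import KerasMetricCallback

def compute_metrics(eval_predictions):
    # Receives the concatenated model outputs and labels gathered at epoch end.
    predictions, labels = eval_predictions
    return {"accuracy": float((np.argmax(predictions, axis=-1) == labels).mean())}

metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=eval_dataset)
model.fit(train_dataset, validation_data=eval_dataset, callbacks=[metric_callback], epochs=3)
```
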
fa39ff9fc4 Docs for v4.16.0dev0 2021-12-22 20:39:44 +01:00
05fa1a7ac1 Release: v4.15.0 2021-12-22 18:43:15 +01:00
87a033d9fa Properly indent return block (#14887) 2021-12-22 12:28:45 -05:00
13504dcbea Onnx enable tasks for supported models (part 2) (#14700)
* Revert "Revert "Added support for other features for already supported models (#14358)" (#14679)"

This reverts commit 0f4e39c5599523c110cd713f60a3bfa145dad807.

* is_torch_available test to avoid failing imports

* sorting parameterize parameters to solve ERROR gw0 gw1

* tests fix

* tests fix

* GPT2 with past fix

* Fixed stateful class attribute change that was breaking things when converting multiple models sequentially

* Removed onnx file

* Implemented suggestions

* Fixed __init__ to resolve conflict with master

* Remove commented import
2021-12-22 14:43:11 +01:00
1045a36c1f Fix pytorch image classification example (#14883)
* Update example

* Remove skip in tests
2021-12-22 14:42:19 +01:00
7df4b90c76 Fix Perceiver docs (#14879) 2021-12-22 14:18:03 +01:00
e37bc579fc Fix typo in error message 2021-12-22 08:19:36 -05:00
17efc806b4 IterableDatasetShard should use per device batch size instead of real batch size (#14714) 2021-12-22 07:52:07 -05:00
2a56edb321 Updated deberta attention (#14625)
* Removed unused p2p attention handling

* Updated DeBERTa configuration

* Updated TF DeBERTa attention

* Rolled back accidental comment deletion

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-12-22 07:36:08 -05:00
824fd44fc3 Feature/fix slow test in mluke (#14749)
* make MLukeTokenizerTest fast

* make LukeTokenizerTest fast

* add entry to _toctree.yaml
2021-12-22 06:35:59 -05:00
c94c1b8967 update the arguments add_prefix_space and trim_offsets in backend_tokenizer.post_processor of RobertaTokenizerFast (#14752)
* add tests

* change post-processor, pre-tokenizer and decoder (can't update decoder)

* update test (remove decoder which doesn't depend on trim and add_prefix)

* just update the post_processor

* fix change

* `trim_offsets` has no influence on `pre_tokenizer`

* remove a test that need some input from the `tokenizers` lib maintainers

* format

* add new test offsets roberta

* polish comments
2021-12-22 10:51:55 +01:00
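
A small sketch of the behavior this fixes, with illustrative inputs; both flags should now reach the backend post-processor:

```python
from transformers import RobertaTokenizerFast

tok = RobertaTokenizerFast.from_pretrained(
    "roberta-base", add_prefix_space=True, trim_offsets=True
)
enc = tok("Hello world", return_offsets_mapping=True)
print(enc["offset_mapping"])  # character offsets should now respect trim_offsets
```
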
ec3567fe20 Convert model files from rst to mdx (#14865)
* First pass

* Apply suggestions from code review

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-22 03:27:30 -05:00
d0422de563 Fix doc mistakes (#14874)
* Remove double returns

* Last fixes

* Quality

* Last fix for Lxmert
2021-12-21 18:54:41 -05:00
e846a56ca4 Fix FlaxMarianMTModel return block. (#14873)
* Fixes in marian doc

* Another time

* Add return block in FlaxMarianMTModel
2021-12-21 17:57:37 -05:00
a6b7b47a39 Fixes in marian doc (#14872)
* Fixes in marian doc

* Another time
2021-12-21 17:17:02 -05:00
eec9c8bbd7 Fix FLAX_MULTIPLE_CHOICE_SAMPLE typo (#14871) 2021-12-21 16:54:10 -05:00
e51c7b5872 Skip failing test 2021-12-21 15:15:17 -05:00
27b3031de2 Mass conversion of documentation from rst to Markdown (#14866)
* Convert docstrings of all configurations and tokenizers

* Processors and fixes

* Last modeling files and fixes to models

* Pipeline modules

* Utils files

* Data submodule

* All the other files

* Style

* Missing examples

* Style again

* Fix copies

* Say bye bye to rst docstrings forever
2021-12-21 15:06:33 -05:00
185876392c [doc porting] several docs (#14858)
* [doc porting] 2 docs

* [doc porting] 2 docs

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* Update docs/source/main_classes/deepspeed.mdx

* cleanup

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-21 09:55:25 -08:00
033c3ed95a [examples/summarization] deal with None in data records (#14816)
* [examples/summarization] deal with None in data records

* rewrite to use a simpler (slower) variant
2021-12-21 09:17:28 -08:00
c075fb7855 Replace commit sha by commit url for update jobs (#14852)
* Replace commit sha by commit url for update jobs

* Typo

* Update .github/workflows/build_documentation.yml

Co-authored-by: Julien Chaumond <julien@huggingface.co>

* Apply review comments

Co-authored-by: Julien Chaumond <julien@huggingface.co>
2021-12-21 11:17:11 -05:00
5722d05831 Add custom stopping_criteria and logits_processor to generate (#14779)
* add custom `stopping_criteria` and `logits_processor` to `generate`

* add tests for custom `stopping_criteria` and `logits_processor`

* fix typo in RAG

* address reviewer comments

* improve custom logits processor/stopping criteria error message

* fix types in merge function signature

* change default for custom list from `None` to empty list

* fix rag generate

* add string split suggestion

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2021-12-21 16:47:41 +01:00
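
A minimal sketch of the new `generate` hooks; `StopAfterNTokens` is a hypothetical custom criterion written for illustration, not part of the library:

```python
import torch
from transformers import (AutoModelForCausalLM, AutoTokenizer, LogitsProcessorList,
                          MinLengthLogitsProcessor, StoppingCriteria, StoppingCriteriaList)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

class StopAfterNTokens(StoppingCriteria):
    """Hypothetical custom criterion: stop once n new tokens have been generated."""
    def __init__(self, start_length: int, n: int):
        self.max_length = start_length + n

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        return input_ids.shape[-1] >= self.max_length

inputs = tokenizer("The meaning of life is", return_tensors="pt")
out = model.generate(
    **inputs,
    stopping_criteria=StoppingCriteriaList([StopAfterNTokens(inputs.input_ids.shape[-1], 20)]),
    logits_processor=LogitsProcessorList([MinLengthLogitsProcessor(10, tokenizer.eos_token_id)]),
)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```
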
Zed 0062058399 Fix the value error typo of AdamW's betas' valid values checking (#14780)
* Fix the value error typo of AdamW's betas value check

* error fixed
2021-12-21 09:44:09 -05:00
7ae6f07004 [ASR example] Improve example + add more examples (#14848)
* up

* load up

* up
2021-12-21 13:12:22 +01:00
97ec17f73b Only create the model card on process 0 (#14857) 2021-12-21 06:34:47 -05:00
b513ec8bbd [Bart] better error message (#14854) 2021-12-21 11:57:42 +01:00
7af80f6618 Convert docstrings of modeling files (#14850)
* Convert file_utils docstrings to Markdown

* Test on BERT

* Return block indent

* Temporarily disable doc styler

* Remove from quality checks as well

* Remove doc styler mess

* Remove check from circleCI

* Fix typo

* Convert file_utils docstrings to Markdown

* Test on BERT

* Return block indent

* Temporarily disable doc styler

* Remove from quality checks as well

* Remove doc styler mess

* Remove check from circleCI

* Fix typo

* Let's go on all other model files

* Add templates too

* Styling and quality
2021-12-21 05:37:32 -05:00
2a33734606 Make the onnx submodule init lazy (#14855)
* Use lazy init for onnx submodule

* Remove debug statements
2021-12-21 03:11:25 -05:00
b6ec956976 [logging] implement warning_advice / TRANSFORMERS_NO_ADVISORY_WARNINGS (#14669)
* [logging] implement warning_advice / TRANSFORMERS_NO_ADVISORY_WARNINGS

* reword
2021-12-20 20:48:38 -08:00
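
A short sketch of the new helper; the message text is illustrative:

```python
from transformers.utils import logging

logger = logging.get_logger("transformers")
# Behaves like logger.warning() unless TRANSFORMERS_NO_ADVISORY_WARNINGS=1 is set
# in the environment, in which case the message is skipped.
logger.warning_advice("Advisory: consider installing X for a speedup.")
```
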
c1125dc2ba [doc] typo (#14849)
fix small typo
2021-12-20 12:20:21 -05:00
33f36c869f Add a main_input_name attribute to all models (#14803)
* Add a main_input_name attribute to all models

* Fix tests

* Wtf Vs Code?

* Update src/transformers/models/imagegpt/modeling_imagegpt.py

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

* Style

* Fix copies

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2021-12-20 11:19:08 -05:00
0940e9b242 Add 'with torch.no_grad()' to integration test forward pass (#14820) 2021-12-20 09:28:17 -05:00
b37cf7dee4 Add 'with torch.no_grad()' to integration test forward pass (#14821) 2021-12-20 09:25:34 -05:00
952a77b05d [Perceiver] Skip multi-gpu tests for now (#14813)
* [Perceiver] Skip multi-gpu tests for now

* Update tests/test_modeling_perceiver.py

* up

* up
2021-12-20 15:22:50 +01:00
8a818c26cb Fix dead link to benchmarks.ipynb (#14842)
Notebook has been updated here https://github.com/huggingface/notebooks/tree/master/examples/benchmark.ipynb
2021-12-20 09:08:05 -05:00
1b0ca7d270 Update CONTRIBUTING.md (#14835)
fix cmd typo
2021-12-20 08:42:03 -05:00
1531b31978 Add an argument to set bucket_cap_mb for PyTorch DDP (#14756)
* [trainer] Set bucket_cap_mb for DDP from arguments

* Put find_unused_parameters into kwargs
2021-12-20 08:41:40 -05:00
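
A sketch of the new argument, with placeholder values:

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="out",  # placeholder
    ddp_bucket_cap_mb=25,  # forwarded to DistributedDataParallel(bucket_cap_mb=...)
    ddp_find_unused_parameters=False,  # travels through the same DDP kwargs
)
```
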
3883e3a75e Add SD and SV heads for WavLM (#14847)
* Add converted heads

* Add dummies
2021-12-20 16:40:56 +03:00
cd583bdaa5 [WavLM] Fix slow tests (#14845) 2021-12-20 12:06:42 +01:00
281e1fba75 up (#14829) 2021-12-20 11:47:32 +01:00
091693b494 [Seq2SeqTrainer] Remove model input name hack (#14802)
* [Seq2SeqTrainer] Remove model input name hack

* Update src/transformers/trainer_seq2seq.py

* make style

* finish
2021-12-20 10:53:48 +01:00
84ea427f46 [ImageGPT] Deprecate pixel_values input name to input_ids (#14801)
* [ImageGPT] Deprecate pixel_values input name to input_ids

* up

* Apply suggestions from code review

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>

* correct

* finish

Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
2021-12-17 20:05:22 +01:00
c4a96cecbc Wav2Vec2 meets phonemes (#14353)
* up

* add tokenizer

* improve more

* finish tokenizer

* finish

* adapt speech recognition script

* adapt convert

* more fixes

* more fixes

* update phonemizer wav2vec2

* better naming

* fix more tests

* more fixes swedish

* correct tests

* finish

* improve script

* remove file

* up

* lets get those 100 model architectures until the end of the month

* make fix-copies

* correct more

* correct script

* more fixes

* more fixes

* add to docs

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* replace assert

* fix copies

* fix docs

* new try docs

* boom boom

* update

* add phonemizer to audio tests

* make fix-copies

* up

* upload models

* some changes

* Update tests/test_tokenization_wav2vec2_phoneme.py

Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>

* more fixes

* remove @

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
Co-authored-by: Anton Lozhkov <aglozhkov@gmail.com>
2021-12-17 19:56:44 +01:00
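
A hedged sketch of phoneme transcription with the new tokenizer; the checkpoint name is an assumption from the PR era, and decoding requires the `phonemizer` package with an espeak-ng backend:

```python
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

ckpt = "facebook/wav2vec2-lv-60-espeak-cv-ft"  # assumed checkpoint name
processor = Wav2Vec2Processor.from_pretrained(ckpt)
model = Wav2Vec2ForCTC.from_pretrained(ckpt)

speech = np.zeros(16_000, dtype=np.float32)  # placeholder for 1 s of 16 kHz audio
inputs = processor(speech, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(processor.batch_decode(torch.argmax(logits, dim=-1)))  # phoneme strings
```
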
77d6c826d8 Convert rst to mdx bert (#14806)
* BERT to mdx
mdx :)
c

* Update docs/source/model_doc/bert.mdx

Co-authored-by: Julien Chaumond <julien@huggingface.co>

* Remove all
Co-authored-by: sgugger <sylvain.gugger@gmail.com>

Co-authored-by: Julien Chaumond <julien@huggingface.co>
2021-12-17 11:13:34 -05:00
0b4ea79a0c Trigger doc building 2021-12-17 11:14:18 -05:00
ff066119ca Implement head_mask for Flax BERT and other models copied from BERT (#14620)
* Implement head_mask for Flax BERT and other models copied from BERT

* Remove `from jax._src.nn.functions import sigmoid`

Remove `from jax._src.nn.functions import sigmoid` unintentionally added by IDE

* Remove no more valid copy statement

* Apply patil-suraj's suggestions from code review

* Apply suggestions from the code review

* Update Flax template

* Fix a typo

* Also update template for CausalLM modules
2021-12-17 17:06:59 +01:00
95119ad7b0 [Generate] Correct input_ids detection (#14815)
* [Generate] Correct input_ids detection

* correct
2021-12-17 16:08:54 +01:00
bdbe3df869 [WavLM] Layerdrop is not allowed for first layer (#14811)
* [WavLM] Layerdrop is not allowed for first layer

* Apply suggestions from code review
2021-12-17 13:30:18 +01:00
cbf036f7ae Add test (#14810) 2021-12-17 04:33:27 -05:00
c4a0fb5199 [WavLM] Correct position bias computation (#14805) 2021-12-16 22:42:57 +01:00
d194d639ab Remove datasets requirement (#14795) 2021-12-16 14:34:14 -05:00
bef1e3e4a0 Add WavLM (#14354)
* first commit

* fix some stuff

* fix more readme

* Apply suggestions from code review

* update

* correct

* up

* attn layer works

* push code

* make models work

* Small change

* more refactor

* finish

* up

* fix conversion

* fix position bias

* Fix style

* fix conversion

* make fix-copies

* add

* clean

* fix docs

* fix

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* apply final changes

* make fix-copies

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-16 18:57:05 +01:00
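
A minimal sketch of loading the new model; the checkpoint name and dummy waveform are assumptions:

```python
import numpy as np
import torch
from transformers import AutoFeatureExtractor, WavLMModel

ckpt = "microsoft/wavlm-base-plus"  # assumed checkpoint name
extractor = AutoFeatureExtractor.from_pretrained(ckpt)
model = WavLMModel.from_pretrained(ckpt)

audio = np.zeros(16_000, dtype=np.float32)  # placeholder waveform
inputs = extractor(audio, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    hidden = model(**inputs).last_hidden_state
print(hidden.shape)  # (batch, frames, hidden_size)
```
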
b18d8534ea [Generate] Make generate multi-modal (#14784)
* finish refactor

* refactor

* add tests

* add more tests

* up

* finish tests

* finish

* up

* Apply suggestions from code review

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* improve docstring

* fix docs

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2021-12-16 18:03:55 +01:00
48463ebb33 Add Speaker Diarization and Verification heads (#14723)
* Models

* Squashed commit of the following:

commit 72278e1e931a16d0879acc77f65762f3364833d0
Author: anton-l <aglozhkov@gmail.com>
Date:   Fri Dec 10 21:45:08 2021 +0300

* Add unispeech heads

* Add sd/sv automodels

* Docs cleanup

* Fix docstrings

* rename xvector classes

* examples

* Tests cleanup

* Style

* Better checkpoints for tests

* leftover docs

* apply review suggestions

* Style + init tests

* Update unispeech-sat tdnn downsampling
2021-12-16 19:22:14 +03:00
2e07180cba Train step fix (#14796)
* Fix for TF train step when no "labels" key in input

* make style
2021-12-16 16:08:13 +00:00
465a8b8d10 Update CONTRIBUTING.md (#14800)
fix pip installation cmd
2021-12-16 10:40:56 -05:00
8ae24e19b2 Update CONTRIBUTING.md (#14799)
typo
2021-12-16 10:24:26 -05:00
12e1b4c6df Fix the build documentation job (#14788)
* Fix the build documentation job

* Fix install

* Address review comment
2021-12-16 09:35:20 -05:00
5061a9fd55 Post sphinx-clean up and contributing guide updates (#14790)
* Clean up sphinx

* Update contributing guide

* Update docs README

* No example title

* Fix copies

* Update CONTRIBUTING.md

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>

Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-12-16 09:29:26 -05:00
8010fda9bf Removes images to put them in a dataset (#14781)
* First try

* Update instructions
2021-12-16 04:42:02 -05:00
459677aebe PoC for conserving old links (#14754)
* PoC for conserving old links

* Do the same for other links

* remap the redirects section

* add instructions on how to move sections

* improve

Co-authored-by: Stas Bekman <stas@stason.org>
2021-12-15 11:40:47 -08:00
c40ecfd740 Move import (#14787) 2021-12-15 13:34:42 -05:00
7c9c41f43c Docs for v4.14.0 2021-12-15 18:29:53 +01:00
1607 changed files with 160493 additions and 75018 deletions

View File

@@ -78,7 +78,7 @@ jobs:
 keys:
 - v0.4-torch_and_tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -99,7 +99,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_torch_and_tf_all:
 working_directory: ~/transformers
 docker:
@@ -116,7 +116,7 @@ jobs:
 keys:
 - v0.4-torch_and_tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -149,7 +149,7 @@ jobs:
 keys:
 - v0.4-torch_and_flax-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -169,7 +169,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_torch_and_flax_all:
 working_directory: ~/transformers
 docker:
@@ -186,7 +186,7 @@ jobs:
 keys:
 - v0.4-torch_and_flax-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -217,7 +217,7 @@ jobs:
 keys:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -237,7 +237,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_torch_all:
 working_directory: ~/transformers
 docker:
@@ -253,7 +253,7 @@ jobs:
 keys:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -284,7 +284,7 @@ jobs:
 keys:
 - v0.4-tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
 - run: pip install tensorflow_probability
@@ -304,7 +304,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_tf_all:
 working_directory: ~/transformers
 docker:
@@ -320,7 +320,7 @@ jobs:
 keys:
 - v0.4-tf-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
 - run: pip install tensorflow_probability
@@ -351,7 +351,7 @@ jobs:
 keys:
 - v0.4-flax-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[flax,testing,sentencepiece,flax-speech,vision]
 - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -370,7 +370,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_flax_all:
 working_directory: ~/transformers
 docker:
@@ -386,7 +386,7 @@ jobs:
 keys:
 - v0.4-flax-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[flax,testing,sentencepiece,vision,flax-speech]
 - run: pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -417,7 +417,7 @@ jobs:
 keys:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -437,7 +437,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_pipelines_torch_all:
 working_directory: ~/transformers
 docker:
@@ -454,7 +454,7 @@ jobs:
 keys:
 - v0.4-torch-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
 - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
@@ -549,7 +549,7 @@ jobs:
 - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
 - run: pip install --upgrade pip
-- run: pip install .[ja,testing,sentencepiece,jieba]
+- run: pip install .[ja,testing,sentencepiece,jieba,spacy,ftfy]
 - run: python -m unidic download
 - save_cache:
 key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -557,7 +557,11 @@ jobs:
 - '~/.cache/pip'
 - run: |
 if [ -f test_list.txt ]; then
-python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py ./tests/test_tokenization_openai.py | tee tests_output.txt
+fi
+- run: |
+if [ -f test_list.txt ]; then
+python -m pytest -n 1 tests/test_tokenization_clip.py --dist=loadfile -s --make-reports=tests_tokenization_clip --durations=100 | tee tests_output.txt
 fi
 - store_artifacts:
 path: ~/transformers/tests_output.txt
@@ -579,7 +583,7 @@ jobs:
 keys:
 - v0.4-torch_examples-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
 - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -614,7 +618,7 @@ jobs:
 keys:
 - v0.4-torch_examples-{{ checksum "setup.py" }}
 - v0.4-{{ checksum "setup.py" }}
-- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+- run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng
 - run: pip install --upgrade pip
 - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
 - run: pip install -r examples/pytorch/_tests_requirements.txt
@@ -662,7 +666,7 @@ jobs:
 path: ~/transformers/flax_examples_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_examples_flax_all:
 working_directory: ~/transformers
 docker:
@@ -729,7 +733,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_hub_all:
 working_directory: ~/transformers
 docker:
@@ -795,7 +799,7 @@ jobs:
 path: ~/transformers/tests_output.txt
 - store_artifacts:
 path: ~/transformers/reports
 run_tests_onnxruntime_all:
 working_directory: ~/transformers
 docker:

View File

@@ -49,7 +49,7 @@ Library:
 - Deepspeed: @stas00
 - Ray/raytune: @richardliaw, @amogkam
 - Text generation: @patrickvonplaten @narsil
-- Tokenizers: @LysandreJik
+- Tokenizers: @SaulLu
 - Trainer: @sgugger
 - Pipelines: @Narsil
 - Speech: @patrickvonplaten, @anton-l

.github/workflows/add-model-like.yml vendored Normal file (61 additions)
View File

@@ -0,0 +1,61 @@
name: Add model like runner
on:
push:
branches:
- master
pull_request:
paths:
- "src/**"
- "tests/**"
- ".github/**"
types: [opened, synchronize, reopened]
jobs:
run_tests_templates_like:
name: "Add new model like template tests"
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: ~/.cache/pip
key: v1-tests_model_like
restore-keys: |
v1-tests_model_like-${{ hashFiles('setup.py') }}
v1-tests_model_like
- name: Install dependencies
run: |
pip install --upgrade pip!=21.3
pip install -U click # Click 7 is installed in the environment by default, but we need at least version 8 for Black
sudo apt -y update && sudo apt install -y libsndfile1-dev
pip install .[dev]
- name: Create model files
run: |
transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
make style
make fix-copies
- name: Run all PyTorch modeling test
run: |
python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_new_models tests/bert_new/test_modeling_bert_new.py
- name: Run style changes
run: |
make style && make quality && make repo-consistency
- name: Failure short reports
if: ${{ always() }}
run: cat reports/tests_new_models/failures_short.txt
- name: Test suite reports artifacts
if: ${{ always() }}
uses: actions/upload-artifact@v2
with:
name: run_all_tests_new_models_test_reports
path: reports/tests_new_models

View File

@@ -0,0 +1,145 @@
name: Build docker images (scheduled)
on:
push:
branches:
- docker-image*
repository_dispatch:
schedule:
- cron: "0 1 * * *"
concurrency:
group: docker-images-builds
cancel-in-progress: false
jobs:
latest-docker:
name: "Latest PyTorch + TensorFlow [dev]"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=master
push: true
tags: huggingface/transformers-all-latest-gpu
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
needs: latest-docker
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=master
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu
doc-builder:
name: "Doc builder"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-doc-builder
push: true
tags: huggingface/transformers-doc-builder
latest-pytorch:
name: "Latest PyTorch [dev]"
runs-on: ubuntu-latest
needs: latest-torch-deepspeed-docker
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=master
push: true
tags: huggingface/transformers-pytorch-gpu
latest-tensorflow:
needs: latest-pytorch
name: "Latest TensorFlow [dev]"
runs-on: ubuntu-latest
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
-
name: Check out code
uses: actions/checkout@v2
-
name: Login to DockerHub
uses: docker/login-action@v1
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v2
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=master
push: true
tags: huggingface/transformers-tensorflow-gpu

View File

@@ -0,0 +1,117 @@
name: Build dev documentation
on:
pull_request:
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
jobs:
build_and_package:
runs-on: ubuntu-latest
container:
image: huggingface/transformers-doc-builder
env:
COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
PR_NUMBER: ${{ github.event.number }}
EVENT_CONTEXT: ${{ toJSON(github.event) }}
steps:
- uses: actions/checkout@v2
with:
repository: 'huggingface/hf-doc-utils'
path: hf-doc-utils
- uses: actions/checkout@v2
with:
repository: 'huggingface/transformers'
path: transformers
- uses: actions/checkout@v2
with:
repository: 'huggingface/notebooks'
path: notebooks
- uses: actions/setup-node@v2
with:
node-version: '16'
- name: Set env
run: |
echo "WRITE=$(echo 'ghp_'$(wget -qO- lysand.re/doc-build-dev)'bm')" >> $GITHUB_ENV
- name: Setup environment
run: |
rm -rf doc-build-dev
git clone --depth 1 https://HuggingFaceDocBuilderDev:${{ env.WRITE }}@github.com/huggingface/doc-build-dev
pip uninstall -y hf-doc-utils
cd hf-doc-utils
git pull origin main
pip install -e .
cd ..
cd transformers
pip install .[dev]
cd ..
cd notebooks
git pull origin master
cd ..
- name: Setup git
run: |
git config --global user.name "Hugging Face Doc Builder"
git config --global user.email docs@huggingface.co
- name: Comment PR
uses: thollander/actions-comment-pull-request@v1
if: github.event.action == 'opened'
with:
message: 'The docs for this PR live [here](https://moon-ci-docs.huggingface.co/docs/transformers/pr_${{ env.PR_NUMBER }}). All of your documentation changes will be reflected on that endpoint.'
GITHUB_TOKEN: ${{ env.WRITE }}
# - name: Find Comment
# if: github.event.action == 'reopened'
# uses: peter-evans/find-comment@v1
# id: fc
# with:
# issue-number: ${{ env.PR_NUMBER }}
# comment-author: HuggingFaceDocBuilder
# - name: Update comment
# if: github.event.action == 'reopened'
# uses: peter-evans/create-or-update-comment@v1
# with:
# comment-id: ${{ steps.fc.outputs.comment-id }}
# token: ${{ env.WRITE }}
# edit-mode: replace
# body: |
# The docs for this PR live [here](https://moon-ci-docs.huggingface.co/docs/transformers/pr_${{ env.PR_NUMBER }}). All of your documentation changes will be reflected on that endpoint.
- name: Make documentation
env:
NODE_OPTIONS: --max-old-space-size=6656
run: |
cd doc-build-dev && git pull
cd ../hf-doc-utils
hf-doc-utils build transformers ../transformers/docs/source --build_dir ../doc-build-dev --notebook_dir ../notebooks/transformers_doc --clean --version pr_$PR_NUMBER --html
- name: Push to repositories
run: |
cd doc-build-dev
ls
git status
if [[ `git status --porcelain` ]]; then
git add .
git stash && git pull && git stash apply
git commit -m "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/transformers/commit/$COMMIT_SHA"
git push origin main
else
echo "No diff in the documentation."
fi
shell: bash

View File

@@ -1,50 +0,0 @@
name: Documentation test build
on:
pull_request:
paths:
- "src/**"
- "docs/**"
- ".github/**"
jobs:
build_and_package:
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- uses: actions/checkout@v2
- name: Loading cache.
uses: actions/cache@v2
id: cache
with:
path: ~/.cache/pip
key: v1-test_build_doc
restore-keys: |
v1-test_build_doc-${{ hashFiles('setup.py') }}
v1-test_build_doc
- name: Setup environment
run: |
pip install --upgrade pip
sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
pip install git+https://github.com/huggingface/doc-builder
pip install .[dev]
export TORCH_VERSION=$(python -c "from torch import version; print(version.__version__.split('+')[0])")
pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html
pip install torchvision
python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
sudo apt install tesseract-ocr
pip install pytesseract
pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
- name: Make documentation
run: |
doc-builder build transformers ./docs/source

View File

@@ -5,6 +5,7 @@ on:
 branches:
 - master
 - doc-builder*
+- v*-release
 jobs:
 build_and_package:
@@ -14,10 +15,19 @@ jobs:
 shell: bash -l {0}
 steps:
+- uses: actions/setup-node@v2
+with:
+node-version: '16'
 - uses: actions/checkout@v2
 with:
-repository: 'huggingface/doc-builder'
-path: doc-builder
+repository: 'huggingface/hf-doc-utils'
+path: hf-doc-utils
+- uses: actions/checkout@v2
+with:
+repository: 'huggingface/doc-build'
+path: doc-build
 token: ${{ secrets.HUGGINGFACE_PUSH }}
 - uses: actions/checkout@v2
@@ -45,8 +55,10 @@ jobs:
 run: |
 sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
-pip install git+https://github.com/huggingface/doc-builder
-pip install git+https://github.com/huggingface/transformers#egg=transformers[dev]
+pip install git+https://github.com/huggingface/hf-doc-utils
+cd transformers
+pip install .[dev]
+cd ..
 export TORCH_VERSION=$(python -c "from torch import version; print(version.__version__.split('+')[0])")
 pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH_VERSION}+cpu.html
@@ -61,10 +73,10 @@ jobs:
 - name: Setup git
 run: |
-git config --global user.name "Hugging Face"
-git config --global user.email transformers@huggingface.co
-cd doc-builder
+git config --global user.name "Hugging Face Doc Builder"
+git config --global user.email docs@huggingface.co
+cd doc-build
 git pull origin main
 cd ..
@@ -74,26 +86,31 @@ jobs:
 - name: Make documentation
 run: |
-doc-builder build transformers transformers/docs/source --build_dir doc-builder/build --notebook_dir notebooks/transformers_doc --clean
+cd hf-doc-utils &&
+hf-doc-utils build transformers ../transformers/docs/source --build_dir ../doc-build --notebook_dir notebooks/transformers_doc --clean --html &&
+cd ..
+env:
+NODE_OPTIONS: --max-old-space-size=6656
 - name: Push to repositories
 run: |
-cd doc-builder
+cd doc-build &&
 if [[ `git status --porcelain` ]]; then
-git add build
-git commit -m "Updated with commit ${{ github.sha }}"
+git add . &&
+git stash && git pull && git stash apply &&
+git commit -m "Updated with commit ${{ github.sha }} \n\nSee: https://github.com/huggingface/transformers/commit/${{ github.sha }}" &&
 git push origin main
 else
 echo "No diff in the documentation."
-fi
-cd ..
-cd notebooks
+fi &&
+cd .. &&
+cd notebooks &&
 if [[ `git status --porcelain` ]]; then
-git add transformers_doc
-git commit -m "Updated Transformer doc notebooks with commit ${{ github.sha }}"
+git add transformers_doc &&
+git commit -m "Updated Transformer doc notebooks with commit ${{ github.sha }} \n\nSee: https://github.com/huggingface/transformers/commit/${{ github.sha }}" &&
 git push origin master
 else
 echo "No diff in the notebooks."
-fi
+fi &&
 cd ..

View File

@@ -0,0 +1,63 @@
name: Delete dev documentation
on:
pull_request:
types: [ closed ]
jobs:
build_and_package:
runs-on: ubuntu-latest
container:
image: huggingface/transformers-doc-builder
env:
PR_NUMBER: ${{ github.event.number }}
steps:
- name: Set env
run: |
echo "WRITE=$(echo 'ghp_'$(wget -qO- lysand.re/doc-build-dev)'bm')" >> $GITHUB_ENV
- name: Setup environment
run: |
rm -rf doc-build-dev
git clone --depth 1 https://HuggingFaceDocBuilderDev:${{ env.WRITE }}@github.com/huggingface/doc-build-dev
- name: Setup git
run: |
git config --global user.name "Hugging Face Doc Builder"
git config --global user.email docs@huggingface.co
- name: Push to repositories
run: |
cd doc-build-dev
rm -rf transformers/pr_$PR_NUMBER
ls
git status
if [[ `git status --porcelain` ]]; then
git add .
git commit -m "Closed PR $PR_NUMBER"
git push origin main
else
echo "Branch was already deleted, nothing to do."
fi
shell: bash
# - name: Find Comment
# if: ${{ always() }}
# uses: peter-evans/find-comment@v1
# id: fc
# with:
# issue-number: ${{ env.PR_NUMBER }}
# comment-author: HuggingFaceDocBuilder
# - name: Update comment
# if: ${{ always() }}
# uses: peter-evans/create-or-update-comment@v1
# with:
# comment-id: ${{ steps.fc.outputs.comment-id }}
# token: ${{ env.WRITE }}
# edit-mode: replace
# body: |
# _The documentation is not available anymore as the PR was closed or merged._

View File

@@ -19,7 +19,7 @@ env:
 jobs:
 run_doctests:
-runs-on: [self-hosted, docker-gpu, single-gpu]
+runs-on: [self-hosted, docker-gpu-test, single-gpu]
 container:
 image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
 options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -35,8 +35,16 @@ jobs:
 run: |
 apt -y update && apt install -y libsndfile1-dev
 pip install --upgrade pip
-pip install .[dev]
+pip install .[testing,torch-speech]
+- name: Prepare files for doctests
+run: |
+python utils/prepare_for_doc_test.py src docs
 - name: Run doctests
 run: |
-pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure
+pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
+- name: Clean files after doctests
+run: |
+python utils/prepare_for_doc_test.py src docs --remove_new_line

View File

@@ -61,15 +61,15 @@ jobs:
 - name: Run style changes
 run: |
 git fetch origin master:master
-make style && make quality
+make style && make quality && make repo-consistency
 - name: Failure short reports
 if: ${{ always() }}
-run: cat reports/tests_templates_failures_short.txt
+run: cat reports/tests_templates/failures_short.txt
 - name: Test suite reports artifacts
 if: ${{ always() }}
 uses: actions/upload-artifact@v2
 with:
 name: run_all_tests_templates_test_reports
-path: reports
+path: reports/tests_templates

View File

@@ -33,9 +33,10 @@ jobs:
 - name: Install dependencies
 run: |
-apt -y update && apt install -y libsndfile1-dev git
+apt -y update && apt install -y libsndfile1-dev git espeak-ng
 pip install --upgrade pip
 pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+pip install https://github.com/kpu/kenlm/archive/master.zip
 pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
 - name: Are GPUs recognized by our DL frameworks
@@ -100,9 +101,10 @@ jobs:
 - name: Install dependencies
 run: |
-apt -y update && apt install -y libsndfile1-dev git
+apt -y update && apt install -y libsndfile1-dev git espeak-ng
 pip install --upgrade pip
 pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+pip install https://github.com/kpu/kenlm/archive/master.zip
 pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
 - name: Are GPUs recognized by our DL frameworks
@@ -152,10 +154,11 @@ jobs:
 - name: Install dependencies
 run: |
-apt -y update && apt install -y libaio-dev
+apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
 pip install --upgrade pip
 pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
 pip install .[testing,deepspeed]
+pip install https://github.com/kpu/kenlm/archive/master.zip
 pip install git+https://github.com/microsoft/DeepSpeed
 - name: Are GPUs recognized by our DL frameworks
@@ -193,11 +196,12 @@ jobs:
 - name: Install dependencies
 run: |
-apt -y update && apt install -y libaio-dev
+apt -y update && apt install -y libaio-dev libsndfile1-dev git espeak-ng
 pip install --upgrade pip
 pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu113/torch_nightly.html -U
 rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
 pip install .[testing,fairscale]
+pip install https://github.com/kpu/kenlm/archive/master.zip
 pip install git+https://github.com/microsoft/DeepSpeed # testing bleeding edge
 - name: Are GPUs recognized by our DL frameworks

View File

@ -31,7 +31,7 @@ jobs:
- name: Install dependencies - name: Install dependencies
run: | run: |
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
apt install -y libsndfile1-dev apt install -y libsndfile1-dev espeak-ng
pip install --upgrade pip pip install --upgrade pip
pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm] pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
pip install https://github.com/kpu/kenlm/archive/master.zip pip install https://github.com/kpu/kenlm/archive/master.zip
@ -82,13 +82,17 @@ jobs:
image: tensorflow/tensorflow:2.4.1-gpu image: tensorflow/tensorflow:2.4.1-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps: steps:
- name: Set up Python 3.7
uses: actions/setup-python@v2
with:
python-version: 3.7
- name: Install dependencies - name: Install dependencies
run: | run: |
apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
pip install --upgrade pip pip install --upgrade pip
pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision] pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
pip install https://github.com/kpu/kenlm/archive/master.zip
- name: Launcher docker - name: Launcher docker
uses: actions/checkout@v2 uses: actions/checkout@v2
@@ -141,7 +145,7 @@ jobs:
 #    steps:
 #    - name: Install dependencies
 #      run: |
-#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 #        pip install --upgrade pip
 #        pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
 #        pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -199,8 +203,8 @@ jobs:
     steps:
     - name: Install dependencies
       run: |
-        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
-        apt install -y libsndfile1-dev
+        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
+        apt install -y libsndfile1-dev espeak-ng
         pip install --upgrade pip
         pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
         pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -255,7 +259,7 @@ jobs:
 #    steps:
 #    - name: Install dependencies
 #      run: |
-#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 #        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
 #        pip install --upgrade pip
 #        pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
@@ -312,7 +316,7 @@ jobs:
 #    steps:
 #    - name: Install dependencies
 #      run: |
-#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#        apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git espeak-ng
 #        pip install --upgrade pip
 #        pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
 #        pip install https://github.com/kpu/kenlm/archive/master.zip
@@ -492,4 +496,4 @@ jobs:
       run: |
         pip install slack_sdk
-        python utils/notification_service.py push
+        python utils/notification_service_deprecated.py push
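This hunk reroutes the push channel to `utils/notification_service_deprecated.py`. Neither script's body appears in this diff; a hypothetical minimal notifier built on `slack_sdk` (the only hard dependency the step installs) could look like:

```python
# Hypothetical sketch of a Slack CI notifier; the real utils/notification_service.py
# is not shown in this diff and is considerably more elaborate.
import os
import sys

from slack_sdk import WebClient


def main() -> None:
    ci_event = sys.argv[1] if len(sys.argv) > 1 else "push"  # e.g. "push" or "scheduled"
    client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
    # chat_postMessage is slack_sdk's standard call for posting to a channel.
    client.chat_postMessage(
        channel=os.environ["CI_SLACK_CHANNEL_ID"],
        text=f"CI run finished for event: {ci_event}",
    )


if __name__ == "__main__":
    main()
```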


@@ -1,477 +1,246 @@
 name: Self-hosted runner (scheduled)
 on:
+  push:
+    branches:
+      - multi_ci_*
   repository_dispatch:
   schedule:
-    - cron: "0 0 * * *"
+    - cron: "0 2 * * *"
 env:
   HF_HOME: /mnt/cache
   TRANSFORMERS_IS_CI: yes
-  OMP_NUM_THREADS: 8
-  MKL_NUM_THREADS: 8
   RUN_SLOW: yes
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
   SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
 jobs:
-  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
+  setup:
+    name: Setup
+    strategy:
+      matrix:
+        machines: [multi-gpu-docker, single-gpu-docker]
+    runs-on: ${{ matrix.machines }}
     container:
-      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      image: huggingface/transformers-all-latest-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
     steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
+    - name: Update clone
+      working-directory: /transformers
+      run: |
+        git fetch && git checkout ${{ github.sha }}
+    - name: Cleanup
+      working-directory: /transformers
+      run: |
+        rm -rf tests/__pycache__
+        rm -rf reports
+    - id: set-matrix
+      name: Identify models to test
+      working-directory: /transformers/tests
+      run: |
+        echo "::set-output name=matrix::$(python3 -c 'import os; x = list(filter(os.path.isdir, os.listdir(os.getcwd()))); x.sort(); print(x)')"
     - name: NVIDIA-SMI
       run: |
         nvidia-smi
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libsndfile1-dev git
-        pip install --upgrade pip
-        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-        pip install https://github.com/kpu/kenlm/archive/master.zip
-    - name: Are GPUs recognized by our DL frameworks
+    - name: GPU visibility
+      working-directory: /transformers
       run: |
         utils/print_env_pt.py
+        TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+        TF_CPP_MIN_LOG_LEVEL=3 python3 -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-    - name: Run all tests on GPU
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+  run_tests_gpu:
+    name: Model tests
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machines: [multi-gpu-docker, single-gpu-docker]
+    runs-on: ${{ matrix.machines }}
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+    - name: Echo folder ${{ matrix.folders }}
+      run: echo "${{ matrix.folders }}"
+    - name: Update clone
+      working-directory: /transformers
+      run: git fetch && git checkout ${{ github.sha }}
+    - name: Run all non-slow tests on GPU
+      working-directory: /transformers
+      run: python3 -m pytest -v --make-reports=${{ matrix.machines }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
     - name: Failure short reports
+      if: ${{ failure() }}
+      continue-on-error: true
+      run: cat /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+    - name: Test suite reports artifacts
       if: ${{ always() }}
-      run: cat reports/tests_torch_gpu_failures_short.txt
+      uses: actions/upload-artifact@v2
+      with:
+        name: ${{ matrix.machines }}_run_all_tests_gpu_${{ matrix.folders }}_test_reports
+        path: /transformers/reports/${{ matrix.machines }}_tests_gpu_${{ matrix.folders }}
+  run_examples_gpu:
+    name: Examples directory
+    runs-on: [self-hosted, single-gpu-docker]
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+    - name: Update clone
+      working-directory: /transformers
+      run: git fetch && git checkout ${{ github.sha }}
     - name: Run examples tests on GPU
-      if: ${{ always() }}
-      env:
-        OMP_NUM_THREADS: 16
-        MKL_NUM_THREADS: 16
-        RUN_SLOW: yes
-        HF_HOME: /mnt/cache
-        TRANSFORMERS_IS_CI: yes
+      working-directory: /transformers
       run: |
         pip install -r examples/pytorch/_tests_requirements.txt
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+        python3 -m pytest -v --make-reports=examples_gpu examples/pytorch
     - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/examples_torch_gpu_failures_short.txt
+      if: ${{ failure() }}
+      continue-on-error: true
+      run: cat /transformers/reports/examples_gpu/failures_short.txt
-    - name: Run all pipeline tests on GPU
-      if: ${{ always() }}
-      env:
-        RUN_PIPELINE_TESTS: yes
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: run_all_tests_torch_gpu_test_reports
-        path: reports
+        name: run_examples_gpu
+        path: /transformers/reports/examples_gpu
-  run_all_tests_flax_gpu:
-    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+  run_pipelines_torch_gpu:
+    name: PyTorch pipelines
+    strategy:
+      fail-fast: false
+      matrix:
+        machines: [multi-gpu-docker, single-gpu-docker]
+    runs-on: ${{ matrix.machines }}
     container:
-      image: tensorflow/tensorflow:2.4.1-gpu
+      image: huggingface/transformers-pytorch-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
     steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
+    - name: Update clone
+      working-directory: /transformers
+      run: git fetch && git checkout ${{ github.sha }}
-    - name: NVIDIA-SMI
-      continue-on-error: true
+    - name: Run all pipeline tests on GPU
+      working-directory: /transformers
+      env:
+        RUN_PIPELINE_TESTS: yes
       run: |
-        nvidia-smi
+        python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_torch_pipeline_gpu tests
-    - name: Install dependencies
-      run: |
-        pip install --upgrade pip
-        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
-        pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
-        pip install https://github.com/kpu/kenlm/archive/master.zip
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
-        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
-    - name: Run all tests on GPU
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
     - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_flax_gpu_failures_short.txt
+      if: ${{ failure() }}
+      continue-on-error: true
+      run: cat /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu/failures_short.txt
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: run_all_tests_flax_gpu_test_reports
-        path: reports
+        name: ${{ matrix.machines }}_run_tests_torch_pipeline_gpu
+        path: /transformers/reports/${{ matrix.machines }}_tests_torch_pipeline_gpu
-  run_all_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
+  run_pipelines_tf_gpu:
+    name: TensorFlow pipelines
+    strategy:
+      fail-fast: false
+      matrix:
+        machines: [multi-gpu-docker, single-gpu-docker]
+    runs-on: ${{ matrix.machines }}
     container:
-      image: tensorflow/tensorflow:2.4.1-gpu
+      image: huggingface/transformers-tensorflow-gpu
       options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
     steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
+    - name: Update clone
+      working-directory: /transformers
-    - name: NVIDIA-SMI
       run: |
-        nvidia-smi
+        git fetch && git checkout ${{ github.sha }}
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libsndfile1-dev git
-        pip install --upgrade pip
-        pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
-        pip install https://github.com/kpu/kenlm/archive/master.zip
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-    - name: Run all tests on GPU
-      env:
-        TF_NUM_INTEROP_THREADS: 1
-        TF_NUM_INTRAOP_THREADS: 16
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_tf_gpu_failures_short.txt
     - name: Run all pipeline tests on GPU
-      if: ${{ always() }}
+      working-directory: /transformers
-      env:
-        RUN_PIPELINE_TESTS: yes
-        TF_NUM_INTEROP_THREADS: 1
-        TF_NUM_INTRAOP_THREADS: 16
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_tf_pipeline_gpu_failures_short.txt
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: run_all_tests_tf_gpu_test_reports
-        path: reports
-  run_all_examples_torch_xla_tpu:
-    runs-on: [self-hosted, docker-tpu-test, tpu-v3-8]
-    container:
-      image: gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm
-      options: --privileged -v "/lib/libtpu.so:/lib/libtpu.so" -v /mnt/cache/.cache/huggingface:/mnt/cache/ --shm-size 16G
-    steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
-    - name: Install dependencies
-      run: |
-        pip install --upgrade pip
-        pip install .[testing]
-    - name: Are TPUs recognized by our DL frameworks
-      env:
-        XRT_TPU_CONFIG: localservice;0;localhost:51011
-      run: |
-        python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"
-    - name: Run example tests on TPU
-      env:
-        XRT_TPU_CONFIG: "localservice;0;localhost:51011"
-        MKL_SERVICE_FORCE_INTEL: "1" # See: https://github.com/pytorch/pytorch/issues/37377
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_xla_tpu examples/pytorch/test_xla_examples.py
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_torch_xla_tpu_failures_short.txt
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: run_all_examples_torch_xla_tpu
-        path: reports
-  run_all_tests_torch_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
-    - name: NVIDIA-SMI
-      continue-on-error: true
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libsndfile1-dev git
-        pip install --upgrade pip
-        pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
-        pip install https://github.com/kpu/kenlm/archive/master.zip
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        utils/print_env_pt.py
-    - name: Run all tests on GPU
-      env:
-        MKL_SERVICE_FORCE_INTEL: 1
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_torch_multi_gpu_failures_short.txt
-    - name: Run all pipeline tests on GPU
-      if: ${{ always() }}
       env:
         RUN_PIPELINE_TESTS: yes
       run: |
-        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+        python3 -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=${{ matrix.machines }}_tests_tf_pipeline_gpu tests
     - name: Failure short reports
       if: ${{ always() }}
-      run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+      run: |
+        cat /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu/failures_short.txt
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: run_all_tests_torch_multi_gpu_test_reports
-        path: reports
+        name: ${{ matrix.machines }}_run_tests_tf_pipeline_gpu
+        path: /transformers/reports/${{ matrix.machines }}_tests_tf_pipeline_gpu
-  run_all_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
-    - name: NVIDIA-SMI
-      continue-on-error: true
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libsndfile1-dev git
-        pip install --upgrade pip
-        pip install .[sklearn,testing,onnx,sentencepiece,tf-speech,vision]
-        pip install https://github.com/kpu/kenlm/archive/master.zip
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-        TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-    - name: Run all tests on GPU
-      env:
-        TF_NUM_INTEROP_THREADS: 1
-        TF_NUM_INTRAOP_THREADS: 16
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_tf_multi_gpu_failures_short.txt
-    - name: Run all pipeline tests on GPU
-      if: ${{ always() }}
-      env:
-        RUN_PIPELINE_TESTS: yes
-        TF_NUM_INTEROP_THREADS: 1
-        TF_NUM_INTRAOP_THREADS: 16
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: run_all_tests_tf_multi_gpu_test_reports
-        path: reports
-#  run_all_tests_flax_multi_gpu:
-#    runs-on: [self-hosted, docker-gpu, multi-gpu]
-#    container:
-#      image: tensorflow/tensorflow:2.4.1-gpu
-#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-#    steps:
-#    - name: Launcher docker
-#      uses: actions/checkout@v2
-#
-#    - name: NVIDIA-SMI
-#      run: |
-#        nvidia-smi
-#
-#    - name: Install dependencies
-#      run: |
-#        pip install --upgrade pip
-#        pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
-#        pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
-#
-#    - name: Are GPUs recognized by our DL frameworks
-#      run: |
-#        python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
-#        python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
-#
-#    - name: Run all tests on GPU
-#      run: |
-#        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
-#
-#    - name: Failure short reports
-#      if: ${{ always() }}
-#      run: cat reports/tests_flax_gpu_failures_short.txt
-#
-#    - name: Test suite reports artifacts
-#      if: ${{ always() }}
-#      uses: actions/upload-artifact@v2
-#      with:
-#        name: run_all_tests_flax_gpu_test_reports
-#        path: reports
   run_all_tests_torch_cuda_extensions_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
+    name: Torch CUDA extension tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machines: [multi-gpu-docker, single-gpu-docker]
+    runs-on: ${{ matrix.machines }}
+    needs: setup
     container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
+      image: huggingface/transformers-pytorch-deepspeed-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
+    - name: Update clone
+      working-directory: /workspace/transformers
+      run: git fetch && git checkout ${{ github.sha }}
-    - name: NVIDIA-SMI
-      run: |
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libaio-dev
-        pip install --upgrade pip
-        pip install .[testing,deepspeed]
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        utils/print_env_pt.py
     - name: Run all tests on GPU
+      working-directory: /workspace/transformers
       run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+        python -m pytest -v --make-reports=${{ matrix.machines }}_tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
     - name: Failure short reports
-      if: ${{ always() }}
+      if: ${{ failure() }}
-      run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v2
-      with:
-        name: run_tests_torch_cuda_extensions_gpu_test_reports
-        path: reports
-  run_all_tests_torch_cuda_extensions_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    container:
-      image: nvcr.io/nvidia/pytorch:21.03-py3
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-    - name: Launcher docker
-      uses: actions/checkout@v2
-    - name: NVIDIA-SMI
+      continue-on-error: true
-      run: |
+      run: cat /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu/failures_short.txt
-        nvidia-smi
-    - name: Install dependencies
-      run: |
-        apt -y update && apt install -y libaio-dev
-        pip install --upgrade pip
-        rm -rf ~/.cache/torch_extensions/ # shared between conflicting builds
-        pip install .[testing,deepspeed,fairscale]
-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        utils/print_env_pt.py
-    - name: Run all tests on GPU
-      run: |
-        python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
-    - name: Failure short reports
-      if: ${{ always() }}
-      run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
     - name: Test suite reports artifacts
       if: ${{ always() }}
       uses: actions/upload-artifact@v2
       with:
-        name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
-        path: reports
+        name: ${{ matrix.machines }}_run_tests_torch_cuda_extensions_gpu_test_reports
+        path: /workspace/transformers/reports/${{ matrix.machines }}_tests_torch_cuda_extensions_gpu
   send_results:
     name: Send results to webhook
     runs-on: ubuntu-latest
     if: always()
-    needs: [
-      run_all_tests_torch_gpu,
-      run_all_tests_tf_gpu,
-      run_all_tests_torch_multi_gpu,
-      run_all_tests_tf_multi_gpu,
-      run_all_tests_torch_cuda_extensions_gpu,
-      run_all_tests_torch_cuda_extensions_multi_gpu
-    ]
+    needs: [setup, run_tests_gpu, run_examples_gpu, run_pipelines_tf_gpu, run_pipelines_torch_gpu, run_all_tests_torch_cuda_extensions_gpu]
     steps:
     - uses: actions/checkout@v2
     - uses: actions/download-artifact@v2
     - name: Send message to Slack
       env:
         CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
         CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
         CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+        CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
       run: |
         pip install slack_sdk
-        python utils/notification_service.py scheduled
+        python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
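The pivot of this rewrite is the `setup` job: instead of one hand-written job per framework, it discovers the per-model test folders at runtime and fans them out through `fromJson(needs.setup.outputs.matrix)`. The `Identify models to test` one-liner, expanded for readability (the `::set-output` plumbing is GitHub Actions syntax; only the folder discovery is Python):

```python
# Expanded form of the one-liner in the "Identify models to test" step.
# Run from /transformers/tests, it prints a sorted list such as ['bert', 'gpt2', ...],
# which the workflow captures as a job output and feeds to the matrix via fromJson.
import os

folders = [entry for entry in os.listdir(os.getcwd()) if os.path.isdir(entry)]
folders.sort()
print(folders)
```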

.gitignore

@@ -160,4 +160,7 @@ tags
 .pre-commit*
 # .lock
 *.lock
+
+# DS_Store (MacOS)
+.DS_Store


@@ -124,7 +124,7 @@ issues to make sure that nobody is already working on the same thing. If you are
 unsure, it is always a good idea to open an issue to get some feedback.
 
 You will need basic `git` proficiency to be able to contribute to
-`transformers`. `git` is not the easiest tool to use but it has the greatest
+🤗 Transformers. `git` is not the easiest tool to use but it has the greatest
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.
@@ -175,34 +175,26 @@ Follow these steps to start contributing:
 5. Develop the features on your branch.
 
    As you work on the features, you should make sure that the test suite
-   passes:
+   passes. You should run the tests impacted by your changes like this:
+
+   ```bash
+   $ pytest tests/<TEST_TO_RUN>.py
+   ```
+
+   You can also run the full suite with the following command, but it takes
+   a beefy machine to produce a result in a decent amount of time now that
+   Transformers has grown a lot. Here is the command for it:
 
    ```bash
    $ make test
    ```
 
-   Note, that this command uses `-n auto` pytest flag, therefore, it will start as many parallel `pytest` processes as the number of your computer's CPU-cores, and if you have lots of those and a few GPUs and not a great amount of RAM, it's likely to overload your computer. Therefore, to run the test suite, you may want to consider using this command instead:
+   For more information about tests, check out the
+   [dedicated documentation](https://huggingface.co/docs/transformers/testing)
 
-   ```bash
-   $ python -m pytest -n 3 --dist=loadfile -s -v ./tests/
-   ```
-
-   Adjust the value of `-n` to fit the load your hardware can support.
-
-   `transformers` relies on `black` and `isort` to format its source code
-   consistently. After you make changes, format them with:
-
-   ```bash
-   $ make style
-   ```
-
-   `transformers` also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
-   control runs in CI, however you can also run the same checks with:
-
-   ```bash
-   $ make quality
-   ```
-
-   You can do the automatic style corrections and code verifications that can't be automated in one go:
+   🤗 Transformers relies on `black` and `isort` to format its source code
+   consistently. After you make changes, apply automatic style corrections and code verifications
+   that can't be automated in one go with:
 
    ```bash
    $ make fixup
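For contributors who prefer driving pytest from Python rather than the shell, the targeted run described above can also be expressed with `pytest.main`, which accepts the same arguments as the CLI; the test path below is a placeholder, exactly as in the guide:

```python
# Programmatic equivalent of `pytest tests/<TEST_TO_RUN>.py`; the path is a placeholder.
import sys

import pytest

# pytest.main takes the same argument list as the command line and returns an exit code.
exit_code = pytest.main(["tests/<TEST_TO_RUN>.py", "-v"])
sys.exit(exit_code)
```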
@@ -210,16 +202,55 @@ Follow these steps to start contributing:
    This target is also optimized to only work with files modified by the PR you're working on.
 
-   If you're modifying documents under `docs/source`, make sure to validate that
-   they can still be built. This check also runs in CI. To run a local check
-   make sure you have installed the documentation builder requirements, by
-   running `pip install .[tf,torch,docs]` once from the root of this repository
-   and then run:
+   If you prefer to run the checks one after the other, the following command apply the
+   style corrections:
 
    ```bash
-   $ make docs
+   $ make style
    ```
 
+   🤗 Transformers also uses `flake8` and a few custom scripts to check for coding mistakes. Quality
+   control runs in CI, however you can also run the same checks with:
+
+   ```bash
+   $ make quality
+   ```
+
+   Finally we have a lot of scripts that check we didn't forget to update
+   some files when adding a new model, that you can run with
+
+   ```bash
+   $ make repo-consistency
+   ```
+
+   To learn more about those checks and how to fix any issue with them, check out the
+   [documentation](https://huggingface.co/docs/transformers/pr_checks)
+
+   If you're modifying documents under `docs/source`, make sure to validate that
+   they can still be built. This check also runs in CI. To run a local check
+   make sure you have installed the documentation builder requirements. First you will need to clone the
+   repository containing our tools to build the documentation:
+
+   ```bash
+   $ pip install git+https://github.com/huggingface/doc-builder
+   ```
+
+   Then, make sure you have all the dependencies to be able to build the doc with:
+
+   ```bash
+   $ pip install ".[docs]"
+   ```
+
+   Finally run the following command from the root of the repository:
+
+   ```bash
+   $ doc-builder build transformers docs/source/ --build_dir ~/tmp/test-build
+   ```
+
+   This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
+   Markdown files with your favorite editor. You won't be able to see the final rendering on the website
+   before your PR is merged, we are actively working on adding a tool for this.
+
 Once you're happy with your changes, add changed files using `git add` and
 make a commit with `git commit` to record your changes locally:
@@ -277,7 +308,9 @@ Follow these steps to start contributing:
    example.
 
 7. Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
    the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
-   them by URL.
+   them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
+   If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+   to this dataset.
 
 See more about the checks run on a pull request in our [PR guide](pr_checks)
@@ -331,7 +364,7 @@ $ python -m unittest discover -s examples -t examples -v
 ### Style guide
 
-For documentation strings, `transformers` follows the [google style](https://google.github.io/styleguide/pyguide.html).
+For documentation strings, 🤗 Transformers follows the [google style](https://google.github.io/styleguide/pyguide.html).
 Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification)
 for more information.
@@ -355,7 +388,7 @@ You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉
 ### Syncing forked master with upstream (HuggingFace) master
 
-To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnessary notifications to the developers involved in these PRs,
+To avoid pinging the upstream repository which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
 when syncing the master branch of a forked repository, please, follow these steps:
 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
 2. If a PR is absolutely necessary, use the following steps after checking out your branch:


@@ -91,11 +91,6 @@ test-sagemaker: # install sagemaker dependencies in advance with pip install .[s
 	TEST_SAGEMAKER=True python -m pytest -n auto -s -v ./tests/sagemaker
 
-# Check that docs can build
-docs:
-	cd docs && make html SPHINXOPTS="-W -j 4"
-
 # Release stuff
 pre-release:


@@ -16,7 +16,7 @@ limitations under the License.
 <p align="center">
     <br>
-    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
     <br>
 <p>
 <p align="center">
@@ -52,7 +52,7 @@ limitations under the License.
 </h3>
 
 <h3 align="center">
-    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
 </h3>
 
 🤗 Transformers provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio.
@@ -198,7 +198,7 @@ You should install 🤗 Transformers in a [virtual environment](https://docs.pyt
 First, create a virtual environment with the version of Python you're going to use and activate it.
 
 Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax installation page](https://github.com/google/flax#quick-install) regarding the specific install command for your platform.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific install command for your platform.
 
 When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
@@ -229,27 +229,29 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/docs/transformers/model_summary) for a high-level summary of each them):
 1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
-1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bertgeneration)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/bigbird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot_small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
 1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
 1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
-1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta_v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
 1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
 1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
@@ -257,12 +259,12 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval
    for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
    Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoderdecoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[GPT](https://huggingface.co/docs/transformers/model_doc/gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
@@ -279,21 +281,26 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -301,22 +308,31 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
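The same check can also be done directly in code. A minimal sketch, not part of the README itself: it assumes both the PyTorch and TensorFlow backends are installed and uses `bert-base-uncased` purely as an example checkpoint.

```python
from transformers import AutoModel, AutoTokenizer, TFAutoModel

# Example checkpoint; any model id from the table works the same way.
checkpoint = "bert-base-uncased"

# The tokenizer is backed by the 🤗 Tokenizers library whenever a "fast"
# implementation exists; `is_fast` reports which one was loaded.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)

# Load the same architecture in PyTorch and in TensorFlow.
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint)

# Quick forward pass with the PyTorch model.
inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = pt_model(**inputs)
print(outputs.last_hidden_state.shape)
```

If a model lacks an implementation for a given framework, the corresponding `from_pretrained` call raises an error, which mirrors what the table reports.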


@@ -16,7 +16,7 @@ limitations under the License.
<p align="center">
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
<p align="center">
@@ -52,7 +52,7 @@ limitations under the License.
</h3>
<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models to perform tasks such as classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages. Our goal is to make cutting-edge NLP technology easy for everyone to use.
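For a concrete sense of that usage, a minimal sketch (our illustration, not part of this README section; the `pipeline` API downloads a default pretrained checkpoint for the task on first use):

```python
from transformers import pipeline

# Sentiment analysis with the task's default pretrained checkpoint.
classifier = pipeline("sentiment-analysis")
print(classifier("We are very happy to show you the 🤗 Transformers library."))
# e.g. [{'label': 'POSITIVE', 'score': ...}]
```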
@@ -215,33 +215,35 @@ the Flax, PyTorch, and TensorFlow installation pages explain how to install them with conda
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
@@ -257,18 +259,23 @@ installing these with conda from the Flax, PyTorch, and TensorFlow installation pages
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -280,21 +287,32 @@ installing these with conda from the Flax, PyTorch, and TensorFlow installation pages
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have a **detailed guide and templates** to help you add one. You can find them in the [`templates`](./templates) folder of this repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md), and contact the maintainers or open an issue to collect feedback before starting your PR.
To check whether each model has an implementation in Flax, PyTorch, or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
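As a minimal sketch of how any checkpoint in the list above is loaded (assuming `transformers` and `torch` are installed; `bert-base-uncased` is only an illustrative checkpoint name, not something this diff adds):

```python
from transformers import AutoModel, AutoTokenizer

# The Auto* classes resolve the right architecture from the checkpoint name,
# so the same few lines work for any model in the list above.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello world!", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)  # (batch_size, sequence_length, hidden_size)
```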


@@ -41,7 +41,7 @@ checkpoint: 检查点
<p align="center">
<br>
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
<br>
<p>
<p align="center">
@@ -77,7 +77,7 @@ checkpoint: 检查点
</h3>
<h3 align="center">
<a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models for text classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages. Its mission is to make state-of-the-art NLP easy for everyone to use.
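A minimal sketch of what that looks like in practice, using the library's high-level `pipeline` API (the task, input sentence, and printed score here are illustrative examples, not content from this README change):

```python
from transformers import pipeline

# pipeline() downloads and caches a default pretrained model for the task.
classifier = pipeline("sentiment-analysis")
result = classifier("We are very happy to show you the 🤗 Transformers library.")
print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.9997}]
```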
@@ -239,33 +239,35 @@ conda install -c huggingface transformers
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT-2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
@@ -281,18 +283,23 @@ conda install -c huggingface transformers
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (来自 UCLA NLP) 伴随论文 [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) 由 Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang 发布。
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (来自 Sea AI Labs) 伴随论文 [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) 由 Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng 发布。
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (来自 NVIDIA) 伴随论文 [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) 由 Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius 发布。
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。 1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
@@ -304,21 +311,32 @@ conda install -c huggingface transformers
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have added a **detailed guide and templates** to walk you through the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).
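As a minimal sketch of what the table's columns mean, the snippet below loads one checkpoint through each framework's auto class and checks whether its tokenizer is a fast (🤗 Tokenizers-backed) one. It assumes `transformers` is installed alongside PyTorch, TensorFlow, and Flax; `bert-base-uncased` is an arbitrary example checkpoint, not a recommendation.

```python
# Minimal sketch: assumes transformers plus torch, tensorflow, and flax are installed.
from transformers import AutoModel, AutoTokenizer, FlaxAutoModel, TFAutoModel

checkpoint = "bert-base-uncased"  # arbitrary example checkpoint

# A "fast" tokenizer is one backed by the 🤗 Tokenizers library.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
print(tokenizer.is_fast)

# The same checkpoint loaded with the PyTorch, TensorFlow, and Flax
# implementations, mirroring the framework columns of the table above.
pt_model = AutoModel.from_pretrained(checkpoint)
tf_model = TFAutoModel.from_pretrained(checkpoint)
flax_model = FlaxAutoModel.from_pretrained(checkpoint)
```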


@@ -53,7 +53,7 @@ user: 使用者
<p align="center">
    <br>
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_logo_name.png" width="400"/>
    <br>
<p>
<p align="center">
@@ -89,7 +89,7 @@ user: 使用者
</h3>
<h3 align="center">
    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
</h3>
🤗 Transformers provides thousands of pretrained models that support text classification, information extraction, question answering, summarization, translation, and text generation in more than 100 languages. Its mission is to make state-of-the-art NLP easy for everyone to use.
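As a small sketch of running one of those tasks end to end, the `pipeline` API below downloads a default pretrained checkpoint on first use; the task name and example sentence are arbitrary choices, not prescribed by this README.

```python
from transformers import pipeline

# Build a text-classification pipeline; a default pretrained
# checkpoint is downloaded on first use.
classifier = pipeline("sentiment-analysis")
result = classifier("We are very happy to include pipeline into the transformers repository.")
print(result)  # e.g. [{'label': 'POSITIVE', 'score': 0.99...}]
```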
@@ -251,33 +251,35 @@ conda install -c huggingface transformers
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[ConvNeXT](https://huggingface.co/docs/transformers/master/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](https://huggingface.co/docs/transformers/master/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
@@ -293,18 +295,23 @@ conda install -c huggingface transformers
1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](https://huggingface.co/docs/transformers/master/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. 1. **[MBart](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan. 1. **[MBart-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron_bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro. 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka. 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu. 1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel. 1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](https://huggingface.co/docs/transformers/master/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu. 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira. 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen. 1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](https://huggingface.co/docs/transformers/master/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](https://huggingface.co/docs/transformers/master/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
@@ -316,21 +323,32 @@ conda install -c huggingface transformers
1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](https://huggingface.co/docs/transformers/master/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released with the paper [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](https://huggingface.co/docs/transformers/master/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[ViTMAE](https://huggingface.co/docs/transformers/master/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/master/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[WavLM](https://huggingface.co/docs/transformers/master/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[XGLM](https://huggingface.co/docs/transformers/master/model_doc/xglm)** (from Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/master/model_doc/xlm-roberta-xl)** (from Facebook AI) released with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLS-R](https://huggingface.co/docs/transformers/master/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[YOSO](https://huggingface.co/docs/transformers/master/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
1. Want to contribute a new model? We have a **detailed guide and templates** to walk you through adding a new model. You can find them in the [`templates`](./templates) directory. Remember to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
To check whether each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).


@@ -15,6 +15,7 @@
# tests directory-specific settings - this file is run automatically
# by pytest before any tests are run
import doctest
import sys
import warnings
from os.path import abspath, dirname, join
@@ -22,7 +23,7 @@ from os.path import abspath, dirname, join
# allow having multiple repository checkouts and not needing to remember to rerun
# 'pip install -e .[dev]' when switching between checkouts and running tests.
git_repo_path = abspath(join(dirname(__file__), "src"))
sys.path.insert(1, git_repo_path)

# silence FutureWarning warnings in tests since often we can't act on them until
@@ -59,3 +60,19 @@ def pytest_sessionfinish(session, exitstatus):
    # If no tests are collected, pytest exits with code 5, which makes the CI fail.
    if exitstatus == 5:
        session.exitstatus = 0
# Doctest custom flag to ignore output.
IGNORE_RESULT = doctest.register_optionflag('IGNORE_RESULT')

OutputChecker = doctest.OutputChecker


class CustomOutputChecker(OutputChecker):
    def check_output(self, want, got, optionflags):
        if IGNORE_RESULT & optionflags:
            return True
        return OutputChecker.check_output(self, want, got, optionflags)


doctest.OutputChecker = CustomOutputChecker
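For context, the flag registered above only takes effect because this custom `OutputChecker` is installed. A minimal sketch of a docstring using it (the `greet` function is invented for illustration) could look like this: the `time.time()` output is ignored, while the last line is still checked exactly.

```python
def greet(name):
    """
    >>> import time
    >>> time.time()  # doctest: +IGNORE_RESULT
    0.0
    >>> greet("doc")
    'Hello, doc!'
    """
    return f"Hello, {name}!"
```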


@@ -0,0 +1,22 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
RUN python3 -m pip install --no-cache-dir -U torch tensorflow
RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop


@@ -0,0 +1,16 @@
FROM python:3.8
LABEL maintainer="Hugging Face"
RUN apt update
RUN git clone https://github.com/huggingface/transformers
RUN python3 -m pip install --no-cache-dir --upgrade pip && python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder ./transformers[dev,deepspeed]
RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y tesseract-ocr
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python -c "from torch import version; print(version.__version__.split('+')[0])")+cpu.html
RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
RUN python3 -m pip install --no-cache-dir pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN doc-builder build transformers transformers/docs/source --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean --version pr_$PR_NUMBER
RUN rm -rf doc-build-dev


@@ -0,0 +1,21 @@
FROM nvcr.io/nvidia/pytorch:21.03-py3
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt -y update
RUN apt install -y libaio-dev
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[testing,deepspeed]
RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
DS_BUILD_CPU_ADAM=1 DS_BUILD_AIO=1 DS_BUILD_UTILS=1 python3 -m pip install -e . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
RUN python3 -c "from deepspeed.launcher.runner import main"


@@ -1,30 +1,26 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing]
# If set to nothing, will install the latest version
ARG PYTORCH=''
RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION
RUN python3 -m pip uninstall -y tensorflow flax
RUN python3 -m pip install --no-cache-dir torch-scatter -f https://data.pyg.org/whl/torch-$(python3 -c "from torch import version; print(version.__version__.split('+')[0])")+cu102.html
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract https://github.com/kpu/kenlm/archive/master.zip
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop


@@ -1,25 +1,23 @@
FROM nvidia/cuda:11.2.2-cudnn8-runtime-ubuntu20.04
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=master
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
# If set to nothing, will install the latest version
ARG TENSORFLOW=''
RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
RUN python3 -m pip uninstall -y torch flax
RUN python3 -m pip install -U "itsdangerous<2.1.0"
# When installing in editable mode, `transformers` is not recognized as a package.
# This line must be added in order for Python to be aware of transformers.
RUN cd transformers && python3 setup.py develop


@@ -1,19 +0,0 @@
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)


@@ -23,6 +23,12 @@ you can install them with the following command, at the root of the code repository:
pip install -e ".[docs]"
```
Then you need to install our special tool that builds the documentation:
```bash
pip install git+https://github.com/huggingface/hf-doc-utils
```
---
**NOTE**
@@ -31,88 +37,72 @@ check how they look like before committing for instance). You don't have to commit
---
## Packages installed
Here's an overview of all the packages installed. If you ran the previous command installing all packages from
`requirements.txt`, you do not need to run the following commands.
Building it requires the package `sphinx` that you can
install using:
```bash
pip install -U sphinx
```
You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by
[Read The Docs](https://readthedocs.org/). You can install it using the following command:
```bash
pip install sphinx_rtd_theme
```
The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text:
```bash
pip install recommonmark
```
## Building the documentation

Once you have set up `hf-doc-utils` and the additional packages, you can generate the documentation by
typing the following command:

```bash
hf-doc-utils build transformers docs/source/ --build_dir ~/tmp/test-build
```

You can adapt the `--build_dir` to set any temporary folder that you prefer. This command will create it and generate
the MDX files that will be rendered as the documentation on the main website. You can inspect them in your favorite
Markdown editor.
---
**NOTE**

For now, it's not possible to preview locally how the final documentation will look. Once you have opened a PR, a bot
will add a comment with a link to the documentation built from your changes.

---
## Adding a new element to the navigation bar

Accepted files are Markdown (.md or .mdx). Create a file with its extension and put it in the source directory. You can
then link it to the toc-tree by putting the filename without the extension in the
[`_toctree.yml`](https://github.com/huggingface/transformers/blob/master/docs/source/_toctree.yml) file.
## Renaming section headers and moving sections

It helps to keep the old links working when renaming section headers and/or moving sections from one document to
another. This is because the old links are likely to be used in Issues, Forums and Social media, and it makes for a
much better user experience if users reading them months later can still easily navigate to the originally intended
information.
Therefore we simply keep a little map of moved sections at the end of the document where the original section was. The key is to preserve the original anchor.
So if you renamed a section from "Section A" to "Section B", you can add at the end of the file:
```
Sections that were moved:
[ <a href="#section-b">Section A</a><a id="section-a"></a> ]
```
and of course if you moved it to another file, then:
```
Sections that were moved:
[ <a href="../new-file#section-b">Section A</a><a id="section-a"></a> ]
```
Use the relative style to link to the new file so that the versioned docs continue to work.
For an example of a rich moved-sections map, please see the very end of [the Trainer doc](https://github.com/huggingface/transformers/blob/master/docs/source/main_classes/trainer.mdx).
## Writing Documentation - Specification

The `huggingface/transformers` documentation follows the
[Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style for
docstrings, although we can write them directly in Markdown.
### Adding a new tutorial

Adding a new tutorial or section is done in two steps:

- Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md).
- Link that file in `./source/_toctree.yml` on the correct toc-tree.

Make sure to put your new file under the proper section. It's unlikely to go in the first section (*Get Started*), so
depending on the intended targets (beginners, more advanced users or researchers) it should go in section two, three or
@@ -122,8 +112,8 @@ four.
When adding a new model:

- Create a file `xxx.mdx` under `./source/model_doc` (don't hesitate to copy an existing file as template).
- Link that file in `./source/_toctree.yml`.
- Write a short overview of the model:
  - Overview with paper & authors
  - Paper abstract
@@ -137,64 +127,82 @@ When adding a new model:
- PyTorch head models
- TensorFlow base model
- TensorFlow head models
- Flax base model
- Flax head models

These classes should be added using our Markdown syntax. Usually as follows:

```
## XXXConfig

[[autodoc]] XXXConfig
```
This will include every public method of the configuration that is documented. If for some reason you wish for a method
not to be displayed in the documentation, you can do so by specifying which methods should be in the docs:

```
## XXXTokenizer

[[autodoc]] XXXTokenizer
    - build_inputs_with_special_tokens
    - get_special_tokens_mask
    - create_token_type_ids_from_sequences
    - save_vocabulary
```

If you just want to add a method that is not documented (for instance magic methods like `__call__` are not documented
by default) you can put the list of methods to add in a list that contains `all`:

```
## XXXTokenizer

[[autodoc]] XXXTokenizer
    - all
    - __call__
```
### Writing source documentation

Values that should be put in `code` should be surrounded by backticks: \`like so\`. Note that argument names and
objects like True, None or any strings should usually be put in `code`.
When mentioning a class, function or method, it is recommended to use our syntax for internal links so that our tool
adds a link to its documentation with this syntax: \[\`XXXClass\`\] or \[\`function\`\]. This requires the class or
function to be in the main package.

If you want to create a link to some internal class or function, you need to provide its path. For instance:
\[\`file_utils.ModelOutput\`\]. This will be converted into a link with `file_utils.ModelOutput` in the description. To
get rid of the path and only keep the name of the object you are linking to in the description, add a ~:
\[\`~file_utils.ModelOutput\`\] will generate a link with `ModelOutput` in the description.

The same works for methods, so you can use either \[\`XXXClass.method\`\] or \[\`~XXXClass.method\`\].
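As a quick illustration, here is a hypothetical docstring using this link syntax (the `summarize` function is invented; `PreTrainedModel` and `PreTrainedTokenizer.encode` are real names from the main package):

```python
def summarize(model, text):
    """
    Summarizes `text` with a [`PreTrainedModel`].

    Args:
        model ([`PreTrainedModel`]): The model used for generation. Inputs should be prepared with
            [`~PreTrainedTokenizer.encode`] beforehand.
        text (`str`): The text to summarize.
    """
    ...
```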
#### Defining arguments in a method

Arguments should be defined with the `Args:` (or `Arguments:` or `Parameters:`) prefix, followed by a line return and
an indentation. The argument should be followed by its type, with its shape if it is a tensor, a colon and its
description:

```
Args:
    n_layers (`int`): The number of layers of the model.
```
If the description is too long to fit in one line, another indentation is necessary before writing the description
after the argument.
Here's an example showcasing everything so far:

```
Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`AlbertTokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
```
For optional arguments or arguments with defaults we follow the following syntax: imagine we have a function with the
@@ -208,93 +216,183 @@ then its documentation should look like this:
```
Args:
    x (`str`, *optional*):
        This argument controls ...
    a (`float`, *optional*, defaults to 1):
        This argument is used to ...
```
Note that we always omit the "defaults to \`None\`" when None is the default for any argument. Also note that even
if the first line describing your argument type and its default gets long, you can't break it on several lines. You can
however write as many lines as you want in the indented description (see the example above with `input_ids`).
#### Writing a multi-line code block

Multi-line code blocks can be useful for displaying examples. They are done between two lines of three backticks as
usual in Markdown:

````
```
# first line of code
# second line
# etc
```
````
We follow the [doctest](https://docs.python.org/3/library/doctest.html) syntax for the examples to automatically test
that the results stay consistent with the library.
#### Writing a return block

The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation.
The first line should be the type of the return, followed by a line return. No need to indent further for the elements
building the return.
Here's an example for a single value return:

```
Returns:
    `List[int]`: A list of integers in the range [0, 1] --- 1 for a special token, 0 for a sequence token.
```
Here's an example for tuple return, comprising several objects:

```
Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the configuration ([`BertConfig`]) and inputs:
    - **loss** (*optional*, returned when `masked_lm_labels` is provided) `torch.FloatTensor` of shape `(1,)` --
      Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
    - **prediction_scores** (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`) --
      Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
```
#### Adding an image

Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down
the repository are added. This includes images, videos and other non-text files. We prefer to leverage a hf.co hosted
`dataset` like the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place
these files and reference them by URL. We recommend putting them in the following dataset:
[huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
If you are an external contributor, feel free to add the images to your PR and ask a Hugging Face member to migrate
your images to this dataset.
## Styling the docstring

We have an automatic script running with the `make style` command that will make sure that:
- the docstrings fully take advantage of the line width
- all code examples are formatted using black, like the code of the Transformers library

This script may have some weird failures if you made a syntax mistake or if you uncover a bug. Therefore, it's
recommended to commit your changes before running `make style`, so you can revert the changes done by that script
easily.
# Testing documentation examples

Good documentation often comes with an example of how a specific function or class should be used.
Each model class should contain at least one example showcasing
how to use this model class in inference. *E.g.* the class [Wav2Vec2ForCTC](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC)
includes an example of how to transcribe speech to text in the
[docstring of its forward function](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2ForCTC.forward).
## Writing documentation examples
The syntax for Example docstrings can look as follows:
````
```
Example:

```python
>>> from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
>>> model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

>>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_ids = torch.argmax(logits, dim=-1)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids)
>>> transcription[0]
'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'
```
```
````
The docstring should give a minimal, clear example of how the respective model
is to be used in inference and also include the expected (ideally sensible)
output.
Often, readers will try out the example before even going through the function
or class definitions. Therefore it is of utmost importance that the example
works as expected.
## Docstring testing

To do so each example should be included in the doctests.
We use pytest's [doctest integration](https://docs.pytest.org/doctest.html) to verify that all of our examples run correctly.
For Transformers, the doctests are run on a daily basis via GitHub Actions as can be
seen [here](https://github.com/huggingface/transformers/actions/workflows/doctests.yml).

To include your example in the daily doctests, you need to add the filename that
contains the example docstring to the [documentation_tests.txt](../utils/documentation_tests.txt).
### For Python files
You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files):
```bash
python utils/prepare_for_doc_test.py src docs
```
Then you can run all the tests in the docstrings of a given file with the following command. For instance, here is how we test the modeling file of Wav2Vec2:
```bash
pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py -sv --doctest-continue-on-failure
```
If you want to isolate a specific docstring, just add `::` after the file name then type the whole path of the function/class/method whose docstring you want to test. For instance, here is how to just test the forward method of `Wav2Vec2ForCTC`:
```bash
pytest --doctest-modules src/transformers/models/wav2vec2/modeling_wav2vec2.py::transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward -sv --doctest-continue-on-failure
```
Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing:
```bash
python utils/prepare_for_doc_test.py src docs --remove_new_line
```
### For Markdown files
You will first need to run the following command (from the root of the repository) to prepare the doc file (doc-testing needs to add additional lines that we don't include in the doc source files):
```bash
python utils/prepare_for_doc_test.py src docs
```
Then you can test locally a given file with this command (here testing the quicktour):
```bash
pytest --doctest-modules docs/source/quicktour.mdx -sv --doctest-continue-on-failure --doctest-glob="*.mdx"
```
Once you're done, you can run the following command (still from the root of the repository) to undo the changes made by the first command before committing:
```bash
python utils/prepare_for_doc_test.py src docs --remove_new_line
```
### Writing doctests
Here are a few tips to help you debug the doctests and make them pass:
- The outputs of the code need to match the expected output **exactly**, so make sure you have the same outputs. In particular doctest will see a difference between single quotes and double quotes, or a missing parenthesis. The only exceptions to that rule are:
  * whitespace: one given whitespace (space, tabulation, new line) is equivalent to any number of whitespace, so you can add new lines where there are spaces to make your output more readable.
  * numerical values: you should never put more than 4 or 5 digits to expected results as different setups or library versions might get you slightly different results. `doctest` is configured to ignore any difference lower than the precision to which you wrote (so 1e-4 if you write 4 digits).
- Don't leave a block of code that is very long to execute. If you can't make it fast, you can either not use the doctest syntax on it (so that it's ignored), or if you want to use the doctest syntax to show the results, you can add a comment `# doctest: +SKIP` at the end of the lines of code too long to execute.
- Each line of code that produces a result needs to have that result written below. You can ignore an output if you don't want to show it in your code example by adding a comment ` # doctest: +IGNORE_RESULT` at the end of the line of code producing it.
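Putting these tips together, here is a small hypothetical doctest sketch (the tensor value and the `model.generate` call are stand-ins for illustration; `IGNORE_RESULT` is the custom flag registered in this repository's `conftest.py`):

```python
>>> import torch
>>> torch.manual_seed(0)  # returns a Generator we don't care about  # doctest: +IGNORE_RESULT
>>> loss = torch.tensor(0.123456789)
>>> round(loss.item(), 4)  # keep only a few digits so tiny numerical drift doesn't fail the test
0.1235
>>> model.generate(long_input)  # doctest: +SKIP
```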


@@ -11,28 +11,54 @@
     title: Glossary
   title: Get started
 - sections:
+  - local: pipeline_tutorial
+    title: Pipelines for inference
+  - local: autoclass_tutorial
+    title: Load pretrained instances with an AutoClass
+  - local: preprocessing
+    title: Preprocess
   - local: task_summary
     title: Summary of the tasks
   - local: model_summary
     title: Summary of the models
-  - local: preprocessing
-    title: Preprocessing data
   - local: training
     title: Fine-tuning a pretrained model
+  - local: accelerate
+    title: Distributed training with 🤗 Accelerate
   - local: model_sharing
-    title: Model sharing and uploading
+    title: Share a model
   - local: tokenizer_summary
     title: Summary of the tokenizers
   - local: multilingual
     title: Multi-lingual models
-  title: "Using 🤗 Transformers"
+  title: Tutorials
 - sections:
-  - local: examples
-    title: Examples
+  - local: create_a_model
+    title: Create a custom model
+  - local: multilingual
+    title: Inference for multilingual models
   - local: troubleshooting
     title: Troubleshooting
   - local: custom_datasets
     title: Fine-tuning with custom datasets
+  - sections:
+    - local: tasks/sequence_classification
+      title: Text classification
+    - local: tasks/token_classification
+      title: Token classification
+    - local: tasks/question_answering
+      title: Question answering
+    - local: tasks/language_modeling
+      title: Language modeling
+    - local: tasks/translation
+      title: Translation
+    - local: tasks/summarization
+      title: Summarization
+    - local: tasks/multiple_choice
+      title: Multiple choice
+    title: Fine-tune for downstream tasks
+  - local: run_scripts
+    title: Train with a script
   - local: notebooks
     title: "🤗 Transformers Notebooks"
   - local: sagemaker
@@ -60,10 +86,12 @@
   - local: debugging
     title: Debugging
   - local: serialization
-    title: Exporting transformers models
+    title: Exporting 🤗 Transformers models
+  - local: custom_models
+    title: Sharing custom models
   - local: pr_checks
     title: Checks on a Pull Request
-  title: Advanced guides
+  title: How-to guides
 - sections:
   - local: bertology
     title: BERTology
@@ -86,6 +114,8 @@
     title: Logging
   - local: main_classes/model
     title: Models
+  - local: main_classes/onnx
+    title: ONNX
   - local: main_classes/optimizer_schedules
     title: Optimization
   - local: main_classes/output
@@ -120,17 +150,17 @@
     title: BERT
   - local: model_doc/bertweet
     title: Bertweet
-  - local: model_doc/bertgeneration
+  - local: model_doc/bert-generation
     title: BertGeneration
-  - local: model_doc/bert_japanese
+  - local: model_doc/bert-japanese
     title: BertJapanese
-  - local: model_doc/bigbird
+  - local: model_doc/big_bird
     title: BigBird
   - local: model_doc/bigbird_pegasus
     title: BigBirdPegasus
   - local: model_doc/blenderbot
     title: Blenderbot
-  - local: model_doc/blenderbot_small
+  - local: model_doc/blenderbot-small
     title: Blenderbot Small
   - local: model_doc/bort
     title: BORT
@@ -140,6 +170,8 @@
     title: CamemBERT
   - local: model_doc/canine
     title: CANINE
+  - local: model_doc/convnext
+    title: ConvNeXT
   - local: model_doc/clip
     title: CLIP
   - local: model_doc/convbert
@@ -148,9 +180,11 @@
     title: CPM
   - local: model_doc/ctrl
     title: CTRL
+  - local: model_doc/data2vec
+    title: Data2Vec
   - local: model_doc/deberta
     title: DeBERTa
-  - local: model_doc/deberta_v2
+  - local: model_doc/deberta-v2
     title: DeBERTa-v2
   - local: model_doc/deit
     title: DeiT
@@ -164,7 +198,7 @@
     title: DPR
   - local: model_doc/electra
     title: ELECTRA
-  - local: model_doc/encoderdecoder
+  - local: model_doc/encoder-decoder
     title: Encoder Decoder Models
   - local: model_doc/flaubert
     title: FlauBERT
@@ -175,7 +209,7 @@
   - local: model_doc/funnel
     title: Funnel Transformer
   - local: model_doc/herbert
-    title: herBERT
+    title: HerBERT
   - local: model_doc/ibert
     title: I-BERT
   - local: model_doc/imagegpt
@@ -196,14 +230,18 @@
     title: LXMERT
   - local: model_doc/marian
     title: MarianMT
+  - local: model_doc/maskformer
+    title: MaskFormer
   - local: model_doc/m2m_100
     title: M2M100
   - local: model_doc/mbart
     title: MBart and MBart-50
-  - local: model_doc/megatron_bert
+  - local: model_doc/megatron-bert
     title: MegatronBERT
   - local: model_doc/megatron_gpt2
     title: MegatronGPT2
+  - local: model_doc/mluke
+    title: MLUKE
   - local: model_doc/mobilebert
     title: MobileBERT
   - local: model_doc/mluke
@@ -212,7 +250,9 @@
     title: MPNet
   - local: model_doc/mt5
     title: MT5
-  - local: model_doc/gpt
+  - local: model_doc/nystromformer
+    title: Nyströmformer
+  - local: model_doc/openai-gpt
     title: OpenAI GPT
   - local: model_doc/gpt2
     title: OpenAI GPT2
@@ -228,12 +268,18 @@
     title: Pegasus
   - local: model_doc/phobert
     title: PhoBERT
+  - local: model_doc/plbart
+    title: PLBart
+  - local: model_doc/poolformer
+    title: PoolFormer
   - local: model_doc/prophetnet
     title: ProphetNet
   - local: model_doc/qdqbert
     title: QDQBert
   - local: model_doc/rag
     title: RAG
+  - local: model_doc/realm
+    title: REALM
   - local: model_doc/reformer
     title: Reformer
   - local: model_doc/rembert
@@ -248,9 +294,9 @@
     title: SegFormer
   - local: model_doc/sew
     title: SEW
-  - local: model_doc/sew_d
+  - local: model_doc/sew-d
     title: SEW-D
-  - local: model_doc/speechencoderdecoder
+  - local: model_doc/speech-encoder-decoder
     title: Speech Encoder Decoder Models
   - local: model_doc/speech_to_text
     title: Speech2Text
@@ -260,40 +306,58 @@
     title: Splinter
   - local: model_doc/squeezebert
     title: SqueezeBERT
+  - local: model_doc/swin
+    title: Swin Transformer
   - local: model_doc/t5
     title: T5
   - local: model_doc/t5v1.1
     title: T5v1.1
   - local: model_doc/tapas
     title: TAPAS
-  - local: model_doc/transformerxl
+  - local: model_doc/transfo-xl
     title: Transformer XL
   - local: model_doc/trocr
     title: TrOCR
   - local: model_doc/unispeech
     title: UniSpeech
-  - local: model_doc/unispeech_sat
+  - local: model_doc/unispeech-sat
     title: UniSpeech-SAT
-  - local: model_doc/visionencoderdecoder
+  - local: model_doc/vilt
+    title: ViLT
+  - local: model_doc/vision-encoder-decoder
     title: Vision Encoder Decoder Models
-  - local: model_doc/vision_text_dual_encoder
+  - local: model_doc/vision-text-dual-encoder
     title: Vision Text Dual Encoder
   - local: model_doc/vit
     title: Vision Transformer (ViT)
+  - local: model_doc/vit_mae
+    title: ViTMAE
   - local: model_doc/visual_bert
     title: VisualBERT
   - local: model_doc/wav2vec2
     title: Wav2Vec2
+  - local: model_doc/wav2vec2_phoneme
+    title: Wav2Vec2Phoneme
+  - local: model_doc/wavlm
+    title: WavLM
+  - local: model_doc/xglm
+    title: XGLM
   - local: model_doc/xlm
     title: XLM
-  - local: model_doc/xlmprophetnet
+  - local: model_doc/xlm-prophetnet
     title: XLM-ProphetNet
-  - local: model_doc/xlmroberta
+  - local: model_doc/xlm-roberta
     title: XLM-RoBERTa
+  - local: model_doc/xlm-roberta-xl
+    title: XLM-RoBERTa-XL
   - local: model_doc/xlnet
     title: XLNet
   - local: model_doc/xlsr_wav2vec2
     title: XLSR-Wav2Vec2
+  - local: model_doc/xls_r
+    title: XLS-R
+  - local: model_doc/yoso
+    title: YOSO
   title: Models
 - sections:
   - local: internal/modeling_utils

docs/source/accelerate.mdx Normal file

@@ -0,0 +1,132 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Distributed training with 🤗 Accelerate
As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate/index.html) library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPUs on one machine or multiple GPUs across several machines. In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment.
## Setup
Get started by installing 🤗 Accelerate:
```bash
pip install accelerate
```
Then import and create an [`Accelerator`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator) object. `Accelerator` will automatically detect your type of distributed setup and initialize all the necessary components for training. You don't need to explicitly place your model on a device.
```py
>>> from accelerate import Accelerator
>>> accelerator = Accelerator()
```
## Prepare to accelerate
The next step is to pass all the relevant training objects to the [`prepare`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.prepare) method. This includes your training and evaluation DataLoaders, a model and an optimizer:
```py
>>> train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
...     train_dataloader, eval_dataloader, model, optimizer
... )
```
## Backward
The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`backward`](https://huggingface.co/docs/accelerate/accelerator.html#accelerate.Accelerator.backward) method:
```py
>>> for epoch in range(num_epochs):
...     for batch in train_dataloader:
...         outputs = model(**batch)
...         loss = outputs.loss
...         accelerator.backward(loss)
...         optimizer.step()
...         lr_scheduler.step()
...         optimizer.zero_grad()
...         progress_bar.update(1)
```
As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training!
```diff
+ from accelerate import Accelerator
  from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler

+ accelerator = Accelerator()

  model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
  optimizer = AdamW(model.parameters(), lr=3e-5)

- device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
- model.to(device)

+ train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare(
+     train_dataloader, eval_dataloader, model, optimizer
+ )

  num_epochs = 3
  num_training_steps = num_epochs * len(train_dataloader)
  lr_scheduler = get_scheduler(
      "linear",
      optimizer=optimizer,
      num_warmup_steps=0,
      num_training_steps=num_training_steps
  )

  progress_bar = tqdm(range(num_training_steps))

  model.train()
  for epoch in range(num_epochs):
      for batch in train_dataloader:
-         batch = {k: v.to(device) for k, v in batch.items()}
          outputs = model(**batch)
          loss = outputs.loss
-         loss.backward()
+         accelerator.backward(loss)
          optimizer.step()
          lr_scheduler.step()
          optimizer.zero_grad()
          progress_bar.update(1)
```
## Train
Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory.
### Train with a script
If you are running your training from a script, run the following command to create and save a configuration file:
```bash
accelerate config
```
Then launch your training with:
```bash
accelerate launch train.py
```
### Train with a notebook
🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. Wrap all the code responsible for training in a function, and pass it to `notebook_launcher`:
```py
>>> from accelerate import notebook_launcher
>>> notebook_launcher(training_function)
```
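If your training function takes arguments, you can pass them through `args` and set the number of processes explicitly (a minimal sketch; `training_function`'s signature and the argument names are up to you):
```py
>>> notebook_launcher(training_function, args=(model, train_dataloader), num_processes=2)
```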
For more information about 🤗 Accelerate and its rich features, refer to the [documentation](https://huggingface.co/docs/accelerate/index.html).

docs/source/add_new_pipeline.mdx Normal file

@@ -0,0 +1,140 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-->
# How to add a pipeline to 🤗 Transformers?
First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
as it makes compatibility easier (even through other languages via JSON). Those will be the `inputs` of the
pipeline (`preprocess`).
Then define the `outputs`. Same policy as the `inputs`. The simpler, the better. Those will be the outputs of
`postprocess` method.
Start by inheriting the base class `Pipeline` and implementing the 4 needed methods: `preprocess`,
`_forward`, `postprocess` and `_sanitize_parameters`.
```python
from transformers import Pipeline


class MyPipeline(Pipeline):
    def _sanitize_parameters(self, **kwargs):
        preprocess_kwargs = {}
        if "maybe_arg" in kwargs:
            preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
        return preprocess_kwargs, {}, {}

    def preprocess(self, inputs, maybe_arg=2):
        model_input = Tensor(inputs["input_ids"])
        return {"model_input": model_input}

    def _forward(self, model_inputs):
        # model_inputs == {"model_input": model_input}
        outputs = self.model(**model_inputs)
        # Maybe {"logits": Tensor(...)}
        return outputs

    def postprocess(self, model_outputs):
        best_class = model_outputs["logits"].softmax(-1)
        return best_class
```
This breakdown is structured to support relatively seamless CPU/GPU handling, while allowing pre/postprocessing to be done on the CPU in different threads.
`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
contain more information and is usually a `Dict`.
`_forward` is the implementation detail and is not meant to be called directly. `forward` is the preferred method to call as it contains safeguards to make sure everything is working on the expected device. Anything linked to a real model belongs in the `_forward` method; anything else goes in preprocess/postprocess.
The `postprocess` method will take the output of `_forward` and turn it into the final output that was decided
earlier.
`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
time `pipeline(...., maybe_arg=4)` or at call time `pipe = pipeline(...); output = pipe(...., maybe_arg=4)`.
The returns of `_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to `preprocess`,
`_forward` and `postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
allows keeping the default arguments in the function definition, which is always more "natural".
A classic example would be a `top_k` argument in the post processing in classification tasks.
```python
>>> pipe = pipeline("my-new-task")
>>> pipe("This is a test")
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05}
{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
>>> pipe("This is a test", top_k=2)
[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
```
In order to achieve that, we'll update our `postprocess` method with a default parameter of `5`, and edit
`_sanitize_parameters` to allow this new parameter.
```python
def postprocess(self, model_outputs, top_k=5):
    best_class = model_outputs["logits"].softmax(-1)
    # Add logic to handle top_k
    return best_class


def _sanitize_parameters(self, **kwargs):
    preprocess_kwargs = {}
    if "maybe_arg" in kwargs:
        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]

    postprocess_kwargs = {}
    if "top_k" in kwargs:
        # top_k is a postprocessing parameter, so route it to postprocess
        postprocess_kwargs["top_k"] = kwargs["top_k"]
    return preprocess_kwargs, {}, postprocess_kwargs
```
Try to keep the inputs/outputs very simple and ideally JSON-serializable, as it makes the pipeline usage very easy
without requiring users to understand new kinds of objects. It's also relatively common to support many different types
of arguments for ease of use (audio files can be filenames, URLs or raw bytes, for example).
## Adding it to the list of supported tasks
Go to `src/transformers/pipelines/__init__.py` and fill in `SUPPORTED_TASKS` with your newly created pipeline.
If possible it should provide a default model.
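As a rough sketch of what such an entry looks like (check the existing entries in that file for the current schema; `AutoModelForMyNewTask`, `TFAutoModelForMyNewTask` and the default checkpoint name below are placeholders):
```python
SUPPORTED_TASKS = {
    # ... existing tasks ...
    "my-new-task": {
        "impl": MyPipeline,
        "pt": (AutoModelForMyNewTask,) if is_torch_available() else (),
        "tf": (TFAutoModelForMyNewTask,) if is_tf_available() else (),
        "default": {"model": {"pt": "user/my-default-model"}},
        "type": "text",
    },
}
```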
## Adding tests
Create a new file `tests/test_pipelines_MY_PIPELINE.py` with examples modeled on the other tests.
The `run_pipeline_test` function will be very generic and run on small random models on every possible
architecture as defined by `model_mapping` and `tf_model_mapping`.
This is very important to test future compatibility, meaning if someone adds a new model for
`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
impossible to check for actual values, that's why there is a helper `ANY` that will simply attempt to match the
TYPE of the pipeline output.
You also *need* to implement 2 (ideally 4) tests.
- `test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
and test the pipeline outputs. The results should be the same as `test_small_model_tf`.
- `test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
and test the pipeline outputs. The results should be the same as `test_small_model_pt`.
- `test_large_model_pt` (`optional`): Tests the pipeline on a real checkpoint where the results are supposed to
make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
sure there is no drift in future releases.
- `test_large_model_tf` (`optional`): Tests the pipeline on a real checkpoint where the results are supposed to
make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
sure there is no drift in future releases.
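For instance, a small-model test could compare against types rather than values using `ANY` (a sketch; the exact import location of `ANY` may differ, see the existing pipeline tests):
```python
outputs = pipe("This is a test")
# ANY(str) / ANY(float) match any value of that type, since small random models give meaningless values
self.assertEqual(outputs, [{"label": ANY(str), "score": ANY(float)}])
```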

docs/source/add_new_pipeline.rst
@@ -1,143 +0,0 @@

docs/source/autoclass_tutorial.mdx Normal file

@@ -0,0 +1,104 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Load pretrained instances with an AutoClass
With so many different Transformer architectures, it can be challenging to create one for your checkpoint. As part of 🤗 Transformers' core philosophy to make the library easy, simple and flexible to use, an `AutoClass` automatically infers and loads the correct architecture from a given checkpoint. The `from_pretrained` method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different.
<Tip>
Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. For example, [BERT](https://huggingface.co/bert-base-uncased) is an architecture, while `bert-base-uncased` is a checkpoint. Model is a general term that can mean either architecture or checkpoint.
</Tip>
In this tutorial, learn to:
* Load a pretrained tokenizer.
* Load a pretrained feature extractor.
* Load a pretrained processor.
* Load a pretrained model.
## AutoTokenizer
Nearly every NLP task begins with a tokenizer. A tokenizer converts your input into a format that can be processed by the model.
Load a tokenizer with [`AutoTokenizer.from_pretrained`]:
```py
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
```
Then tokenize your input as shown below:
```py
>>> sequence = "In a hole in the ground there lived a hobbit."
>>> print(tokenizer(sequence))
{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```
## AutoFeatureExtractor
For audio and vision tasks, a feature extractor processes the audio signal or image into the correct input format.
Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:
```py
>>> from transformers import AutoFeatureExtractor
>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
... )
```
## AutoProcessor
Multimodal tasks require a processor that combines two types of preprocessing tools. For example, the [LayoutLMV2](model_doc/layoutlmv2) model requires a feature extractor to handle images and a tokenizer to handle text; a processor combines both of them.
Load a processor with [`AutoProcessor.from_pretrained`]:
```py
>>> from transformers import AutoProcessor
>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
```
## AutoModel
Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
```py
>>> from transformers import AutoModelForSequenceClassification
>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFAutoModelForSequenceClassification
>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
```
Easily reuse the same checkpoint to load an architecture for a different task:
```py
>>> from transformers import AutoModelForTokenClassification
>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFAutoModelForTokenClassification
>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
```
Generally, we recommend using the `AutoTokenizer` class and the `AutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, feature extractor and processor to preprocess a dataset for fine-tuning.

docs/source/benchmarks.mdx

@@ -12,15 +12,22 @@ specific language governing permissions and limitations under the License.
 # Benchmarks

+<Tip warning={true}>
+
+Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed
+and memory complexity of Transformer models.
+
+</Tip>
+
 [[open-in-colab]]

-Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.
+Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks.

-A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found [here](https://github.com/huggingface/transformers/tree/master/notebooks/05-benchmark.ipynb).
+A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found [here](https://github.com/huggingface/notebooks/tree/master/examples/benchmark.ipynb).

-## How to benchmark 🤗 Transformer models
+## How to benchmark 🤗 Transformers models

-The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformer models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_.
+The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly benchmark 🤗 Transformers models. The benchmark classes allow us to measure the _peak memory usage_ and _required time_ for both _inference_ and _training_.

 <Tip>
@@ -37,11 +44,12 @@ The benchmark classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] expect an
 >>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
 >>> benchmark = PyTorchBenchmark(args)
 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments

->>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> benchmark = TensorFlowBenchmark(args)
 ```
@@ -174,7 +182,9 @@ configurations must be inserted with the benchmark args as follows.
 ```py
 >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig

->>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = PyTorchBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)
@@ -244,7 +254,9 @@ bert-6-lay 8 512 1359
 ===PT-TF-SPLIT===
 >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig

->>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+>>> args = TensorFlowBenchmarkArguments(
+...     models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
+... )
 >>> config_base = BertConfig()
 >>> config_384_hid = BertConfig(hidden_size=384)
 >>> config_6_lay = BertConfig(num_hidden_layers=6)

docs/source/bertology.mdx Normal file

@@ -0,0 +1,36 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# BERTology
There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT
(that some call "BERTology"). Some good examples of this field are:
- BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
https://arxiv.org/abs/1905.05950
- Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
- What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
Manning: https://arxiv.org/abs/1906.04341
In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
help people access the inner representations, mainly adapted from the great work of Paul Michel
(https://arxiv.org/abs/1905.10650):
- accessing all the hidden-states of BERT/GPT/GPT-2,
- accessing all the attention weights for each head of BERT/GPT/GPT-2,
- retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
in https://arxiv.org/abs/1905.10650.
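For instance, here is a minimal sketch of accessing these outputs and pruning heads with BERT:
```python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased", output_hidden_states=True, output_attentions=True)

inputs = tokenizer("In a hole in the ground there lived a hobbit.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

print(len(outputs.hidden_states))  # 13: the embedding output plus one hidden state per layer
print(outputs.attentions[0].shape)  # (batch_size, num_heads, seq_len, seq_len)

# Remove heads 0 and 2 of the first layer
model.prune_heads({0: [0, 2]})
```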
To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/master/examples/research_projects/bertology/run_bertology.py), which extracts information from and prunes a model pre-trained on
GLUE.

docs/source/bertology.rst
@@ -1,38 +0,0 @@

docs/source/community.md

@@ -62,3 +62,4 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
 | [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
 | [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
+| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) |

docs/source/converting_tensorflow_models.mdx Normal file

@@ -0,0 +1,162 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Converting TensorFlow Checkpoints
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
that can be loaded using the `from_pretrained` methods of the library.
<Tip>
Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
transformers >= 2.3.0 installation.
The documentation below reflects the **transformers-cli convert** command format.
</Tip>
## BERT
You can convert any TensorFlow checkpoint for BERT (in particular [the pre-trained models released by Google](https://github.com/google-research/bert#pre-trained-models)) in a PyTorch save file by using the
[convert_bert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py) script.
This CLI takes as input a TensorFlow checkpoint (three files starting with `bert_model.ckpt`) and the associated
configuration file (`bert_config.json`), and creates a PyTorch model for this configuration, loads the weights from
the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
be imported using `from_pretrained()` (see example in [quicktour](quicktour) , [run_glue.py](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification/run_glue.py) ).
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
checkpoint (the three files starting with `bert_model.ckpt`) but be sure to keep the configuration file
(`bert_config.json`) and the vocabulary file (`vocab.txt`) as these are needed for the PyTorch model too.
To run this specific conversion script you will need to have TensorFlow and PyTorch installed (`pip install tensorflow`). The rest of the repository only requires PyTorch.
Here is an example of the conversion process for a pre-trained `BERT-Base Uncased` model:
```bash
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
```
You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/bert#pre-trained-models).
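Once converted, the checkpoint can be loaded back in Python. A minimal sketch (assuming the conversion above placed `pytorch_model.bin` next to `bert_config.json`; passing a state-dict file directly requires an explicit config):
```python
from transformers import BertConfig, BertForPreTraining

config = BertConfig.from_json_file("/path/to/bert/uncased_L-12_H-768_A-12/bert_config.json")
model = BertForPreTraining.from_pretrained(
    "/path/to/bert/uncased_L-12_H-768_A-12/pytorch_model.bin", config=config
)
```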
## ALBERT
Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
[convert_albert_original_tf_checkpoint_to_pytorch.py](https://github.com/huggingface/transformers/tree/master/src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py) script.
The CLI takes as input a TensorFlow checkpoint (three files starting with `model.ckpt-best`) and the accompanying
configuration file (`albert_config.json`), then creates and saves a PyTorch model. To run this conversion you will
need to have TensorFlow and PyTorch installed.
Here is an example of the conversion process for the pre-trained `ALBERT Base` model:
```bash
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
```
You can download Google's pre-trained models for the conversion [here](https://github.com/google-research/albert#pre-trained-models).
## OpenAI GPT
Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
is saved in the same format as the OpenAI pretrained model (see [here](https://github.com/openai/finetune-transformer-lm)):
```bash
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK]
```
## OpenAI GPT-2
Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see [here](https://github.com/openai/gpt-2))
```bash
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
```
## Transformer-XL
Here is an example of the conversion process for a pre-trained Transformer-XL model (see [here](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models))
```bash
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
transformers-cli convert --model_type transfo_xl \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config TRANSFO_XL_CONFIG] \
[--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
```
## XLNet
Here is an example of the conversion process for a pre-trained XLNet model:
```bash
export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export XLNET_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
--tf_checkpoint $XLNET_CHECKPOINT_PATH \
--config $XLNET_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK]
```
## XLM
Here is an example of the conversion process for a pre-trained XLM model:
```bash
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config XLM_CONFIG] \
[--finetuning_task_name XLM_FINETUNED_TASK]
```
## T5
Here is an example of the conversion process for a pre-trained T5 model:
```bash
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin
```

View File

@ -1,181 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Converting Tensorflow Checkpoints
=======================================================================================================================
A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
that can be loaded using the ``from_pretrained`` methods of the library.
.. note::
Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
transformers >= 2.3.0 installation.
The documentation below reflects the **transformers-cli convert** command format.
BERT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
<https://github.com/google-research/bert#pre-trained-models>`_) in a PyTorch save file by using the
:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
<src/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.
This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``) and the associated
configuration file (``bert_config.json``), and creates a PyTorch model for this configuration, loads the weights from
the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can
be imported using ``from_pretrained()`` (see example in :doc:`quicktour` , :prefix_link:`run_glue.py
<examples/pytorch/text-classification/run_glue.py>` ).
You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
checkpoint (the three files starting with ``bert_model.ckpt``) but be sure to keep the configuration file (\
``bert_config.json``) and the vocabulary file (``vocab.txt``) as these are needed for the PyTorch model too.
To run this specific conversion script you will need to have TensorFlow and PyTorch installed (``pip install
tensorflow``). The rest of the repository only requires PyTorch.
Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
.. code-block:: shell
export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
transformers-cli convert --model_type bert \
--tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
--config $BERT_BASE_DIR/bert_config.json \
--pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
You can download Google's pre-trained models for the conversion `here
<https://github.com/google-research/bert#pre-trained-models>`__.
ALBERT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
<src/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.
The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``) and the accompanying
configuration file (``albert_config.json``), then creates and saves a PyTorch model. To run this conversion you will
need to have TensorFlow and PyTorch installed.
Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:
.. code-block:: shell
export ALBERT_BASE_DIR=/path/to/albert/albert_base
transformers-cli convert --model_type albert \
--tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
--config $ALBERT_BASE_DIR/albert_config.json \
--pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
You can download Google's pre-trained models for the conversion `here
<https://github.com/google-research/albert#pre-trained-models>`__.
OpenAI GPT
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
save as the same format than OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\
)
.. code-block:: shell
export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
transformers-cli convert --model_type gpt \
--tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT_CONFIG] \
[--finetuning_task_name OPENAI_GPT_FINETUNED_TASK] \
OpenAI GPT-2
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here
<https://github.com/openai/gpt-2>`__)
.. code-block:: shell
export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
transformers-cli convert --model_type gpt2 \
--tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config OPENAI_GPT2_CONFIG] \
[--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
Transformer-XL
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here
<https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__)
.. code-block:: shell
export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
transformers-cli convert --model_type transfo_xl \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config TRANSFO_XL_CONFIG] \
[--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
XLNet
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLNet model:
.. code-block:: shell
export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
transformers-cli convert --model_type xlnet \
--tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
--config $TRANSFO_XL_CONFIG_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--finetuning_task_name XLNET_FINETUNED_TASK] \
XLM
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained XLM model:
.. code-block:: shell
export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
transformers-cli convert --model_type xlm \
--tf_checkpoint $XLM_CHECKPOINT_PATH \
--pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
[--config XLM_CONFIG] \
[--finetuning_task_name XLM_FINETUNED_TASK]
T5
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Here is an example of the conversion process for a pre-trained T5 model:
.. code-block:: shell
export T5=/path/to/t5/uncased_L-12_H-768_A-12
transformers-cli convert --model_type t5 \
--tf_checkpoint $T5/t5_model.ckpt \
--config $T5/t5_config.json \
--pytorch_dump_output $T5/pytorch_model.bin


@ -0,0 +1,323 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Create a custom model
An [`AutoClass`](model_doc/auto) automatically infers the model architecture and downloads pretrained configuration and weights. Generally, we recommend using an `AutoClass` to produce checkpoint-agnostic code. But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. In this guide, dive deeper into creating a custom model without an `AutoClass`. Learn how to:
- Load and customize a model configuration.
- Create a model architecture.
- Create a slow and fast tokenizer for text.
- Create a feature extractor for audio or image tasks.
- Create a processor for multimodal tasks.
## Configuration
A [configuration](main_classes/configuration) refers to a model's specific attributes. Each model configuration has different attributes; for instance, all NLP models have the `hidden_size`, `num_attention_heads`, `num_hidden_layers` and `vocab_size` attributes in common. These attributes specify the number of attention heads or hidden layers to construct a model with.
Get a closer look at [DistilBERT](model_doc/distilbert) by accessing [`DistilBertConfig`] to inspect its attributes:
```py
>>> from transformers import DistilBertConfig
>>> config = DistilBertConfig()
>>> print(config)
DistilBertConfig {
"activation": "gelu",
"attention_dropout": 0.1,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"model_type": "distilbert",
"n_heads": 12,
"n_layers": 6,
"pad_token_id": 0,
"qa_dropout": 0.1,
"seq_classif_dropout": 0.2,
"sinusoidal_pos_embds": false,
"transformers_version": "4.16.2",
"vocab_size": 30522
}
```
[`DistilBertConfig`] displays all the default attributes used to build a base [`DistilBertModel`]. All attributes are customizable, creating space for experimentation. For example, you can customize a default model to:
- Try a different activation function with the `activation` parameter.
- Use a higher dropout ratio for the attention probabilities with the `attention_dropout` parameter.
```py
>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
>>> print(my_config)
DistilBertConfig {
"activation": "relu",
"attention_dropout": 0.4,
"dim": 768,
"dropout": 0.1,
"hidden_dim": 3072,
"initializer_range": 0.02,
"max_position_embeddings": 512,
"model_type": "distilbert",
"n_heads": 12,
"n_layers": 6,
"pad_token_id": 0,
"qa_dropout": 0.1,
"seq_classif_dropout": 0.2,
"sinusoidal_pos_embds": false,
"transformers_version": "4.16.2",
"vocab_size": 30522
}
```
Pretrained model attributes can be modified in the [`~PretrainedConfig.from_pretrained`] function:
```py
>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
```
Once you are satisfied with your model configuration, you can save it with [`~PretrainedConfig.save_pretrained`]. Your configuration file is stored as a JSON file in the specified save directory:
```py
>>> my_config.save_pretrained(save_directory="./your_model_save_path")
```
To reuse the configuration file, load it with [`~PretrainedConfig.from_pretrained`]:
```py
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
```
<Tip>
You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! See the [configuration](main_classes/configuration) documentation for more details.
</Tip>
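For example, here is a minimal sketch of both options (the exact contents of the diff can vary with your `transformers` version):
```py
>>> config_dict = my_config.to_dict()  # the full configuration as a plain dictionary
>>> diff_dict = my_config.to_diff_dict()  # only the attributes that differ from the defaults
```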
## Model
The next step is to create a [model](main_classes/models). The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. Attributes like `num_hidden_layers` from the configuration are used to define the architecture. Every model shares the base class [`PreTrainedModel`] and a few common methods like resizing input embeddings and pruning self-attention heads. In addition, all models are also either a [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) or [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) subclass. This means models are compatible with each of their respective framework's usage.
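For instance, here is a small sketch of two of those shared methods (the argument values are arbitrary examples):
```py
>>> from transformers import DistilBertModel

>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
>>> embeddings = model.resize_token_embeddings(30528)  # e.g. after adding new tokens to a tokenizer
>>> model.prune_heads({0: [0, 2]})  # prune attention heads 0 and 2 in the first layer
```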
Load your custom configuration attributes into the model:
```py
>>> from transformers import DistilBertModel
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
>>> model = DistilBertModel(my_config)
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertModel
>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
>>> tf_model = TFDistilBertModel(my_config)
```
This creates a model with random values instead of pretrained weights. You won't be able to use this model for anything useful until you train it. Training is a costly and time-consuming process. It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training.
Create a pretrained model with [`~PreTrainedModel.from_pretrained`]:
```py
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
```
When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. However, you can still replace some or all of the default model configuration attributes with your own if you'd like:
```py
>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
===PT-TF-SPLIT===
>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
```
### Model heads
At this point, you have a base DistilBERT model which outputs the *hidden states*. The hidden states are passed as inputs to a model head to produce the final output. 🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation).
For example, [`DistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. The sequence classification head is a linear layer on top of the pooled outputs.
```py
>>> from transformers import DistilBertForSequenceClassification
>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertForSequenceClassification
>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
```
Easily reuse this checkpoint for another task by switching to a different model head. For a question answering task, you would use the [`DistilBertForQuestionAnswering`] model head. The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output.
```py
>>> from transformers import DistilBertForQuestionAnswering
>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
===PT-TF-SPLIT===
>>> from transformers import TFDistilBertForQuestionAnswering
>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
```
## Tokenizer
The last base class you need before using a model for textual data is a [tokenizer](main_classes/tokenizer) to convert raw text to tensors. There are two types of tokenizers you can use with 🤗 Transformers:
- [`PreTrainedTokenizer`]: a Python implementation of a tokenizer.
- [`PreTrainedTokenizerFast`]: a tokenizer from our Rust-based [🤗 Tokenizers](https://huggingface.co/docs/tokenizers/python/latest/) library. This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. The fast tokenizer also offers additional methods like *offset mapping*, which maps tokens to their original words or characters.
Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens.
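For instance, a minimal sketch of these shared methods (the decoded string shown is what `distilbert-base-uncased` produces):
```py
>>> from transformers import DistilBertTokenizerFast

>>> tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
>>> ids = tokenizer.encode("Hello world!")  # encode: text to token ids, special tokens included
>>> tokenizer.decode(ids)  # decode: token ids back to text
'[CLS] hello world! [SEP]'
>>> tokenizer.add_tokens(["[CUSTOM]"])  # add a new token; returns the number of tokens added
1
```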
<Tip warning={true}>
Not every model supports a fast tokenizer. Take a look at this [table](index#supported-frameworks) to check if a model has fast tokenizer support.
</Tip>
If you trained your own tokenizer, you can create one from your *vocabulary* file:
```py
>>> from transformers import DistilBertTokenizer
>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
```
It is important to remember that the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. Create a tokenizer with a pretrained model's vocabulary with the [`DistilBertTokenizer`] class:
```py
>>> from transformers import DistilBertTokenizer
>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
```
Create a fast tokenizer with the [`DistilBertTokenizerFast`] class:
```py
>>> from transformers import DistilBertTokenizerFast
>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
```
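As an illustration of the fast-only features mentioned above, here is a sketch of *offset mapping* (the exact spans shown are for this particular input):
```py
>>> encoding = fast_tokenizer("Hello world!", return_offsets_mapping=True)
>>> encoding["offset_mapping"]  # (start, end) character spans; (0, 0) marks special tokens
[(0, 0), (0, 5), (6, 11), (11, 12), (0, 0)]
```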
<Tip>
By default, [`AutoTokenizer`] will try to load a fast tokenizer. You can disable this behavior by setting `use_fast=False` in `from_pretrained`.
</Tip>
## Feature Extractor
A feature extractor processes audio or image inputs. It inherits from the base [`~feature_extraction_utils.FeatureExtractionMixin`] class, and may also inherit from the [`ImageFeatureExtractionMixin`] class for processing image features or the [`SequenceFeatureExtractor`] class for processing audio inputs.
Depending on whether you are working on an audio or vision task, create a feature extractor associated with the model you're using. For example, create a default [`ViTFeatureExtractor`] if you are using [ViT](model_doc/vit) for image classification:
```py
>>> from transformers import ViTFeatureExtractor
>>> vit_extractor = ViTFeatureExtractor()
>>> print(vit_extractor)
ViTFeatureExtractor {
"do_normalize": true,
"do_resize": true,
"feature_extractor_type": "ViTFeatureExtractor",
"image_mean": [
0.5,
0.5,
0.5
],
"image_std": [
0.5,
0.5,
0.5
],
"resample": 2,
"size": 224
}
```
<Tip>
If you aren't looking for any customization, just use the `from_pretrained` method to load a model's default feature extractor parameters.
</Tip>
Modify any of the [`ViTFeatureExtractor`] parameters to create your custom feature extractor:
```py
>>> from transformers import ViTFeatureExtractor
>>> my_vit_extractor = ViTFeatureExtractor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
>>> print(my_vit_extractor)
ViTFeatureExtractor {
"do_normalize": false,
"do_resize": true,
"feature_extractor_type": "ViTFeatureExtractor",
"image_mean": [
0.3,
0.3,
0.3
],
"image_std": [
0.5,
0.5,
0.5
],
"resample": "PIL.Image.BOX",
"size": 224
}
```
For audio inputs, you can create a [`Wav2Vec2FeatureExtractor`] and customize the parameters in a similar way:
```py
>>> from transformers import Wav2Vec2FeatureExtractor
>>> w2v2_extractor = Wav2Vec2FeatureExtractor()
>>> print(w2v2_extractor)
Wav2Vec2FeatureExtractor {
"do_normalize": true,
"feature_extractor_type": "Wav2Vec2FeatureExtractor",
"feature_size": 1,
"padding_side": "right",
"padding_value": 0.0,
"return_attention_mask": false,
"sampling_rate": 16000
}
```
## Processor
For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps a feature extractor and tokenizer into a single object. For example, let's use the [`Wav2Vec2Processor`] for an automatic speech recognition task (ASR). ASR transcribes audio to text, so you will need a feature extractor and a tokenizer.
Create a feature extractor to handle the audio inputs:
```py
>>> from transformers import Wav2Vec2FeatureExtractor
>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
```
Create a tokenizer to handle the text inputs:
```py
>>> from transformers import Wav2Vec2CTCTokenizer
>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
```
Combine the feature extractor and tokenizer in [`Wav2Vec2Processor`]:
```py
>>> from transformers import Wav2Vec2Processor
>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
```
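As a quick sanity check, here is a minimal sketch of calling the combined processor on a dummy waveform (the one-second array of zeros is just a stand-in for real audio):
```py
>>> import numpy as np

>>> dummy_audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16 kHz
>>> inputs = processor(dummy_audio, sampling_rate=16000, return_tensors="pt")
```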
With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. Each of these base classes is configurable, allowing you to use the specific attributes you want. You can easily set up a model for training or modify an existing pretrained model to fine-tune.


@ -54,6 +54,7 @@ The 🤗 Datasets library makes it simple to load a dataset:
```python
from datasets import load_dataset

imdb = load_dataset("imdb")
```
@ -61,8 +62,9 @@ This loads a `DatasetDict` object which you can index into to view an example:
```python
imdb["train"][0]
{
    "label": 1,
    "text": "Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as \"Teachers\". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is \"Teachers\". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!",
}
```
@ -74,6 +76,7 @@ model was trained with to ensure appropriately tokenized words. Load the DistilB
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
@ -99,6 +102,7 @@ batch. This is known as **dynamic padding**. You can do this with the `DataColla
```python
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
```
@ -108,6 +112,7 @@ Now load your model with the [`AutoModelForSequenceClassification`] class along
```python
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
```
@ -121,7 +126,7 @@ At this point, only three steps remain:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
@ -150,6 +155,7 @@ Make sure you set `return_tensors="tf"` to return `tf.Tensor` outputs instead of
```python
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")
```
@ -158,14 +164,14 @@ Next, convert your datasets to the `tf.data.Dataset` format with `to_tf_dataset`
```python
tf_train_dataset = tokenized_imdb["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_validation_dataset = tokenized_imdb["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "label"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
@ -182,17 +188,14 @@ batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
```
Load your model with the [`TFAutoModelForSequenceClassification`] class along with the number of expected labels:
```python
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
```
@ -200,6 +203,7 @@ Compile the model:
```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```
@ -234,14 +238,15 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no
Load the WNUT 17 dataset from the 🤗 Datasets library:
```python
>>> from datasets import load_dataset

>>> wnut = load_dataset("wnut_17")
```
A quick look at the dataset shows the labels associated with each word in the sentence:
```python
>>> wnut["train"][0]
{'id': '0',
 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
@ -251,21 +256,22 @@ wnut["train"][0]
View the specific NER tags by:
```python
>>> label_list = wnut["train"].features[f"ner_tags"].feature.names
>>> label_list
[
    "O",
    "B-corporation",
    "I-corporation",
    "B-creative-work",
    "I-creative-work",
    "B-group",
    "I-group",
    "B-location",
    "I-location",
    "B-person",
    "I-person",
    "B-product",
    "I-product",
]
```
@ -282,6 +288,7 @@ Now you need to tokenize the text. Load the DistilBERT tokenizer with an [`AutoT
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
@ -289,9 +296,9 @@ Since the input has already been split into words, set `is_split_into_words=True
subwords:
```python
>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
>>> tokens
['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']
```
@ -314,12 +321,14 @@ def tokenize_and_align_labels(examples):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
@ -336,6 +345,7 @@ Finally, pad your text and labels, so they are a uniform length:
```python
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)
```
@ -345,6 +355,7 @@ Load your model with the [`AutoModelForTokenClassification`] class along with th
```python
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
```
@ -352,7 +363,7 @@ Gather your training arguments in [`TrainingArguments`]:
```python
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
@ -387,6 +398,7 @@ Batch your examples together and pad your text and labels, so they are a uniform
```python
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
```
@ -412,6 +424,7 @@ Load the model with the [`TFAutoModelForTokenClassification`] class along with t
```python
from transformers import TFAutoModelForTokenClassification

model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_list))
```
@ -435,6 +448,7 @@ Compile the model:
```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```
@ -469,13 +483,14 @@ Load the SQuAD dataset from the 🤗 Datasets library:
```python
from datasets import load_dataset

squad = load_dataset("squad")
```
Take a look at an example from the dataset:
```python
>>> squad["train"][0]
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'id': '5733be284776f41900661182',
@ -490,6 +505,7 @@ Load the DistilBERT tokenizer with an [`AutoTokenizer`]:
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
@ -567,6 +583,7 @@ Batch the processed examples together:
```python
from transformers import default_data_collator

data_collator = default_data_collator
```
@ -576,6 +593,7 @@ Load your model with the [`AutoModelForQuestionAnswering`] class:
```python
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
```
@ -583,7 +601,7 @@ Gather your training arguments in [`TrainingArguments`]:
```python
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
@ -618,6 +636,7 @@ Batch the processed examples together with a TensorFlow default data collator:
```python
from transformers.data.data_collator import tf_default_collator

data_collator = tf_default_collator
```
@ -650,8 +669,8 @@ batch_size = 16
num_epochs = 2
total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)
```
@ -660,6 +679,7 @@ Load your model with the [`TFAutoModelForQuestionAnswering`] class:
```python
from transformers import TFAutoModelForQuestionAnswering

model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
```
@ -667,6 +687,7 @@ Compile the model:
```python
import tensorflow as tf

model.compile(optimizer=optimizer)
```


@ -0,0 +1,349 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Sharing custom models
The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder
of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs.
If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you
how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it
with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗
Transformers library.
We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the
[timm library](https://github.com/rwightman/pytorch-image-models/tree/master/timm) into a [`PreTrainedModel`].
## Writing a custom configuration
Before we dive into the model, let's first write its configuration. The configuration of a model is an object that
will contain all the necessary information to build the model. As we will see in the next section, the model can only
take a `config` to be initialized, so we really need that object to be as complete as possible.
In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. Different
configurations will then give us the different types of ResNets that are possible. We then just store those arguments,
after checking the validity of a few of them.
```python
from transformers import PretrainedConfig
from typing import List
class ResnetConfig(PretrainedConfig):
    model_type = "resnet"

    def __init__(
        self,
        block_type="bottleneck",
        layers: List[int] = [3, 4, 6, 3],
        num_classes: int = 1000,
        input_channels: int = 3,
        cardinality: int = 1,
        base_width: int = 64,
        stem_width: int = 64,
        stem_type: str = "",
        avg_down: bool = False,
        **kwargs,
    ):
        if block_type not in ["basic", "bottleneck"]:
            raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.")
        if stem_type not in ["", "deep", "deep-tiered"]:
            raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")

        self.block_type = block_type
        self.layers = layers
        self.num_classes = num_classes
        self.input_channels = input_channels
        self.cardinality = cardinality
        self.base_width = base_width
        self.stem_width = stem_width
        self.stem_type = stem_type
        self.avg_down = avg_down
        super().__init__(**kwargs)
```
The three important things to remember when writing your own configuration are the following:
- you have to inherit from `PretrainedConfig`,
- the `__init__` of your `PretrainedConfig` must accept any kwargs,
- those `kwargs` need to be passed to the superclass `__init__`.
The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other
constraints come from the fact that a `PretrainedConfig` has more fields than the ones you are setting. When reloading a
config with the `from_pretrained` method, those fields need to be accepted by your config and then sent to the
superclass.
Defining a `model_type` for your configuration (here `model_type="resnet"`) is not mandatory, unless you want to
register your model with the auto classes (see last section).
With this done, you can easily create and save your configuration like you would do with any other model config of the
library. Here is how we can create a resnet50d config and save it:
```py
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
resnet50d_config.save_pretrained("custom-resnet")
```
This will save a file named `config.json` inside the folder `custom-resnet`. You can then reload your config with the
`from_pretrained` method:
```py
resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
```
You can also use any other method of the [`PretrainedConfig`] class, like [`~PretrainedConfig.push_to_hub`] to
directly upload your config to the Hub.
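For instance, assuming you are logged in to the Hub (see the sharing section below), uploading the config is a one-liner:
```py
resnet50d_config.push_to_hub("custom-resnet")  # creates the repo if needed and uploads config.json
```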
## Writing a custom model
Now that we have our ResNet configuration, we can go on writing the model. We will actually write two: one that
extracts the hidden features from a batch of images (like [`BertModel`]) and one that is suitable for image
classification (like [`BertForSequenceClassification`]).
As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. The only
thing we need to do before writing this class is a map between the block types and actual block classes. Then the
model is defined from the configuration by passing everything to the `ResNet` class:
```py
from transformers import PreTrainedModel
from timm.models.resnet import BasicBlock, Bottleneck, ResNet
from .configuration_resnet import ResnetConfig
BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
class ResnetModel(PreTrainedModel):
    config_class = ResnetConfig

    def __init__(self, config):
        super().__init__(config)
        block_layer = BLOCK_MAPPING[config.block_type]
        self.model = ResNet(
            block_layer,
            config.layers,
            num_classes=config.num_classes,
            in_chans=config.input_channels,
            cardinality=config.cardinality,
            base_width=config.base_width,
            stem_width=config.stem_width,
            stem_type=config.stem_type,
            avg_down=config.avg_down,
        )

    def forward(self, tensor):
        return self.model.forward_features(tensor)
```
For the model that will classify images, we just change the forward method:
```py
import torch


class ResnetModelForImageClassification(PreTrainedModel):
    config_class = ResnetConfig

    def __init__(self, config):
        super().__init__(config)
        block_layer = BLOCK_MAPPING[config.block_type]
        self.model = ResNet(
            block_layer,
            config.layers,
            num_classes=config.num_classes,
            in_chans=config.input_channels,
            cardinality=config.cardinality,
            base_width=config.base_width,
            stem_width=config.stem_width,
            stem_type=config.stem_type,
            avg_down=config.avg_down,
        )

    def forward(self, tensor, labels=None):
        logits = self.model(tensor)
        if labels is not None:
            loss = torch.nn.functional.cross_entropy(logits, labels)
            return {"loss": loss, "logits": logits}
        return {"logits": logits}
```
In both cases, notice how we inherit from `PreTrainedModel` and call the superclass initialization with the `config`
(a bit like when you write a regular `torch.nn.Module`). The line that sets the `config_class` is not mandatory, unless
you want to register your model with the auto classes (see last section).
<Tip>
If your model is very similar to a model inside the library, you can re-use the same configuration as this model.
</Tip>
You can have your model return anything you want, but returning a dictionary like we did for
`ResnetModelForImageClassification`, with the loss included when labels are passed, will make your model directly
usable inside the [`Trainer`] class. Using another output format is fine as long as you are planning on using your own
training loop or another library for training.
Now that we have our model class, let's create one:
```py
resnet50d = ResnetModelForImageClassification(resnet50d_config)
```
Again, you can use any of the methods of [`PreTrainedModel`], like [`~PreTrainedModel.save_pretrained`] or
[`~PreTrainedModel.push_to_hub`]. We will use the second in the next section, and see how to push the model weights
with the code of our model. But first, let's load some pretrained weights inside our model.
In your own use case, you will probably be training your custom model on your own data. To keep things fast for this
tutorial, we will use the pretrained version of the resnet50d. Since our model is just a wrapper around it, it's going
to be easy to transfer those weights:
```py
import timm
pretrained_model = timm.create_model("resnet50d", pretrained=True)
resnet50d.model.load_state_dict(pretrained_model.state_dict())
```
Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the
code of the model is saved.
## Sending the code to the Hub
<Tip warning={true}>
This API is experimental and may have some slight breaking changes in the next releases.
</Tip>
First, make sure your model is fully defined in a `.py` file. It can rely on relative imports to some other files as
long as all the files are in the same directory (we don't support submodules for this feature yet). For our example,
we'll define a `modeling_resnet.py` file and a `configuration_resnet.py` file in a folder of the current working
directory named `resnet_model`. The configuration file contains the code for `ResnetConfig` and the modeling file
contains the code of `ResnetModel` and `ResnetModelForImageClassification`.
```
.
└── resnet_model
├── __init__.py
├── configuration_resnet.py
└── modeling_resnet.py
```
The `__init__.py` can be empty, it's just there so that Python detects `resnet_model` can be used as a module.
<Tip warning={true}>
If you copy modeling files from the library, you will need to replace all the relative imports at the top of the file
with imports from the `transformers` package.
</Tip>
Note that you can re-use (or subclass) an existing configuration/model.
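For example, here is a hypothetical sketch of subclassing an existing configuration to add one extra argument (`my_new_arg` is an invented name, for illustration only):
```py
from transformers import BertConfig


class MyBertConfig(BertConfig):
    model_type = "my-bert"

    def __init__(self, my_new_arg: int = 4, **kwargs):
        self.my_new_arg = my_new_arg  # invented argument, for illustration only
        super().__init__(**kwargs)
```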
To share your model with the community, follow those steps: first import the ResNet model and config from the newly
created files:
```py
from resnet_model.configuration_resnet import ResnetConfig
from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
```
Then you have to tell the library you want to copy the code files of those objects when using the `save_pretrained`
method and properly register them with a given Auto class (especially for models). Just run:
```py
ResnetConfig.register_for_auto_class()
ResnetModel.register_for_auto_class("AutoModel")
ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
```
Note that there is no need to specify an auto class for the configuration (there is only one auto class for them,
[`AutoConfig`]) but it's different for models. Your custom model could be suitable for many different tasks, so you
have to specify which one of the auto classes is the correct one for your model.
Next, let's create the config and models as we did before:
```py
resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
resnet50d = ResnetModelForImageClassification(resnet50d_config)
pretrained_model = timm.create_model("resnet50d", pretrained=True)
resnet50d.model.load_state_dict(pretrained_model.state_dict())
```
Now to send the model to the Hub, make sure you are logged in. Either run in your terminal:
```bash
huggingface-cli login
```
or from a notebook:
```py
from huggingface_hub import notebook_login
notebook_login()
```
You can then push to your own namespace (or an organization you are a member of) like this:
```py
resnet50d.push_to_hub("custom-resnet50d")
```
On top of the modeling weights and the configuration in JSON format, this also copied the modeling and
configuration `.py` files into the folder `custom-resnet50d` and uploaded the result to the Hub. You can check the result
in this [model repo](https://huggingface.co/sgugger/custom-resnet50d).
See the [sharing tutorial](model_sharing) for more information on the push to Hub method.
## Using a model with custom code
You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and
the `from_pretrained` method. All files and code uploaded to the Hub are scanned for malware (refer to the [Hub security](https://huggingface.co/docs/hub/security#malware-scanning) documentation for more information), but you should still
review the model code and author to avoid executing malicious code on your machine. Set `trust_remote_code=True` to use
a model with custom code:
```py
from transformers import AutoModelForImageClassification
model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
```
It is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not
update the code with some malicious new lines (unless you fully trust the authors of the models).
```py
commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
model = AutoModelForImageClassification.from_pretrained(
"sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
)
```
Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit
hash of any commit.
## Registering a model with custom code to the auto classes
If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own
model. This is different from pushing the code to the Hub in the sense that users will need to import your library to
get the custom models (as opposed to automatically downloading the model code from the Hub).
As long as your config has a `model_type` attribute that is different from existing model types, and your model
classes have the right `config_class` attributes, you can just add them to the auto classes like this:
```py
from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
AutoConfig.register("resnet", ResnetConfig)
AutoModel.register(ResnetConfig, ResnetModel)
AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
```
Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type`
of your custom config, and the first argument used when registering your custom models to any auto model class needs
to match the `config_class` of those models.
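Once registered, the auto classes can instantiate your custom model like any built-in one. A minimal sketch (assuming the registrations above have been run):
```py
from transformers import AutoModelForImageClassification

config = ResnetConfig(block_type="basic")
model = AutoModelForImageClassification.from_config(config)  # resolves to ResnetModelForImageClassification
```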

docs/source/debugging.mdx Normal file

@ -0,0 +1,335 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Debugging
## Multi-GPU Network Issues Debug
When training or running inference with `DistributedDataParallel` and multiple GPUs, if you run into issues with inter-communication between processes and/or nodes, you can use the following script to diagnose them.
```bash
wget https://raw.githubusercontent.com/huggingface/transformers/master/scripts/distributed/torch-distributed-gpu-test.py
```
For example, to test how 2 GPUs interact, run:
```bash
python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
```
If both processes can talk to each other and allocate GPU memory, each will print an OK status.
For more GPUs or nodes, adjust the arguments in the script.
You will find a lot more details inside the diagnostics script, including a recipe for running it in a SLURM environment.
For an additional level of debugging, add the `NCCL_DEBUG=INFO` environment variable as follows:
```bash
NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
```
This will dump a lot of NCCL-related debug information, which you can then search online if problems are reported. If you're not sure how to interpret the output, you can share the log file in an Issue.
## Underflow and Overflow Detection
<Tip>
This feature is currently only available for PyTorch.
</Tip>
<Tip>
For multi-GPU training it requires DDP (`torch.distributed.launch`).
</Tip>
<Tip>
This feature can be used with any `nn.Module`-based model.
</Tip>
If you start getting `loss=NaN` or the model exhibits some other abnormal behavior due to `inf` or `nan` in
activations or weights, you need to discover where the first underflow or overflow happens and what led to it. Luckily,
you can accomplish that easily by activating a special module that will do the detection automatically.
If you're using [`Trainer`], you just need to add:
```bash
--debug underflow_overflow
```
to the normal command line arguments, or pass `debug="underflow_overflow"` when creating the
[`TrainingArguments`] object.
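For example, a minimal sketch (the `output_dir` value here is just a placeholder):
```python
from transformers import TrainingArguments

args = TrainingArguments(output_dir="output", debug="underflow_overflow")
```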
If you're using your own training loop or another Trainer you can accomplish the same with:
```python
from transformers.debug_utils import DebugUnderflowOverflow
debug_overflow = DebugUnderflowOverflow(model)
```
[`~debug_utils.DebugUnderflowOverflow`] inserts hooks into the model that, immediately after each
forward call, test the input and output variables and the corresponding module's weights. As soon as `inf` or
`nan` is detected in at least one element of the activations or weights, the program will assert and print a report
like this (this one was caught with `google/mt5-small` under fp16 mixed precision):
```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min abs max metadata
encoder.block.1.layer.1.DenseReluDense.dropout Dropout
0.00e+00 2.57e+02 input[0]
0.00e+00 2.85e+02 output
[...]
encoder.block.2.layer.0 T5LayerSelfAttention
6.78e-04 3.15e+03 input[0]
2.65e-04 3.42e+03 output[0]
None output[1]
2.25e-01 1.00e+04 output[2]
encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.dropout Dropout
0.00e+00 8.76e+03 input[0]
0.00e+00 9.74e+03 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
```
The example output has been trimmed in the middle for brevity.
The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
the inputs and outputs were in the range of `1e4`. So when this training was done under fp16 mixed precision the very
last step overflowed (since under `fp16` the largest number before `inf` is `64e3`). To avoid overflows under
`fp16` the activations must remain way below `1e4`, because `1e4 * 1e4 = 1e8` so any matrix multiplication with
large activations is going to lead to a numerical overflow condition.
At the very start of the trace you can discover at which batch number the problem occurred (here `Detected inf/nan during batch_number=0` means the problem occurred on the first batch).
Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
for. If we look just at this frame:
```
encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
```
Here, `encoder.block.2.layer.1.layer_norm` indicates that it was a layer norm for the first layer of the second
block of the encoder, and the module whose `forward` was called is a `T5LayerNorm`.
Let's look at the last few frames of that report:
```
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min abs max metadata
[...]
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
```
The last frame reports on the `Dropout.forward` function, with the first entry for the only input and the second for the
only output. You can see that it was called from the `dropout` attribute inside the `DenseReluDense` class. We can see
that it happened during the first layer of the 2nd block, during the very first batch. Finally, the absolute largest
input element was `6.27e+04` and the output was `inf`.
You can see here that `T5DenseGatedGeluDense.forward` resulted in output activations whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout`, which rescales
the remaining elements after zeroing some of them, pushing the absolute max value above 64K and producing an
overflow (`inf`).
As you can see, it's the previous frames that we need to look into when the numbers start getting very large for fp16.
Let's match the report to the code from `models/t5/modeling_t5.py`:
```python
class T5DenseGatedGeluDense(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
        self.dropout = nn.Dropout(config.dropout_rate)
        self.gelu_act = ACT2FN["gelu_new"]

    def forward(self, hidden_states):
        hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
        hidden_linear = self.wi_1(hidden_states)
        hidden_states = hidden_gelu * hidden_linear
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.wo(hidden_states)
        return hidden_states
```
Now it's easy to see the `dropout` call, and all the previous calls as well.
Since the detection is happening in a forward hook, these reports are printed immediately after each `forward`
returns.
Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers
started to go up and most likely switch to the `fp32` mode here, so that the numbers don't overflow when multiplied
or summed up. Of course, there might be other solutions. For example, we could turn off `amp` temporarily if it's
enabled, after moving the original `forward` into a helper wrapper, like so:
```python
def _forward(self, hidden_states):
    hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
    hidden_linear = self.wi_1(hidden_states)
    hidden_states = hidden_gelu * hidden_linear
    hidden_states = self.dropout(hidden_states)
    hidden_states = self.wo(hidden_states)
    return hidden_states


import torch


def forward(self, hidden_states):
    if torch.is_autocast_enabled():
        with torch.cuda.amp.autocast(enabled=False):
            return self._forward(hidden_states)
    else:
        return self._forward(hidden_states)
```
Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
want to analyse the intermediary stages of any specific `forward` function as well. In such a case you can use the
`detect_overflow` helper function to inject the detector where you want it, for example:
```python
from transformers.debug_utils import detect_overflow


class T5LayerFF(nn.Module):
    [...]

    def forward(self, hidden_states):
        forwarded_states = self.layer_norm(hidden_states)
        detect_overflow(forwarded_states, "after layer_norm")
        forwarded_states = self.DenseReluDense(forwarded_states)
        detect_overflow(forwarded_states, "after DenseReluDense")
        return hidden_states + self.dropout(forwarded_states)
```
You can see that we added 2 of these and now we track whether `inf` or `nan` for `forwarded_states` was detected
somewhere in between.
Actually, the detector already reports these because each of the calls in the example above is an `nn.Module`, but
if you had some local direct calculations, this is how you'd track them.
Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
its default, e.g.:
```python
from transformers.debug_utils import DebugUnderflowOverflow
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
```
### Specific batch absolute min and max value tracing
The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.
Let's say you want to watch the absolute min and max values for all the ingredients of each `forward` call of a given
batch, and only do that for batches 1 and 3. Then you instantiate this class as:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
```
And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
Batches are 0-indexed.
This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
right to that area. Here is a sample truncated output for such a configuration:
```
*** Starting batch number=1 ***
abs min abs max metadata
shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.47e+04 input[0]
5.36e-05 7.92e+02 output
[...]
decoder.dropout Dropout
1.60e-07 2.27e+01 input[0]
0.00e+00 2.52e+01 output
decoder T5Stack
not a tensor output
lm_head Linear
1.01e-06 7.92e+02 weight
0.00e+00 1.11e+00 input[0]
6.06e-02 8.39e+01 output
T5ForConditionalGeneration
not a tensor output
*** Starting batch number=3 ***
abs min abs max metadata
shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.78e+04 input[0]
5.36e-05 7.92e+02 output
[...]
```
Here you will get a huge number of frames dumped - as many as there were forward calls in your model - so it may or may
not be what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example,
if a problem starts happening at batch number 150, you can dump traces for batches 149 and 150 and compare where the
numbers started to diverge.
You can also specify the batch number after which to stop the training, with:
```python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
```
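If you drive your own training loop, nothing else is needed: the hooks fire on every `forward` call of the model. A minimal sketch, where `model`, `dataloader` and `optimizer` are placeholders for your own objects:
```python
from transformers.debug_utils import DebugUnderflowOverflow

debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)

for batch in dataloader:  # placeholder dataloader yielding dicts of tensors, including labels
    outputs = model(**batch)  # the debug hooks run and print the traces here
    outputs.loss.backward()
    optimizer.step()
    optimizer.zero_grad()
```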

View File

@ -1,299 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Debugging
=======================================================================================================================
Underflow and Overflow Detection
-----------------------------------------------------------------------------------------------------------------------
.. note::
This feature is currently available for PyTorch-only.
.. note::
For multi-GPU training it requires DDP (``torch.distributed.launch``).
.. note::
This feature can be used with any ``nn.Module``-based model.
If you start getting ``loss=NaN`` or the model exhibits some other abnormal behavior due to ``inf`` or ``nan`` in
activations or weights, one needs to discover where the first underflow or overflow happens and what led to it. Luckily
you can accomplish that easily by activating a special module that will do the detection automatically.
If you're using :class:`~transformers.Trainer`, you just need to add:
.. code-block:: bash
--debug underflow_overflow
to the normal command line arguments, or pass ``debug="underflow_overflow"`` when creating the
:class:`~transformers.TrainingArguments` object.
If you're using your own training loop or another Trainer you can accomplish the same with:
.. code-block:: python
from transformers.debug_utils import DebugUnderflowOverflow
debug_overflow = DebugUnderflowOverflow(model)
:class:`~transformers.debug_utils.DebugUnderflowOverflow` inserts hooks into the model that immediately after each
forward call will test input and output variables and also the corresponding module's weights. As soon as ``inf`` or
``nan`` is detected in at least one element of the activations or weights, the program will assert and print a report
like this (this was caught with ``google/mt5-small`` under fp16 mixed precision):
.. code-block::
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min abs max metadata
encoder.block.1.layer.1.DenseReluDense.dropout Dropout
0.00e+00 2.57e+02 input[0]
0.00e+00 2.85e+02 output
[...]
encoder.block.2.layer.0 T5LayerSelfAttention
6.78e-04 3.15e+03 input[0]
2.65e-04 3.42e+03 output[0]
None output[1]
2.25e-01 1.00e+04 output[2]
encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.dropout Dropout
0.00e+00 8.76e+03 input[0]
0.00e+00 9.74e+03 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
The example output has been trimmed in the middle for brevity.
The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames,
the inputs and outputs were in the range of ``1e4``. So when this training was done under fp16 mixed precision the very
last step overflowed (since under ``fp16`` the largest number before ``inf`` is ``64e3``). To avoid overflows under
``fp16`` the activations must remain way below ``1e4``, because ``1e4 * 1e4 = 1e8`` so any matrix multiplication with
large activations is going to lead to a numerical overflow condition.
At the very start of the trace you can discover at which batch number the problem occurred (here ``Detected inf/nan
during batch_number=0`` means the problem occurred on the first batch).
Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting
for. If we look just at this frame:
.. code-block::
encoder.block.2.layer.1.layer_norm T5LayerNorm
8.69e-02 4.18e-01 weight
2.65e-04 3.42e+03 input[0]
1.79e-06 4.65e+00 output
Here, ``encoder.block.2.layer.1.layer_norm`` indicates that it was a layer norm for the first layer of the second
block of the encoder, and that the specific ``forward`` call was for ``T5LayerNorm``.
Let's look at the last few frames of that report:
.. code-block::
Detected inf/nan during batch_number=0
Last 21 forward frames:
abs min abs max metadata
[...]
encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
2.17e-07 4.50e+00 weight
1.79e-06 4.65e+00 input[0]
2.68e-06 3.70e+01 output
encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
8.08e-07 2.66e+01 weight
1.79e-06 4.65e+00 input[0]
1.27e-04 2.37e+02 output
encoder.block.2.layer.1.DenseReluDense.wo Linear
1.01e-06 6.44e+00 weight
0.00e+00 9.74e+03 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
1.79e-06 4.65e+00 input[0]
3.18e-04 6.27e+04 output
encoder.block.2.layer.1.dropout Dropout
3.18e-04 6.27e+04 input[0]
0.00e+00 inf output
The last frame reports on the ``Dropout.forward`` function, with the first entry for the only input and the second for
the only output. You can see that it was called from the ``dropout`` attribute inside the ``DenseReluDense`` class. We
can see that it happened during the first layer of the 2nd block, during the very first batch. Finally, the absolute
largest input element was ``6.27e+04`` and the same for the output was ``inf``.
You can see here, that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which renormalizes
the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
overflow (``inf``).
As you can see, it's the previous frames that we need to look into when the numbers start getting too large for fp16.
Let's match the report to the code from ``models/t5/modeling_t5.py``:
.. code-block:: python
class T5DenseGatedGeluDense(nn.Module):
def __init__(self, config):
super().__init__()
self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
self.dropout = nn.Dropout(config.dropout_rate)
self.gelu_act = ACT2FN["gelu_new"]
def forward(self, hidden_states):
hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
hidden_states = self.wo(hidden_states)
return hidden_states
Now it's easy to see the ``dropout`` call, and all the previous calls as well.
Since the detection is happening in a forward hook, these reports are printed immediately after each ``forward``
returns.
Going back to the full report, to act on it and fix the problem we need to go up a few frames, to where the numbers
started to grow, and most likely switch to ``fp32`` mode there, so that the numbers don't overflow when multiplied
or summed up. Of course, there might be other solutions. For example, we could turn off ``amp`` temporarily if it's
enabled, after moving the original ``forward`` into a helper wrapper, like so:
.. code-block:: python
def _forward(self, hidden_states):
hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
hidden_linear = self.wi_1(hidden_states)
hidden_states = hidden_gelu * hidden_linear
hidden_states = self.dropout(hidden_states)
hidden_states = self.wo(hidden_states)
return hidden_states
import torch
def forward(self, hidden_states):
if torch.is_autocast_enabled():
with torch.cuda.amp.autocast(enabled=False):
return self._forward(hidden_states)
else:
return self._forward(hidden_states)
Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may
want to analyse the intermediary stages of any specific ``forward`` function as well. In such a case you can use the
``detect_overflow`` helper function to inject the detector where you want it, for example:
.. code-block:: python
from transformers.debug_utils import detect_overflow
class T5LayerFF(nn.Module):
[...]
def forward(self, hidden_states):
forwarded_states = self.layer_norm(hidden_states)
detect_overflow(forwarded_states, "after layer_norm")
forwarded_states = self.DenseReluDense(forwarded_states)
detect_overflow(forwarded_states, "after DenseReluDense")
return hidden_states + self.dropout(forwarded_states)
You can see that we added 2 of these calls and now we track whether ``inf`` or ``nan`` for ``forwarded_states`` was
detected somewhere in between.
Actually, the detector already reports these, because each of the calls in the example above is an ``nn.Module``, but
if you had some local direct calculations, this is how you'd do it.
Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from
its default, e.g.:
.. code-block:: python
from transformers.debug_utils import DebugUnderflowOverflow
debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
Specific batch absolute min and max value tracing
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off.
Let's say you want to watch the absolute min and max values for all the ingredients of each ``forward`` call of a given
batch, and only do that for batches 1 and 3. Then you instantiate this class as:
.. code-block:: python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3])
And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does.
Batches are 0-indexed.
This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward
right to that area. Here is a sample truncated output for such configuration:
.. code-block::
*** Starting batch number=1 ***
abs min abs max metadata
shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.47e+04 input[0]
5.36e-05 7.92e+02 output
[...]
decoder.dropout Dropout
1.60e-07 2.27e+01 input[0]
0.00e+00 2.52e+01 output
decoder T5Stack
not a tensor output
lm_head Linear
1.01e-06 7.92e+02 weight
0.00e+00 1.11e+00 input[0]
6.06e-02 8.39e+01 output
T5ForConditionalGeneration
not a tensor output
*** Starting batch number=3 ***
abs min abs max metadata
shared Embedding
1.01e-06 7.92e+02 weight
0.00e+00 2.78e+04 input[0]
5.36e-05 7.92e+02 output
[...]
Here you will get a huge number of frames dumped - as many as there were forward calls in your model - so it may or may
not be what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. For example,
if a problem starts happening at batch number 150, you can dump traces for batches 149 and 150 and compare where the
numbers started to diverge.
You can also specify the batch number after which to stop the training, with:
.. code-block:: python
debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1,3], abort_after_batch_num=3)

View File

@ -1 +0,0 @@
../../examples/README.md

View File

@ -0,0 +1,70 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Using tokenizers from 🤗 Tokenizers
The [`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. The tokenizers obtained from the 🤗 Tokenizers library can be
loaded very simply into 🤗 Transformers.
Before getting into the specifics, let's start by creating a dummy tokenizer in a few lines:
```python
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace
>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
```
We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
a JSON file for future re-use.
## Loading directly from the tokenizer object
Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
[`PreTrainedTokenizerFast`] class allows for easy instantiation, by accepting the instantiated
*tokenizer* object as an argument:
```python
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
```
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
page](main_classes/tokenizer) for more information.
## Loading from a JSON file
In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
```python
>>> tokenizer.save("tokenizer.json")
```
The path to which we saved this file can be passed to the [`PreTrainedTokenizerFast`] initialization
method using the `tokenizer_file` parameter:
```python
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
```
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer
page](main_classes/tokenizer) for more information.

View File

@ -1,62 +0,0 @@
Using tokenizers from 🤗 Tokenizers
=======================================================================================================================
The :class:`~transformers.PreTrainedTokenizerFast` depends on the `tokenizers
<https://huggingface.co/docs/tokenizers>`__ library. The tokenizers obtained from the 🤗 Tokenizers library can be
loaded very simply into 🤗 Transformers.
Before getting into the specifics, let's start by creating a dummy tokenizer in a few lines:
.. code-block::
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace
>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
We now have a tokenizer trained on the files we defined. We can either continue using it in that runtime, or save it to
a JSON file for future re-use.
Loading directly from the tokenizer object
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The
:class:`~transformers.PreTrainedTokenizerFast` class allows for easy instantiation, by accepting the instantiated
`tokenizer` object as an argument:
.. code-block::
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.
Loading from a JSON file
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
.. code-block::
>>> tokenizer.save("tokenizer.json")
The path to which we saved this file can be passed to the :class:`~transformers.PreTrainedTokenizerFast` initialization
method using the :obj:`tokenizer_file` parameter:
.. code-block::
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to :doc:`the tokenizer
page <main_classes/tokenizer>` for more information.

300
docs/source/glossary.mdx Normal file
View File

@ -0,0 +1,300 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Glossary
## General terms
- autoencoding models: see MLM
- autoregressive models: see CLM
- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
tokens at a certain timestep.
- deep learning: machine learning algorithms which use neural networks with several layers.
- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
by masking some tokens randomly, and has to predict the original text.
- multimodal: a task that combines texts with another kind of input (for instance images).
- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
translation).
- NLP: natural language processing, a generic way to say "deal with texts".
- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
the whole text, individual words).
- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
masking some words and trying to predict them (see MLM).
- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input they should attend to.
- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
summarization models (such as [Bart](model_doc/bart) or [T5](model_doc/t5)).
- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split into
  subwords) or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.
## Model inputs
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.
<a id='input-ids'></a>
### Input IDs
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.
<Youtube id="VFp38yj8h3A"/>
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) tokenizer:
```python
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence = "A Titan RTX has 24GB of VRAM"
```
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
```python
>>> tokenized_sequence = tokenizer.tokenize(sequence)
```
The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
is added for "RA" and "M":
```python
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
```
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
the sentence to the tokenizer, which leverages the Rust implementation of [🤗 Tokenizers](https://github.com/huggingface/tokenizers) for peak performance.
```python
>>> inputs = tokenizer(sequence)
```
The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":
```python
>>> encoded_sequence = inputs["input_ids"]
>>> print(encoded_sequence)
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
```
Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
IDs the model sometimes uses.
If we decode the previous sequence of ids,
```python
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
```
we will see
```python
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
```
because this is the way a [`BertModel`] is going to expect its inputs.
<a id='attention-mask'></a>
### Attention mask
The attention mask is an optional argument used when batching sequences together.
<Youtube id="M6adb1j2jPI"/>
This argument indicates to the model which tokens should be attended to, and which should not.
For example, consider these two sequences:
```python
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "This is a short sequence."
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
```
The encoded versions have different lengths:
```python
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
```
Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
of the second one, or the second one needs to be truncated down to the length of the first one.
In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
it to pad like this:
```python
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
```
We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
```python
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
```
This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
position of the padded indices so that the model does not attend to them. For the [`BertTokenizer`],
`1` indicates a value that should be attended to, while `0` indicates a padded value. This attention mask is
in the dictionary returned by the tokenizer under the key "attention_mask":
```python
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
```
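If you want tensors right away, padding and conversion can be done in one call via the standard `return_tensors` argument (PyTorch chosen here for illustration):
```python
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True, return_tensors="pt")
>>> padded_sequences["input_ids"].shape, padded_sequences["attention_mask"].shape
(torch.Size([2, 19]), torch.Size([2, 19]))
```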
<a id='token-type-ids'></a>
### Token Type IDs
Some models are meant to classify pairs of sentences or to do question answering.
<Youtube id="0u3ioSwev3s"/>
These require two different sequences to be joined in a single "input_ids" entry, which usually is performed with the
help of special tokens, such as the classifier (`[CLS]`) and separator (`[SEP]`) tokens. For example, the BERT
model builds its two sequence input as such:
```python
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
```
We can use our tokenizer to automatically generate such a sentence by passing the two sequences to `tokenizer` as two
arguments (and not a list, like before) like this:
```python
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "HuggingFace is based in NYC"
>>> sequence_b = "Where is HuggingFace based?"
>>> encoded_dict = tokenizer(sequence_a, sequence_b)
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
```
which will return:
```python
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
```
This is enough for some models to understand where one sequence ends and where another begins. However, other models,
such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
the two types of sequence in the model.
The tokenizer returns this mask as the "token_type_ids" entry:
```python
>>> encoded_dict["token_type_ids"]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
```
The first sequence, the "context" used for the question, has all its tokens represented by a `0`, whereas the
second sequence, corresponding to the "question", has all its tokens represented by a `1`.
Some models, like [`XLNetModel`] use an additional token represented by a `2`.
<a id='position-ids'></a>
### Position IDs
Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
each token. Therefore, the position IDs (`position_ids`) are used by the model to identify each token's position in
the list of tokens.
They are an optional parameter. If no `position_ids` are passed to the model, the IDs are automatically created as
absolute positional embeddings.
Absolute positional embeddings are selected in the range `[0, config.max_position_embeddings - 1]`. Some models use
other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
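As a short sketch (assuming a BERT-style model), the default absolute positions can be built explicitly with `torch.arange` and passed alongside the input IDs:
```python
>>> import torch
>>> from transformers import BertModel, BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> model = BertModel.from_pretrained("bert-base-cased")
>>> inputs = tokenizer("A Titan RTX has 24GB of VRAM", return_tensors="pt")
>>> # 0, 1, ..., seq_length - 1: what the model would create by default
>>> position_ids = torch.arange(inputs["input_ids"].shape[1]).unsqueeze(0)
>>> outputs = model(**inputs, position_ids=position_ids)
```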
<a id='labels'></a>
### Labels
The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
predictions and the expected value (the label).
These labels are different according to the model head, for example:
- For sequence classification models (e.g., [`BertForSequenceClassification`]), the model expects a
tensor of dimension `(batch_size)` with each value of the batch corresponding to the expected label of the
entire sequence.
- For token classification models (e.g., [`BertForTokenClassification`]), the model expects a tensor
of dimension `(batch_size, seq_length)` with each value corresponding to the expected label of each individual
token.
- For masked language modeling (e.g., [`BertForMaskedLM`]), the model expects a tensor of dimension
`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
- For sequence to sequence tasks (e.g., [`BartForConditionalGeneration`],
[`MBartForConditionalGeneration`]), the model expects a tensor of dimension `(batch_size, tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
training, both *BART* and *T5* will make the appropriate *decoder_input_ids* and decoder attention masks internally.
They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
the documentation of each model for more information on each specific model's labels.
The base models (e.g., [`BertModel`]) do not accept labels, as these are the base transformer
models, simply outputting features.
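For instance, a minimal sequence classification sketch (the checkpoint and the label value here are arbitrary): when `labels` is passed, the loss comes back on the output object:
```python
>>> import torch
>>> from transformers import BertForSequenceClassification, BertTokenizer

>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
>>> inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
>>> labels = torch.tensor([1])  # one expected label for the whole sequence
>>> loss = model(**inputs, labels=labels).loss
```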
<a id='decoder-input-ids'></a>
### Decoder input IDs
This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
way specific to each model.
Most encoder-decoder models (BART, T5) create their `decoder_input_ids` on their own from the `labels`. In
such models, passing the `labels` is the preferred way to handle training.
Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
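A brief sketch with T5 (checkpoint name assumed): passing `labels` is enough, and the `decoder_input_ids` are derived internally by shifting the labels:
```python
>>> from transformers import T5ForConditionalGeneration, T5Tokenizer

>>> tokenizer = T5Tokenizer.from_pretrained("t5-small")
>>> model = T5ForConditionalGeneration.from_pretrained("t5-small")
>>> inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
>>> labels = tokenizer("Das Haus ist wunderbar.", return_tensors="pt")["input_ids"]
>>> loss = model(**inputs, labels=labels).loss  # decoder_input_ids are created from labels
```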
<a id='feed-forward-chunking'></a>
### Feed Forward Chunking
In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
`bert-base-uncased`).
For an input of size `[batch_size, sequence_length]`, the memory required to store the intermediate feed forward
embeddings `[batch_size, sequence_length, config.intermediate_size]` can account for a large fraction of the memory
use. The authors of [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) noticed that since the
computation is independent of the `sequence_length` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers `[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n`
individually and concat them afterward to `[batch_size, sequence_length, config.hidden_size]` with `n = sequence_length`, which trades increased computation time against reduced memory use, but yields a mathematically
**equivalent** result.
For models employing the function [`apply_chunking_to_forward`], the `chunk_size` defines the
number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
complexity. If `chunk_size` is set to 0, no feed forward chunking is done.

View File

@ -1,322 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Glossary
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
General terms
-----------------------------------------------------------------------------------------------------------------------
- autoencoding models: see MLM
- autoregressive models: see CLM
- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
tokens at a certain timestep.
- deep learning: machine learning algorithms which use neural networks with several layers.
- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
by masking some tokens randomly, and has to predict the original text.
- multimodal: a task that combines texts with another kind of input (for instance images).
- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
translation).
- NLP: natural language processing, a generic way to say "deal with texts".
- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
the whole text, individual words).
- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
masking some words and trying to predict them (see MLM).
- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
- self-attention: each element of the input finds out which other elements of the input they should attend to.
- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split into
  subwords) or a punctuation symbol.
- transformer: self-attention based deep learning model architecture.
Model inputs
-----------------------------------------------------------------------------------------------------------------------
Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
detailed here alongside usage examples.
.. _input-ids:
Input IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
numerical representations of tokens building the sequences that will be used as input by the model*.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/VFp38yj8h3A" title="YouTube video player"
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
picture-in-picture" allowfullscreen></iframe>
Each tokenizer works differently but the underlying mechanism remains the same. Here's an example using the BERT
tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ tokenizer:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence = "A Titan RTX has 24GB of VRAM"
The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.
.. code-block::
>>> tokenized_sequence = tokenizer.tokenize(sequence)
The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
is added for "RA" and "M":
.. code-block::
>>> print(tokenized_sequence)
['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
the sentence to the tokenizer, which leverages the Rust implementation of `huggingface/tokenizers
<https://github.com/huggingface/tokenizers>`__ for peak performance.
.. code-block::
>>> inputs = tokenizer(sequence)
The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
token indices are under the key "input_ids":
.. code-block::
>>> encoded_sequence = inputs["input_ids"]
>>> print(encoded_sequence)
[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
IDs the model sometimes uses.
If we decode the previous sequence of ids,
.. code-block::
>>> decoded_sequence = tokenizer.decode(encoded_sequence)
we will see
.. code-block::
>>> print(decoded_sequence)
[CLS] A Titan RTX has 24GB of VRAM [SEP]
because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
.. _attention-mask:
Attention mask
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The attention mask is an optional argument used when batching sequences together.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/M6adb1j2jPI" title="YouTube video player"
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
picture-in-picture" allowfullscreen></iframe>
This argument indicates to the model which tokens should be attended to, and which should not.
For example, consider these two sequences:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "This is a short sequence."
>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
The encoded versions have different lengths:
.. code-block::
>>> len(encoded_sequence_a), len(encoded_sequence_b)
(8, 19)
Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
of the second one, or the second one needs to be truncated down to the length of the first one.
In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
it to pad like this:
.. code-block::
>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
.. code-block::
>>> padded_sequences["input_ids"]
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
position of the padded indices so that the model does not attend to them. For the :class:`~transformers.BertTokenizer`,
:obj:`1` indicates a value that should be attended to, while :obj:`0` indicates a padded value. This attention mask is
in the dictionary returned by the tokenizer under the key "attention_mask":
.. code-block::
>>> padded_sequences["attention_mask"]
[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
.. _token-type-ids:
Token Type IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Some models are meant to classify pairs of sentences or to do question answering.
.. raw:: html
<iframe width="560" height="315" src="https://www.youtube.com/embed/0u3ioSwev3s" title="YouTube video player"
frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
picture-in-picture" allowfullscreen></iframe>
These require two different sequences to be joined in a single "input_ids" entry, which usually is performed with the
help of special tokens, such as the classifier (``[CLS]``) and separator (``[SEP]``) tokens. For example, the BERT
model builds its two sequence input as such:
.. code-block::
>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
arguments (and not a list, like before) like this:
.. code-block::
>>> from transformers import BertTokenizer
>>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
>>> sequence_a = "HuggingFace is based in NYC"
>>> sequence_b = "Where is HuggingFace based?"
>>> encoded_dict = tokenizer(sequence_a, sequence_b)
>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
which will return:
.. code-block::
>>> print(decoded)
[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
This is enough for some models to understand where one sequence ends and where another begins. However, other models,
such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
the two types of sequence in the model.
The tokenizer returns this mask as the "token_type_ids" entry:
.. code-block::
>>> encoded_dict['token_type_ids']
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
The first sequence, the "context" used for the question, has all its tokens represented by a :obj:`0`, whereas the
second sequence, corresponding to the "question", has all its tokens represented by a :obj:`1`.
Some models, like :class:`~transformers.XLNetModel` use an additional token represented by a :obj:`2`.
.. _position-ids:
Position IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
the list of tokens.
They are an optional parameter. If no ``position_ids`` are passed to the model, the IDs are automatically created as
absolute positional embeddings.
Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
.. _labels:
Labels
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
predictions and the expected value (the label).
These labels are different according to the model head, for example:
- For sequence classification models (e.g., :class:`~transformers.BertForSequenceClassification`), the model expects a
tensor of dimension :obj:`(batch_size)` with each value of the batch corresponding to the expected label of the
entire sequence.
- For token classification models (e.g., :class:`~transformers.BertForTokenClassification`), the model expects a tensor
of dimension :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual
token.
- For masked language modeling (e.g., :class:`~transformers.BertForMaskedLM`), the model expects a tensor of dimension
:obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
- For sequence to sequence tasks (e.g., :class:`~transformers.BartForConditionalGeneration`,
:class:`~transformers.MBartForConditionalGeneration`), the model expects a tensor of dimension :obj:`(batch_size,
tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
training, both `BART` and `T5` will make the appropriate `decoder_input_ids` and decoder attention masks internally.
They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
the documentation of each model for more information on each specific model's labels.
The base models (e.g., :class:`~transformers.BertModel`) do not accept labels, as these are the base transformer
models, simply outputting features.
.. _decoder-input-ids:
Decoder input IDs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
way specific to each model.
Most encoder-decoder models (BART, T5) create their :obj:`decoder_input_ids` on their own from the :obj:`labels`. In
such models, passing the :obj:`labels` is the preferred way to handle training.
Please check each model's docs to see how they handle these input IDs for sequence to sequence training.
.. _feed-forward-chunking:
Feed Forward Chunking
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
``bert-base-uncased``).
For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n =
sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically
**equivalent** result.
For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.

Binary files not shown (24 deleted documentation images).

View File

@ -12,25 +12,18 @@ specific language governing permissions and limitations under the License.
# 🤗 Transformers # 🤗 Transformers
State-of-the-art Machine Learning for Jax, Pytorch and TensorFlow State-of-the-art Machine Learning for PyTorch, TensorFlow and JAX.
🤗 Transformers (formerly known as _pytorch-transformers_ and _pytorch-pretrained-bert_) provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio. 🤗 Transformers provides APIs to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you time from training a model from scratch. The models can be used across different modalities such as:
These models can applied on: * 📝 Text: text classification, information extraction, question answering, summarization, translation, and text generation in over 100 languages.
* 🖼️ Images: image classification, object detection, and segmentation.
* 🗣️ Audio: speech recognition and audio classification.
* 🐙 Multimodal: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, text generation, in over 100 languages. Our library supports seamless integration between three of the most popular deep learning libraries: [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/) and [JAX](https://jax.readthedocs.io/en/latest/). Train your model in three lines of code in one framework, and load it for inference with another.
* 🖼️ Images, for tasks like image classification, object detection, and segmentation.
* 🗣️ Audio, for tasks like speech recognition and audio classification.
Transformer models can also perform tasks on **several modalities combined**, such as table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering. Each 🤗 Transformers architecture is defined in a standalone Python module so they can be easily customized for research and experiments.
🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets and then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture is fully standalone and can be modified to enable quick research experiments.
🤗 Transformers is backed by the three most popular deep learning libraries — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — with a seamless integration between them. It's straightforward to train your models with one before loading them for inference with the other.
This is the documentation of our repository [transformers](https://github.com/huggingface/transformers). You can
also follow our [online course](https://huggingface.co/course) that teaches how to use this library, as well as the
other libraries developed by Hugging Face and the Hub.
## If you are looking for custom support from the Hugging Face team ## If you are looking for custom support from the Hugging Face team
@ -38,35 +31,6 @@ other libraries developed by Hugging Face and the Hub.
<img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);"> <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
</a><br> </a><br>
## Features
1. Easy-to-use state-of-the-art models:
- High performance on natural language understanding & generation, computer vision, and audio tasks.
- Low barrier to entry for educators and practitioners.
- Few user-facing abstractions with just three classes to learn.
- A unified API for using all our pretrained models.
1. Lower compute costs, smaller carbon footprint:
- Researchers can share trained models instead of always retraining.
- Practitioners can reduce compute time and production costs.
- Dozens of architectures with over 20,000 pretrained models, some in more than 100 languages.
1. Choose the right framework for every part of a model's lifetime:
- Train state-of-the-art models in 3 lines of code.
- Move a single model between TF2.0/PyTorch/JAX frameworks at will.
- Seamlessly pick the right framework for training, evaluation and production.
1. Easily customize a model or an example to your needs:
- We provide examples for each architecture to reproduce the results published by its original authors.
- Model internals are exposed as consistently as possible.
- Model files can be used independently of the library for quick experiments.
[All the model checkpoints](https://huggingface.co/models) are seamlessly integrated from the huggingface.co [model
hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and
[organizations](https://huggingface.co/organizations).
Current number of checkpoints: <img src="https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen">
## Contents ## Contents
The documentation is organized in five parts: The documentation is organized in five parts:
@ -91,38 +55,40 @@ conversion utilities for the following models.
<!--This list is updated automatically from the README with _make fix-copies_. Do not update manually! --> <!--This list is updated automatically from the README with _make fix-copies_. Do not update manually! -->
1. **[ALBERT](model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
1. **[BART](model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
1. **[BARThez](model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
1. **[BARTpho](model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
1. **[BEiT](model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
1. **[BERT](model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
1. **[BERTweet](model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
1. **[BERT For Sequence Generation](model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[BigBird-RoBERTa](model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[BigBird-Pegasus](model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
1. **[Blenderbot](model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BlenderbotSmall](model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
1. **[BORT](model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
1. **[ByT5](model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
1. **[CamemBERT](model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
1. **[CANINE](model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
1. **[ConvNeXT](model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
1. **[CLIP](model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
1. **[ConvBERT](model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
1. **[CPM](model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
1. **[CTRL](model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
1. **[Data2Vec](model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec: A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
1. **[DeBERTa](model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeBERTa-v2](model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
1. **[DeiT](model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
1. **[DETR](model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
1. **[DialoGPT](model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
1. **[DistilBERT](model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation) and a German version of DistilBERT.
1. **[DPR](model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
1. **[EncoderDecoder](model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
1. **[ELECTRA](model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
1. **[FlauBERT](model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
1. **[FNet](model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[Funnel Transformer](model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[GPT](model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
1. **[GPT-2](model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
1. **[GPT-J](model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
1. **[GPT Neo](model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
1. **[LXMERT](model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
1. **[M2M100](model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
1. **[MarianMT](model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
1. **[MaskFormer](model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
1. **[MBart](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
1. **[MBart-50](model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
1. **[Megatron-BERT](model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[Megatron-GPT2](model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
1. **[MPNet](model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
1. **[MT5](model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
1. **[Perceiver IO](model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
1. **[PhoBERT](model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
1. **[PLBart](model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
1. **[PoolFormer](model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
1. **[ProphetNet](model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[QDQBert](model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
1. **[REALM](https://huggingface.co/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
1. **[Reformer](model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
1. **[RemBERT](model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
1. **[RoBERTa](model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
1. **[RoFormer](model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
1. **[SegFormer](model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
1. **[SEW](model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SEW-D](model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
1. **[SpeechToTextTransformer2](model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
1. **[Splinter](model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
1. **[SqueezeBert](model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
1. **[Swin Transformer](model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
1. **[T5](model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[T5v1.1](model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
1. **[TAPAS](model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
1. **[ViLT](model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
1. **[Vision Transformer (ViT)](model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
1. **[ViTMAE](model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
1. **[VisualBERT](model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
1. **[WavLM](model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
1. **[Wav2Vec2](model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/master/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
1. **[XGLM](https://huggingface.co/docs/master/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
1. **[XLM-ProphetNet](model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
1. **[XLM-RoBERTa](model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
1. **[XLM-RoBERTa-XL](model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
1. **[XLNet](model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
1. **[XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
1. **[XLS-R](https://huggingface.co/docs/master/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
1. **[YOSO](model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
### Supported frameworks
The table below represents the current support in the library for each of those models, whether they have a Python tokenizer (called "slow"), a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in Jax (via Flax), PyTorch, and/or TensorFlow.
<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->
| Model | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
|:---------------------------:|:--------------:|:--------------:|:---------------:|:------------------:|:------------:|
| ALBERT | ✅ | ✅ | ✅ | ✅ | ✅ |
| BART | ✅ | ✅ | ✅ | ✅ | ✅ |
| BEiT | ❌ | ❌ | ✅ | ❌ | ✅ |
| BlenderbotSmall | ✅ | ✅ | ✅ | ✅ | ✅ |
| CamemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| Canine | ✅ | ❌ | ✅ | ❌ | ❌ |
| CLIP | ✅ | ✅ | ✅ | ✅ | ✅ |
| ConvBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| ConvNext | ❌ | ❌ | ✅ | ✅ | ❌ |
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
| DeiT | ❌ | ❌ | ✅ | ❌ | ❌ |
| LXMERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| M2M100 | ✅ | ❌ | ✅ | ❌ | ❌ |
| Marian | ✅ | ❌ | ✅ | ✅ | ✅ |
| MaskFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| mBART | ✅ | ✅ | ✅ | ✅ | ✅ |
| MegatronBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| MobileBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| MPNet | ✅ | ✅ | ✅ | ✅ | ❌ |
| mT5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Nystromformer | ❌ | ❌ | ✅ | ❌ | ❌ |
| OpenAI GPT | ✅ | ✅ | ✅ | ✅ | ❌ |
| OpenAI GPT-2 | ✅ | ✅ | ✅ | ✅ | ✅ |
| Pegasus | ✅ | ✅ | ✅ | ✅ | ✅ |
| Perceiver | ✅ | ❌ | ✅ | ❌ | ❌ |
| PLBart | ✅ | ❌ | ✅ | ❌ | ❌ |
| PoolFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| ProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
| QDQBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| RAG | ✅ | ❌ | ✅ | ✅ | ❌ |
| Realm | ✅ | ✅ | ✅ | ❌ | ❌ |
| Reformer | ✅ | ✅ | ✅ | ❌ | ❌ |
| RemBERT | ✅ | ✅ | ✅ | ✅ | ❌ |
| RetriBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
| RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| RoFormer | ✅ | ✅ | ✅ | ✅ | ✅ |
| SegFormer | ❌ | ❌ | ✅ | ❌ | ❌ |
| SEW | ❌ | ❌ | ✅ | ❌ | ❌ |
| SEW-D | ❌ | ❌ | ✅ | ❌ | ❌ |
| Speech Encoder decoder | ❌ | ❌ | ✅ | ❌ | ✅ |
| Speech2Text | ✅ | ❌ | ✅ | ✅ | ❌ |
| Speech2Text2 | ✅ | ❌ | ❌ | ❌ | ❌ |
| Splinter | ✅ | ✅ | ✅ | ❌ | ❌ |
| SqueezeBERT | ✅ | ✅ | ✅ | ❌ | ❌ |
| Swin | ❌ | ❌ | ✅ | ❌ | ❌ |
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
| TrOCR | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeech | ❌ | ❌ | ✅ | ❌ | ❌ |
| UniSpeechSat | ❌ | ❌ | ✅ | ❌ | ❌ |
| ViLT | ❌ | ❌ | ✅ | ❌ | ❌ |
| Vision Encoder decoder | ❌ | ❌ | ✅ | ✅ | ✅ |
| VisionTextDualEncoder | ❌ | ❌ | ✅ | ❌ | ✅ |
| VisualBert | ❌ | ❌ | ✅ | ❌ | ❌ |
| ViT | ❌ | ❌ | ✅ | ✅ | ✅ |
| ViTMAE | ❌ | ❌ | ✅ | ❌ | ❌ |
| Wav2Vec2 | ✅ | ❌ | ✅ | ✅ | ✅ |
| WavLM | ❌ | ❌ | ✅ | ❌ | ❌ |
| XGLM | ✅ | ✅ | ✅ | ❌ | ✅ |
| XLM | ✅ | ❌ | ✅ | ✅ | ❌ |
| XLM-RoBERTa | ✅ | ✅ | ✅ | ✅ | ✅ |
| XLM-RoBERTa-XL | ❌ | ❌ | ✅ | ❌ | ❌ |
| XLMProphetNet | ✅ | ❌ | ✅ | ❌ | ❌ |
| XLNet | ✅ | ✅ | ✅ | ✅ | ❌ |
| YOSO | ❌ | ❌ | ✅ | ❌ | ❌ |
<!-- End table-->


<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Installation
🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going
to use and activate it.
Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
must install it from source.
## Installation with pip
First, you will need to install at least one of TensorFlow 2.0, PyTorch, or Flax.
Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or
[Flax installation page](https://github.com/google/flax#quick-install)
regarding the specific install command for your platform.
When TensorFlow 2.0 and/or PyTorch have been installed, 🤗 Transformers can be installed using pip as follows:
```bash
pip install transformers
```
Alternatively, for CPU support only, you can install 🤗 Transformers and PyTorch in one line with:
```bash
pip install transformers[torch]
```
or 🤗 Transformers and TensorFlow 2.0 in one line with:
```bash
pip install transformers[tf-cpu]
```
or 🤗 Transformers and Flax in one line with:
```bash
pip install transformers[flax]
```
To check that 🤗 Transformers is properly installed, run the following command:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```
It should download a pretrained model and then print something like
```bash
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```
(Note that TensorFlow will print additional log messages before that last statement.)
## Installing from source
Here is how to quickly install `transformers` from source:
```bash
pip install git+https://github.com/huggingface/transformers
```
Note that this installs not the latest released version, but the bleeding edge `master` version, which you may want to use if a bug has been fixed since the last official release but a new release hasn't been rolled out yet.
While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day. You're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues); this way, things will get fixed even sooner.
Again, you can run:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
```
to check 🤗 Transformers is properly installed.
## Editable install
If you want to constantly use the bleeding edge `master` version of the source code, or if you want to contribute to the library and need to test the changes in the code you're making, you will need an editable install. This is done by cloning the repository and installing with the following commands:
```bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
```
This command links the folder you cloned the repository into with your Python library paths: Python will look inside this folder in addition to the normal library-wide paths. So if your Python packages normally get installed into:
```
~/anaconda3/envs/main/lib/python3.7/site-packages/
```
the editable install will instead reside wherever you cloned the folder to, e.g. `~/transformers/`, and Python will search that folder too.
Do note that you have to keep the `transformers` folder around, and not delete it, to continue using the `transformers` library.
Now, let's get to the real benefit of this installation approach. Say you see that a new feature has just been committed to `master`. If you have already performed all the steps above, then to update your transformers installation to include all the latest commits, all you need to do is `cd` into the cloned repository folder and update the clone to the latest version:
```bash
cd ~/transformers/
git pull
```
There is nothing else to do. Your python environment will find the bleeding edge version of `transformers` on the next run.
## With conda
Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
🤗 Transformers can be installed using conda as follows:
```bash
conda install -c huggingface transformers
```
Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda.
## Caching models
This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded to the
folder given by the shell environment variable ``TRANSFORMERS_CACHE``. Its default value is the Hugging
Face cache home followed by ``/transformers/``. This is (by order of priority):
* shell environment variable ``HF_HOME``
* shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
* default: ``~/.cache/huggingface/``
So if you don't have any specific environment variable set, the cache directory will be at
``~/.cache/huggingface/transformers/``.
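You can also bypass the cache directory logic entirely for a given download by passing `cache_dir` explicitly. Here is a minimal sketch, assuming a hypothetical cache path:
```python
from transformers import AutoModel, AutoTokenizer

# Both the tokenizer and model files are cached under the given directory
# instead of the default TRANSFORMERS_CACHE location.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", cache_dir="/path/to/my/cache")
model = AutoModel.from_pretrained("bert-base-uncased", cache_dir="/path/to/my/cache")
```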
**Note:** If you have set a shell environment variable for one of the predecessors of this library
(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
environment variable for ``TRANSFORMERS_CACHE``.
### Offline mode
It's possible to run 🤗 Transformers in a firewalled or a no-network environment.
Setting environment variable `TRANSFORMERS_OFFLINE=1` will tell 🤗 Transformers to use local files only and will not try to look things up.
You will most likely want to couple this with `HF_DATASETS_OFFLINE=1`, which does the same for 🤗 Datasets if you're using the latter.
Here is an example of how this can be used on a filesystem that is shared between a normally networked instance and an instance that is firewalled from the external world.
On the instance with normal network access, run your program, which will download and cache models (and optionally datasets if you use 🤗 Datasets). For example:
```bash
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
and then with the same filesystem you can now run the same program on a firewalled instance:
```bash
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
and it should succeed without hanging while waiting to time out.
#### Fetching models and tokenizers to use offline
When running a script for the first time as mentioned above, the downloaded files will be cached for future reuse.
However, it is also possible to download files and point to their local path instead.
Downloading files can be done through the Web Interface by clicking on the "Download" button, but it can also be handled
programmatically using the `huggingface_hub` library, which is a dependency of `transformers`:
- Using `snapshot_download` to download an entire repository
- Using `hf_hub_download` to download a specific file
See the reference for these methods in the huggingface_hub
[documentation](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub).
## Do you want to run a Transformer model on a mobile device?
You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`,
`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!

View File

@ -0,0 +1,235 @@
<!---
Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
# Installation
Install 🤗 Transformers for whichever deep learning library you're working with, set up your cache, and optionally configure 🤗 Transformers to run offline.
🤗 Transformers is tested on Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+, and Flax. Follow the installation instructions below for the deep learning library you are using:
* [PyTorch](https://pytorch.org/get-started/locally/) installation instructions.
* [TensorFlow 2.0](https://www.tensorflow.org/install/pip) installation instructions.
* [Flax](https://flax.readthedocs.io/en/latest/) installation instructions.
## Install with pip
You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.
Start by creating a virtual environment in your project directory:
```bash
python -m venv .env
```
Activate the virtual environment:
```bash
source .env/bin/activate
```
Now you're ready to install 🤗 Transformers with the following command:
```bash
pip install transformers
```
For CPU support only, you can conveniently install 🤗 Transformers and a deep learning library in one line. For example, install 🤗 Transformers and PyTorch with:
```bash
pip install transformers[torch]
```
🤗 Transformers and TensorFlow 2.0:
```bash
pip install transformers[tf-cpu]
```
🤗 Transformers and Flax:
```bash
pip install transformers[flax]
```
Finally, check if 🤗 Transformers has been properly installed by running the following command. It will download a pretrained model:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
```
Then it will print out the label and score:
```bash
[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
```
## Install from source
Install 🤗 Transformers from source with the following command:
```bash
pip install git+https://github.com/huggingface/transformers
```
This command installs the bleeding edge `master` version rather than the latest `stable` version. The `master` version is useful for staying up-to-date with the latest developments, for instance if a bug has been fixed since the last official release but a new release hasn't been rolled out yet. However, this means the `master` version may not always be stable. We strive to keep the `master` version operational, and most issues are usually resolved within a few hours or a day. If you run into a problem, please open an [Issue](https://github.com/huggingface/transformers/issues) so we can fix it even sooner!
Check if 🤗 Transformers has been properly installed by running the following command:
```bash
python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
```
## Editable install
You will need an editable install if you'd like to:
* Use the `master` version of the source code.
* Contribute to 🤗 Transformers and need to test changes in the code.
Clone the repository and install 🤗 Transformers with the following commands:
```bash
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install -e .
```
These commands link the folder you cloned the repository to with your Python library paths. Python will now look inside the folder you cloned to in addition to the normal library paths. For example, if your Python packages are typically installed in `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python will also search the folder you cloned to: `~/transformers/`.
<Tip warning={true}>
You must keep the `transformers` folder if you want to keep using the library.
</Tip>
Now you can easily update your clone to the latest version of 🤗 Transformers with the following command:
```bash
cd ~/transformers/
git pull
```
Your Python environment will find the `master` version of 🤗 Transformers on the next run.
## Install with conda
Install from the conda channel `huggingface`:
```bash
conda install -c huggingface transformers
```
## Cache setup
Pretrained models are downloaded and locally cached at: `~/.cache/huggingface/transformers/`. This is the default directory given by the shell environment variable `TRANSFORMERS_CACHE`. On Windows, the default directory is given by `C:\Users\username\.cache\huggingface\transformers`. You can change the shell environment variables shown below - in order of priority - to specify a different cache directory:
1. Shell environment variable (default): `TRANSFORMERS_CACHE`.
2. Shell environment variable: `HF_HOME` + `/transformers/`.
3. Shell environment variable: `XDG_CACHE_HOME` + `/huggingface/transformers`.
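For example, here is a minimal sketch of overriding the cache location from Python; the path is only an illustration, and the variable must be set before 🤗 Transformers is imported:
```py
>>> import os
>>> os.environ["TRANSFORMERS_CACHE"] = "/path/to/my/cache"  # set before importing transformers
>>> from transformers import AutoModel
>>> model = AutoModel.from_pretrained("bert-base-uncased")  # cached under /path/to/my/cache
```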
<Tip>
🤗 Transformers will use the shell environment variables `PYTORCH_TRANSFORMERS_CACHE` or `PYTORCH_PRETRAINED_BERT_CACHE` if you are coming from an earlier iteration of this library and have set those environment variables, unless you specify the shell environment variable `TRANSFORMERS_CACHE`.
</Tip>
## Offline mode
🤗 Transformers is able to run in a firewalled or offline environment by only using local files. Set the environment variable `TRANSFORMERS_OFFLINE=1` to enable this behavior.
<Tip>
Add [🤗 Datasets](https://huggingface.co/docs/datasets/) to your offline training workflow by setting the environment variable `HF_DATASETS_OFFLINE=1`.
</Tip>
For example, you would typically first run a program on a normally networked instance to download and cache the model files, with the following command:
```bash
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
Run this same program in an offline instance with:
```bash
HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
```
The script should now run without hanging or waiting to time out because it knows to look only for local files.
### Fetch models and tokenizers to use offline
Another option for using 🤗 Transformers offline is to download the files ahead of time, and then point to their local path when you need to use them offline. There are three ways to do this:
* Download a file through the user interface on the [Model Hub](https://huggingface.co/models) by clicking on the ↓ icon.
![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png)
* Use the [`PreTrainedModel.from_pretrained`] and [`PreTrainedModel.save_pretrained`] workflow:
1. Download your files ahead of time with [`PreTrainedModel.from_pretrained`]:
```py
>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
>>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
```
2. Save your files to a specified directory with [`PreTrainedModel.save_pretrained`]:
```py
>>> tokenizer.save_pretrained("./your/path/bigscience_t0")
>>> model.save_pretrained("./your/path/bigscience_t0")
```
3. Now when you're offline, reload your files with [`PreTrainedModel.from_pretrained`] from the specified directory:
```py
>>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
>>> model = AutoModelForSeq2SeqLM.from_pretrained("./your/path/bigscience_t0")
```
* Programmatically download files with the [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) library:
1. Install the `huggingface_hub` library in your virtual environment:
```bash
python -m pip install huggingface_hub
```
2. Use the [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) function to download a file to a specific path. For example, the following command downloads the `config.json` file from the [T0](https://huggingface.co/bigscience/T0_3B) model to your desired path:
```py
>>> from huggingface_hub import hf_hub_download
>>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")
```
Once your file is downloaded and locally cached, specify its local path to load and use it:
```py
>>> from transformers import AutoConfig
>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json")
```
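If you need every file in a repository rather than a single one, `snapshot_download` from the same library can mirror the whole repository; a minimal sketch, with the target directory as an example:
```py
>>> from huggingface_hub import snapshot_download
>>> snapshot_download(repo_id="bigscience/T0_3B", cache_dir="./your/path/bigscience_t0")
```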
<Tip>
See the [How to download files from the Hub](https://huggingface.co/docs/hub/how-to-downstream) section for more details on downloading files stored on the Hub.
</Tip>

View File

@ -0,0 +1,46 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# General Utilities
This page lists all of Transformers' general utility functions that are found in the file `file_utils.py`.
Most of those are only useful if you are studying the general code in the library.
## Enums and namedtuples
[[autodoc]] file_utils.ExplicitEnum
[[autodoc]] file_utils.PaddingStrategy
[[autodoc]] file_utils.TensorType
## Special Decorators
[[autodoc]] file_utils.add_start_docstrings
[[autodoc]] file_utils.add_start_docstrings_to_model_forward
[[autodoc]] file_utils.add_end_docstrings
[[autodoc]] file_utils.add_code_sample_docstrings
[[autodoc]] file_utils.replace_return_docstrings
## Special Properties
[[autodoc]] file_utils.cached_property
## Other Utilities
[[autodoc]] file_utils._LazyModule

View File

@ -1,54 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
General Utilities
-----------------------------------------------------------------------------------------------------------------------
This page lists all of Transformers general utility functions that are found in the file ``file_utils.py``.
Most of those are only useful if you are studying the general code in the library.
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.ExplicitEnum
.. autoclass:: transformers.file_utils.PaddingStrategy
.. autoclass:: transformers.file_utils.TensorType
Special Decorators
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.file_utils.add_start_docstrings
.. autofunction:: transformers.file_utils.add_start_docstrings_to_model_forward
.. autofunction:: transformers.file_utils.add_end_docstrings
.. autofunction:: transformers.file_utils.add_code_sample_docstrings
.. autofunction:: transformers.file_utils.replace_return_docstrings
Special Properties
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils.cached_property
Other Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.file_utils._LazyModule

View File

@ -0,0 +1,254 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Utilities for Generation
This page lists all the utility functions used by [`~generation_utils.GenerationMixin.generate`],
[`~generation_utils.GenerationMixin.greedy_search`],
[`~generation_utils.GenerationMixin.sample`],
[`~generation_utils.GenerationMixin.beam_search`],
[`~generation_utils.GenerationMixin.beam_sample`],
[`~generation_utils.GenerationMixin.group_beam_search`], and
[`~generation_utils.GenerationMixin.constrained_beam_search`].
Most of those are only useful if you are studying the code of the generate methods in the library.
## Generate Outputs
The output of [`~generation_utils.GenerationMixin.generate`] is an instance of a subclass of
[`~file_utils.ModelOutput`]. This output is a data structure containing all the information returned
by [`~generation_utils.GenerationMixin.generate`], but it can also be used as a tuple or dictionary.
Here's an example:
```python
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
```
The `generation_output` object is a [`~generation_utils.GreedySearchDecoderOnlyOutput`]. As we can
see in the documentation of that class below, this means it has the following attributes:
- `sequences`: the generated sequences of tokens
- `scores` (optional): the prediction scores of the language modelling head, for each generation step
- `hidden_states` (optional): the hidden states of the model, for each generation step
- `attentions` (optional): the attention weights of the model, for each generation step
Here we have the `scores` since we passed along `output_scores=True`, but we don't have `hidden_states` and
`attentions` because we didn't pass `output_hidden_states=True` or `output_attentions=True`.
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get `None`. Here for instance `generation_output.scores` are all the generated prediction scores of the
language modeling head, and `generation_output.attentions` is `None`.
When using our `generation_output` object as a tuple, it only keeps the attributes that don't have `None` values.
Here, for instance, it has two elements, `sequences` then `scores`, so
```python
generation_output[:2]
```
will return the tuple `(generation_output.sequences, generation_output.scores)`.
When using our `generation_output` object as a dictionary, it only keeps the attributes that don't have `None`
values. Here, for instance, it has two keys that are `sequences` and `scores`.
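Continuing the example above, here is a quick sketch of the dictionary-style access:
```python
print(generation_output.keys())  # odict_keys(['sequences', 'scores']): only non-None attributes
print(generation_output["scores"] is generation_output.scores)  # True: same object either way
```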
We document here all output types.
### GreedySearchOutput
[[autodoc]] generation_utils.GreedySearchDecoderOnlyOutput
[[autodoc]] generation_utils.GreedySearchEncoderDecoderOutput
[[autodoc]] generation_flax_utils.FlaxGreedySearchOutput
### SampleOutput
[[autodoc]] generation_utils.SampleDecoderOnlyOutput
[[autodoc]] generation_utils.SampleEncoderDecoderOutput
[[autodoc]] generation_flax_utils.FlaxSampleOutput
### BeamSearchOutput
[[autodoc]] generation_utils.BeamSearchDecoderOnlyOutput
[[autodoc]] generation_utils.BeamSearchEncoderDecoderOutput
### BeamSampleOutput
[[autodoc]] generation_utils.BeamSampleDecoderOnlyOutput
[[autodoc]] generation_utils.BeamSampleEncoderDecoderOutput
## LogitsProcessor
A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
generation.
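As a minimal sketch of the interface (the token ids and vocabulary size below are made up), a processor receives the ids generated so far together with the next-token scores, and returns modified scores:
```python
import torch

from transformers import MinLengthLogitsProcessor

# Forbid the EOS token (id 2 here) until at least 5 tokens have been generated
processor = MinLengthLogitsProcessor(min_length=5, eos_token_id=2)

input_ids = torch.tensor([[0, 1, 3]])  # only 3 tokens generated so far
scores = torch.randn(1, 10)  # fake next-token logits over a 10-token vocabulary

new_scores = processor(input_ids, scores)
print(new_scores[0, 2])  # tensor(-inf): EOS cannot be sampled yet
```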
[[autodoc]] LogitsProcessor
- __call__
[[autodoc]] LogitsProcessorList
- __call__
[[autodoc]] LogitsWarper
- __call__
[[autodoc]] MinLengthLogitsProcessor
- __call__
[[autodoc]] TemperatureLogitsWarper
- __call__
[[autodoc]] RepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] TopPLogitsWarper
- __call__
[[autodoc]] TopKLogitsWarper
- __call__
[[autodoc]] NoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] NoBadWordsLogitsProcessor
- __call__
[[autodoc]] PrefixConstrainedLogitsProcessor
- __call__
[[autodoc]] HammingDiversityLogitsProcessor
- __call__
[[autodoc]] ForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] ForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] InfNanRemoveLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessorList
- __call__
[[autodoc]] TFLogitsWarper
- __call__
[[autodoc]] TFTemperatureLogitsWarper
- __call__
[[autodoc]] TFTopPLogitsWarper
- __call__
[[autodoc]] TFTopKLogitsWarper
- __call__
[[autodoc]] TFMinLengthLogitsProcessor
- __call__
[[autodoc]] TFNoBadWordsLogitsProcessor
- __call__
[[autodoc]] TFNoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] TFRepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessorList
- __call__
[[autodoc]] FlaxLogitsWarper
- __call__
[[autodoc]] FlaxTemperatureLogitsWarper
- __call__
[[autodoc]] FlaxTopPLogitsWarper
- __call__
[[autodoc]] FlaxTopKLogitsWarper
- __call__
[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxMinLengthLogitsProcessor
- __call__
## StoppingCriteria
A [`StoppingCriteria`] can be used to change when to stop generation (other than EOS token).
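As a minimal sketch of the interface (the token ids are made up), a criterion is called with the ids generated so far and returns whether generation should stop:
```python
import torch

from transformers import MaxLengthCriteria, StoppingCriteriaList

criteria = StoppingCriteriaList([MaxLengthCriteria(max_length=5)])

input_ids = torch.tensor([[0, 1, 2, 3, 4]])  # 5 tokens generated so far
print(criteria(input_ids, None))  # True: the maximum length has been reached
```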
[[autodoc]] StoppingCriteria
- __call__
[[autodoc]] StoppingCriteriaList
- __call__
[[autodoc]] MaxLengthCriteria
- __call__
[[autodoc]] MaxTimeCriteria
- __call__
## Constraints
A [`Constraint`] can be used to force the generation to include specific tokens or sequences in the output.
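As a minimal sketch (the model and forced phrase are just examples), a constraint built from token ids is passed to `generate` together with beam search:
```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer, PhrasalConstraint

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Force the phrase " very happy" to appear somewhere in the generated text
constraint = PhrasalConstraint(tokenizer(" very happy", add_special_tokens=False).input_ids)

inputs = tokenizer("The weather today is", return_tensors="pt")
outputs = model.generate(**inputs, constraints=[constraint], num_beams=5, max_length=20)
print(tokenizer.decode(outputs[0]))
```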
[[autodoc]] Constraint
[[autodoc]] PhrasalConstraint
[[autodoc]] DisjunctiveConstraint
[[autodoc]] ConstraintListState
## BeamSearch
[[autodoc]] BeamScorer
- process
- finalize
[[autodoc]] BeamSearchScorer
- process
- finalize
[[autodoc]] ConstrainedBeamSearchScorer
- process
- finalize
## Utilities
[[autodoc]] top_k_top_p_filtering
[[autodoc]] tf_top_k_top_p_filtering

View File

@ -1,230 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Generation
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by :meth:`~transformers.generation_utils.GenerationMixin.generate`,
:meth:`~transformers.generation_utils.GenerationMixin.greedy_search`,
:meth:`~transformers.generation_utils.GenerationMixin.sample`,
:meth:`~transformers.generation_utils.GenerationMixin.beam_search`,
:meth:`~transformers.generation_utils.GenerationMixin.beam_sample`, and
:meth:`~transformers.generation_utils.GenerationMixin.group_beam_search`.
Most of those are only useful if you are studying the code of the generate methods in the library.
Generate Outputs
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
The output of :meth:`~transformers.generation_utils.GenerationMixin.generate` is an instance of a subclass of
:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
by :meth:`~transformers.generation_utils.GenerationMixin.generate`, but that can also be used as tuple or dictionary.
Here's an example:
.. code-block::
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`, as we can
see in the documentation of that class below, it means it has the following attributes:
- ``sequences``: the generated sequences of tokens
- ``scores`` (optional): the prediction scores of the language modelling head, for each generation step
- ``hidden_states`` (optional): the hidden states of the model, for each generation step
- ``attentions`` (optional): the attention weights of the model, for each generation step
Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.
You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
language modeling head, and ``generation_output.attentions`` is ``None``.
When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
Here, for instance, it has two elements, ``loss`` then ``logits``, so
.. code-block::
generation_output[:2]
will return the tuple ``(generation_output.sequences, generation_output.scores)`` for instance.
When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.
We document here all output types.
GreedySearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
:members:
.. autoclass:: transformers.generation_flax_utils.FlaxGreedySearchOutput
:members:
SampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
:members:
.. autoclass:: transformers.generation_flax_utils.FlaxSampleOutput
:members:
BeamSearchOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
:members:
BeamSampleOutput
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
:members:
.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
:members:
LogitsProcessor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
generation.
.. autoclass:: transformers.LogitsProcessor
:members: __call__
.. autoclass:: transformers.LogitsProcessorList
:members: __call__
.. autoclass:: transformers.LogitsWarper
:members: __call__
.. autoclass:: transformers.MinLengthLogitsProcessor
:members: __call__
.. autoclass:: transformers.TemperatureLogitsWarper
:members: __call__
.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
:members: __call__
.. autoclass:: transformers.TopPLogitsWarper
:members: __call__
.. autoclass:: transformers.TopKLogitsWarper
:members: __call__
.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
:members: __call__
.. autoclass:: transformers.NoBadWordsLogitsProcessor
:members: __call__
.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
:members: __call__
.. autoclass:: transformers.HammingDiversityLogitsProcessor
:members: __call__
.. autoclass:: transformers.ForcedBOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.ForcedEOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.InfNanRemoveLogitsProcessor
:members: __call__
.. autoclass:: transformers.FlaxLogitsProcessor
:members: __call__
.. autoclass:: transformers.FlaxLogitsProcessorList
:members: __call__
.. autoclass:: transformers.FlaxLogitsWarper
:members: __call__
.. autoclass:: transformers.FlaxTemperatureLogitsWarper
:members: __call__
.. autoclass:: transformers.FlaxTopPLogitsWarper
:members: __call__
.. autoclass:: transformers.FlaxTopKLogitsWarper
:members: __call__
.. autoclass:: transformers.FlaxForcedBOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.FlaxForcedEOSTokenLogitsProcessor
:members: __call__
.. autoclass:: transformers.FlaxMinLengthLogitsProcessor
:members: __call__
StoppingCriteria
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A :class:`~transformers.StoppingCriteria` can be used to change when to stop generation (other than EOS token).
.. autoclass:: transformers.StoppingCriteria
:members: __call__
.. autoclass:: transformers.StoppingCriteriaList
:members: __call__
.. autoclass:: transformers.MaxLengthCriteria
:members: __call__
.. autoclass:: transformers.MaxTimeCriteria
:members: __call__
BeamSearch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BeamScorer
:members: process, finalize
.. autoclass:: transformers.BeamSearchScorer
:members: process, finalize
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.top_k_top_p_filtering
.. autofunction:: transformers.tf_top_k_top_p_filtering

View File

@ -0,0 +1,82 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Custom Layers and Utilities
This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
Most of those are only useful if you are studying the code of the models in the library.
## PyTorch custom modules
[[autodoc]] modeling_utils.Conv1D
[[autodoc]] modeling_utils.PoolerStartLogits
- forward
[[autodoc]] modeling_utils.PoolerEndLogits
- forward
[[autodoc]] modeling_utils.PoolerAnswerClass
- forward
[[autodoc]] modeling_utils.SquadHeadOutput
[[autodoc]] modeling_utils.SQuADHead
- forward
[[autodoc]] modeling_utils.SequenceSummary
- forward
## PyTorch Helper Functions
[[autodoc]] apply_chunking_to_forward
[[autodoc]] modeling_utils.find_pruneable_heads_and_indices
[[autodoc]] modeling_utils.prune_layer
[[autodoc]] modeling_utils.prune_conv1d_layer
[[autodoc]] modeling_utils.prune_linear_layer
## TensorFlow custom layers
[[autodoc]] modeling_tf_utils.TFConv1D
[[autodoc]] modeling_tf_utils.TFSharedEmbeddings
- call
[[autodoc]] modeling_tf_utils.TFSequenceSummary
## TensorFlow loss functions
[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss
[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss
[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss
[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss
## TensorFlow Helper Functions
[[autodoc]] modeling_tf_utils.get_initializer
[[autodoc]] modeling_tf_utils.keras_serializable
[[autodoc]] modeling_tf_utils.shape_list

View File

@ -1,97 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Custom Layers and Utilities
-----------------------------------------------------------------------------------------------------------------------
This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
Most of those are only useful if you are studying the code of the models in the library.
Pytorch custom modules
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_utils.Conv1D
.. autoclass:: transformers.modeling_utils.PoolerStartLogits
:members: forward
.. autoclass:: transformers.modeling_utils.PoolerEndLogits
:members: forward
.. autoclass:: transformers.modeling_utils.PoolerAnswerClass
:members: forward
.. autoclass:: transformers.modeling_utils.SquadHeadOutput
.. autoclass:: transformers.modeling_utils.SQuADHead
:members: forward
.. autoclass:: transformers.modeling_utils.SequenceSummary
:members: forward
PyTorch Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.apply_chunking_to_forward
.. autofunction:: transformers.modeling_utils.find_pruneable_heads_and_indices
.. autofunction:: transformers.modeling_utils.prune_layer
.. autofunction:: transformers.modeling_utils.prune_conv1d_layer
.. autofunction:: transformers.modeling_utils.prune_linear_layer
TensorFlow custom layers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFConv1D
.. autoclass:: transformers.modeling_tf_utils.TFSharedEmbeddings
:members: call
.. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
TensorFlow loss functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFMaskedLanguageModelingLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFMultipleChoiceLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFQuestionAnsweringLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFSequenceClassificationLoss
:members:
.. autoclass:: transformers.modeling_tf_utils.TFTokenClassificationLoss
:members:
TensorFlow Helper Functions
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.modeling_tf_utils.get_initializer
.. autofunction:: transformers.modeling_tf_utils.keras_serializable
.. autofunction:: transformers.modeling_tf_utils.shape_list

View File

@ -0,0 +1,40 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Utilities for pipelines
This page lists all the utility functions the library provides for pipelines.
Most of those are only useful if you are studying the code of the models in the library.
## Argument handling
[[autodoc]] pipelines.ArgumentHandler
[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler
[[autodoc]] pipelines.QuestionAnsweringArgumentHandler
## Data format
[[autodoc]] pipelines.PipelineDataFormat
[[autodoc]] pipelines.CsvPipelineDataFormat
[[autodoc]] pipelines.JsonPipelineDataFormat
[[autodoc]] pipelines.PipedPipelineDataFormat
## Utilities
[[autodoc]] pipelines.PipelineException

View File

@ -1,50 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for pipelines
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions the library provides for pipelines.
Most of those are only useful if you are studying the code of the models in the library.
Argument handling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.pipelines.ArgumentHandler
.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
Data format
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.pipelines.PipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
:members:
.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
:members:
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.pipelines.PipelineException

View File

@ -0,0 +1,38 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Utilities for Tokenizers
This page lists all the utility functions used by the tokenizers, mainly the class
[`~tokenization_utils_base.PreTrainedTokenizerBase`] that implements the common methods between
[`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] and the mixin
[`~tokenization_utils_base.SpecialTokensMixin`].
Most of those are only useful if you are studying the code of the tokenizers in the library.
## PreTrainedTokenizerBase
[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase
- __call__
- all
## SpecialTokensMixin
[[autodoc]] tokenization_utils_base.SpecialTokensMixin
## Enums and namedtuples
[[autodoc]] tokenization_utils_base.TruncationStrategy
[[autodoc]] tokenization_utils_base.CharSpan
[[autodoc]] tokenization_utils_base.TokenSpan

View File

@ -1,45 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Tokenizers
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by the tokenizers, mainly the class
:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
Most of those are only useful if you are studying the code of the tokenizers in the library.
PreTrainedTokenizerBase
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
:special-members: __call__
:members:
SpecialTokensMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
:members:
Enums and namedtuples
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
.. autoclass:: transformers.tokenization_utils_base.CharSpan
.. autoclass:: transformers.tokenization_utils_base.TokenSpan

View File

@ -0,0 +1,43 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Utilities for Trainer
This page lists all the utility functions used by [`Trainer`].
Most of those are only useful if you are studying the code of the Trainer in the library.
## Utilities
[[autodoc]] EvalPrediction
[[autodoc]] IntervalStrategy
[[autodoc]] set_seed
[[autodoc]] torch_distributed_zero_first
## Callbacks internals
[[autodoc]] trainer_callback.CallbackHandler
## Distributed Evaluation
[[autodoc]] trainer_pt_utils.DistributedTensorGatherer
## Trainer Argument Parser
[[autodoc]] HfArgumentParser
## Debug Utilities
[[autodoc]] debug_utils.DebugUnderflowOverflow

View File

@ -1,54 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Utilities for Trainer
-----------------------------------------------------------------------------------------------------------------------
This page lists all the utility functions used by :class:`~transformers.Trainer`.
Most of those are only useful if you are studying the code of the Trainer in the library.
Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.EvalPrediction
.. autoclass:: transformers.IntervalStrategy
.. autofunction:: transformers.set_seed
.. autofunction:: transformers.torch_distributed_zero_first
Callbacks internals
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.trainer_callback.CallbackHandler
Distributed Evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
:members:
Distributed Evaluation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.HfArgumentParser
Debug Utilities
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.debug_utils.DebugUnderflowOverflow

View File

@ -0,0 +1,111 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Callbacks
Callbacks are objects that can customize the behavior of the training loop in the PyTorch
[`Trainer`] (this feature is not yet implemented in TensorFlow). They can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
stopping).
Callbacks are "read only" pieces of code, apart from the [`TrainerControl`] object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
subclass [`Trainer`] and override the methods you need (see [trainer](trainer) for examples).
By default a [`Trainer`] will use the following callbacks:
- [`DefaultFlowCallback`] which handles the default behavior for logging, saving and evaluation.
- [`PrinterCallback`] or [`ProgressCallback`] to display progress and print the
logs (the first one is used if you deactivate tqdm through the [`TrainingArguments`], otherwise
it's the second one).
- [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
or tensorboardX).
- [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
- [`~integrations.CometCallback`] if [comet_ml](https://www.comet.ml/site/) is installed.
- [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
- [`~integrations.AzureMLCallback`] if [azureml-sdk](https://pypi.org/project/azureml-sdk/) is
installed.
- [`~integrations.CodeCarbonCallback`] if [codecarbon](https://pypi.org/project/codecarbon/) is
installed.
The main class that implements callbacks is [`TrainerCallback`]. It gets the
[`TrainingArguments`] used to instantiate the [`Trainer`], can access that
Trainer's internal state via [`TrainerState`], and can take some actions on the training loop via
[`TrainerControl`].
## Available Callbacks
Here is the list of the available [`TrainerCallback`] in the library:
[[autodoc]] integrations.CometCallback
- setup
[[autodoc]] DefaultFlowCallback
[[autodoc]] PrinterCallback
[[autodoc]] ProgressCallback
[[autodoc]] EarlyStoppingCallback
[[autodoc]] integrations.TensorBoardCallback
[[autodoc]] integrations.WandbCallback
- setup
[[autodoc]] integrations.MLflowCallback
- setup
[[autodoc]] integrations.AzureMLCallback
[[autodoc]] integrations.CodeCarbonCallback
## TrainerCallback
[[autodoc]] TrainerCallback
Here is an example of how to register a custom callback with the PyTorch [`Trainer`]:
```python
class MyCallback(TrainerCallback):
    "A callback that prints a message at the beginning of training"

    def on_train_begin(self, args, state, control, **kwargs):
        print("Starting training")


trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
)
```
Another way to register a callback is to call `trainer.add_callback()` as follows:
```python
trainer = Trainer(...)
trainer.add_callback(MyCallback)
# Alternatively, we can pass an instance of the callback class
trainer.add_callback(MyCallback())
```
## TrainerState
[[autodoc]] TrainerState
## TrainerControl
[[autodoc]] TrainerControl

View File

@ -1,115 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Callbacks
-----------------------------------------------------------------------------------------------------------------------
Callbacks are objects that can customize the behavior of the training loop in the PyTorch
:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow) that can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
stopping).
Callbacks are "read only" pieces of code, apart from the :class:`~transformers.TrainerControl` object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).
By default a :class:`~transformers.Trainer` will use the following callbacks:
- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
it's the second one).
- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
or tensorboardX).
- :class:`~transformers.integrations.WandbCallback` if `wandb <https://www.wandb.com/>`__ is installed.
- :class:`~transformers.integrations.CometCallback` if `comet_ml <https://www.comet.ml/site/>`__ is installed.
- :class:`~transformers.integrations.MLflowCallback` if `mlflow <https://www.mlflow.org/>`__ is installed.
- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk <https://pypi.org/project/azureml-sdk/>`__ is
installed.
The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
:class:`~transformers.TrainerControl`.
Available Callbacks
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Here is the list of the available :class:`~transformers.TrainerCallback` in the library:
.. autoclass:: transformers.integrations.CometCallback
:members: setup
.. autoclass:: transformers.DefaultFlowCallback
.. autoclass:: transformers.PrinterCallback
.. autoclass:: transformers.ProgressCallback
.. autoclass:: transformers.EarlyStoppingCallback
.. autoclass:: transformers.integrations.TensorBoardCallback
.. autoclass:: transformers.integrations.WandbCallback
:members: setup
.. autoclass:: transformers.integrations.MLflowCallback
:members: setup
.. autoclass:: transformers.integrations.AzureMLCallback
TrainerCallback
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerCallback
:members:
Here is an example of how to register a custom callback with the PyTorch :class:`~transformers.Trainer`:
.. code-block:: python
class MyCallback(TrainerCallback):
"A callback that prints a message at the beginning of training"
def on_train_begin(self, args, state, control, **kwargs):
print("Starting training")
trainer = Trainer(
model,
args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
callbacks=[MyCallback] # We can either pass the callback class this way or an instance of it (MyCallback())
)
Another way to register a callback is to call ``trainer.add_callback()`` as follows:
.. code-block:: python
trainer = Trainer(...)
trainer.add_callback(MyCallback)
# Alternatively, we can pass an instance of the callback class
trainer.add_callback(MyCallback())
TrainerState
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerState
:members:
TrainerControl
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.TrainerControl
:members:

View File

@ -0,0 +1,28 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Configuration
The base class [`PretrainedConfig`] implements the common methods for loading/saving a configuration
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).
Each derived config class implements model-specific attributes. Common attributes present in all config classes are
`hidden_size`, `num_attention_heads`, and `num_hidden_layers`. Text models further implement `vocab_size`.
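In practice a configuration is rarely written by hand; it is loaded and saved with the shared
`from_pretrained`/`save_pretrained` methods. A minimal sketch (the checkpoint name and output directory are
illustrative):

```python
from transformers import BertConfig

# Download the configuration of a pretrained checkpoint and tweak an attribute
config = BertConfig.from_pretrained("bert-base-uncased")
config.num_hidden_layers = 6

# Persist the modified configuration to a local directory
config.save_pretrained("./my-config")
```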
## PretrainedConfig
[[autodoc]] PretrainedConfig
- push_to_hub
- all

@@ -1,31 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Configuration
-----------------------------------------------------------------------------------------------------------------------
The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
from HuggingFace's AWS S3 repository).
Each derived config class implements model-specific attributes. Common attributes present in all config classes are
:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement
:obj:`vocab_size`.
PretrainedConfig
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.PretrainedConfig
:special-members: push_to_hub
:members:

@@ -0,0 +1,64 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Data Collator
Data collators are objects that form a batch from a list of dataset elements. These elements are of the same type as
the elements of `train_dataset` or `eval_dataset`.
To be able to build batches, data collators may apply some processing (like padding). Some of them (like
[`DataCollatorForLanguageModeling`]) also apply some random data augmentation (like random masking)
on the formed batch.
Examples of use can be found in the [example scripts](../examples) or [example notebooks](../notebooks).
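As a concrete illustration, here is a minimal sketch of dynamic padding with [`DataCollatorWithPadding`] (the
checkpoint and sample sentences are illustrative):

```python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Two dataset elements of different lengths...
features = [tokenizer("Hello world"), tokenizer("A slightly longer sentence than the first")]

# ...are padded to a common length when the batch is formed
batch = data_collator(features)
print(batch["input_ids"].shape)
```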
## Default data collator
[[autodoc]] data.data_collator.default_data_collator
## DefaultDataCollator
[[autodoc]] data.data_collator.DefaultDataCollator
## DataCollatorWithPadding
[[autodoc]] data.data_collator.DataCollatorWithPadding
## DataCollatorForTokenClassification
[[autodoc]] data.data_collator.DataCollatorForTokenClassification
## DataCollatorForSeq2Seq
[[autodoc]] data.data_collator.DataCollatorForSeq2Seq
## DataCollatorForLanguageModeling
[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorForWholeWordMask
[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorForPermutationLanguageModeling
[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens

@@ -1,78 +0,0 @@
..
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Data Collator
-----------------------------------------------------------------------------------------------------------------------
Data collators are objects that form a batch from a list of dataset elements. These elements are of the same type as
the elements of :obj:`train_dataset` or :obj:`eval_dataset`.
To be able to build batches, data collators may apply some processing (like padding). Some of them (like
:class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
on the formed batch.
Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.
Default data collator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autofunction:: transformers.data.data_collator.default_data_collator
DefaultDataCollator
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DefaultDataCollator
:members:
DataCollatorWithPadding
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorWithPadding
:members:
DataCollatorForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForTokenClassification
:members:
DataCollatorForSeq2Seq
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForSeq2Seq
:members:
DataCollatorForLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
DataCollatorForWholeWordMask
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
DataCollatorForPermutationLanguageModeling
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
:members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens

File diff suppressed because it is too large

File diff suppressed because it is too large

@@ -0,0 +1,38 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Feature Extractor
A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
from sequences, *e.g.*, pre-processing audio files into Log-Mel spectrogram features, feature extraction from images,
*e.g.*, cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow
tensors.
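For example, here is a minimal sketch of preparing a raw audio array with a pretrained extractor (the checkpoint name
and the dummy audio are illustrative):

```python
import numpy as np
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")

# One second of silent mono audio sampled at 16 kHz
raw_audio = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(raw_audio, sampling_rate=16000, return_tensors="pt")
print(inputs["input_values"].shape)
```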
## FeatureExtractionMixin
[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
- from_pretrained
- save_pretrained
## SequenceFeatureExtractor
[[autodoc]] SequenceFeatureExtractor
- pad
## BatchFeature
[[autodoc]] BatchFeature
## ImageFeatureExtractionMixin
[[autodoc]] image_utils.ImageFeatureExtractionMixin

@@ -1,48 +0,0 @@
..
Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
Feature Extractor
-----------------------------------------------------------------------------------------------------------------------
A feature extractor is in charge of preparing input features for a multi-modal model. This includes feature extraction
from sequences, *e.g.*, pre-processing audio files into Log-Mel spectrogram features, feature extraction from images,
*e.g.*, cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow
tensors.
FeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.feature_extraction_utils.FeatureExtractionMixin
:members: from_pretrained, save_pretrained
SequenceFeatureExtractor
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.SequenceFeatureExtractor
:members: pad
BatchFeature
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.BatchFeature
:members:
ImageFeatureExtractionMixin
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autoclass:: transformers.image_utils.ImageFeatureExtractionMixin
:members:

@@ -0,0 +1,24 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Keras callbacks
When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
tasks:
## KerasMetricCallback
[[autodoc]] KerasMetricCallback
## PushToHubCallback
[[autodoc]] PushToHubCallback
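As a usage illustration, here is a minimal sketch wiring both callbacks into `model.fit()` (the `compute_metrics`
function, the `tf.data` datasets, and the tokenizer are assumed to exist already):

```python
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

# Compute custom metrics on the validation set at the end of each epoch
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)

# Save and upload the model to the Hugging Face Hub as training progresses
push_callback = PushToHubCallback(output_dir="./model_checkpoints", tokenizer=tokenizer)

model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=3, callbacks=[metric_callback, push_callback])
```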

Some files were not shown because too many files have changed in this diff