update kernel name

2025-11-15 07:04:49 +08:00 · 2025-10-31 10:54:57 +00:00
1124 changed files with 24124 additions and 15438 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -46,8 +46,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt || true
-            - run: python utils/tests_fetcher.py --filter_tests || true
+            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
+            - run: python utils/tests_fetcher.py --filter_tests
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
@ -98,8 +98,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt || true
-            - run: python utils/tests_fetcher.py --filter_tests || true
+            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
+            - run: python utils/tests_fetcher.py --filter_tests
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -185,8 +185,8 @@ class CircleCIJob:
            },
            # While the CircleCI docker images are being built, the data may or may not have been downloaded already.
            # If it has been, the files are inside the directory `/test_data/`.
-            # {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
-            # {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
+            {"run": {"name": "fetch hub objects before pytest", "command": "cp -r /test_data/* . 2>/dev/null || true; python3 utils/fetch_hub_objects_for_ci.py"}},
+            {"run": {"name": "download and unzip hub cache", "command": 'curl -L -o huggingface-cache.tar.gz https://huggingface.co/datasets/hf-internal-testing/hf_hub_cache/resolve/main/huggingface-cache.tar.gz && apt-get install pigz && tar --use-compress-program="pigz -d -p 8" -xf huggingface-cache.tar.gz && mv -n hub/* /root/.cache/huggingface/hub/ && ls -la /root/.cache/huggingface/hub/'}},
            {"run": {
                "name": "Run tests",
                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -32,16 +32,16 @@ jobs:
      options: --gpus all --privileged --ipc host
    steps:
      - name: Get repo
-        uses: actions/checkout@v5
+        uses: actions/checkout@v4
        with:
-          fetch-depth: 1
+          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Install benchmark script dependencies
        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix

      - name: Run benchmark
        run: |
@ -52,7 +52,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
+          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -97,7 +97,7 @@ jobs:
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
-      group: aws-general-8-plus
+      group: aws-g4dn-2xlarge-cache
    steps:
      -
        name: Set up Docker Buildx
--- a/.github/workflows/check-workflow-permissions.yml
+++ b/.github/workflows/check-workflow-permissions.yml
@ -1,23 +0,0 @@
---
-name: Check Permissions Advisor
-
-on:
-  workflow_dispatch:
-    inputs:
-      workflow_name:
-        description: 'Workflow file name'
-        type: string
-      run_count:
-        description: 'Number of runs to analyze'
-        type: string
-        default: "10"
-
-jobs:
-  advisor:
-    uses: huggingface/security-workflows/.github/workflows/permissions-advisor-reusable.yml@main
-    permissions:
-      actions: read
-      contents: read
-    with:
-      workflow_name: ${{ inputs.workflow_name }}
-      run_count: ${{ fromJSON(inputs.run_count) }}
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -6,6 +6,9 @@ on:
      docker:
        required: true
        type: string
+      start_sha:
+        required: true
+        type: string
      job:
        required: true
        type: string
@ -21,13 +24,7 @@ on:
      commit_sha:
        required: false
        type: string
-      pr_number:
-        required: false
-        type: string
-    outputs:
-      report:
-        description: "Content of the report of new failures"
-        value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }}
+

 env:
  HF_HOME: /mnt/cache
@ -64,15 +61,13 @@ jobs:
      - name: Check file
        id: check_file
        working-directory: /transformers
-        env:
-          job: ${{ inputs.job }}
        run: |
-          if [ -f "ci_results_${job}/new_failures.json" ]; then
-            echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
+          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
+            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
            echo "process=true" >> $GITHUB_ENV
            echo "process=true" >> $GITHUB_OUTPUT
          else
-            echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
+            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
            echo "process=false" >> $GITHUB_ENV
            echo "process=false" >> $GITHUB_OUTPUT
          fi
@ -93,62 +88,27 @@ jobs:
            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
          fi

+          if [ -f setup_values/other_workflow_run_id.txt ]; then
+            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
+          else
+            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
+          fi
+
      - name: Update clone
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: |
-          git fetch origin "$commit_sha" && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

-      - name: Get `START_SHA`
+      - name: Get target commit
        working-directory: /transformers/utils
        if: ${{ env.process == 'true' }}
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
-          echo "START_SHA=$commit_sha" >> $GITHUB_ENV
+          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV

-      # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
-      - name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
-        id: pr_info
-        if: ${{ env.process == 'true' && inputs.pr_number != '' }}
-        uses: actions/github-script@v6
-        with:
-          script: |            
-            const { data: pr } = await github.rest.pulls.get({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            const { data: merge_commit }  = await github.rest.repos.getCommit({
-              owner: pr.base.repo.owner.login,
-              repo: pr.base.repo.name,
-              ref: '${{ inputs.commit_sha }}',
-            });
-
-            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
-
-      # Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
-      # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
-      - name: Get `END_SHA` from previous CI runs of the same workflow
-        working-directory: /transformers/utils
-        if: ${{ env.process == 'true' && inputs.pr_number == '' }}
-        env:
-          ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-        run: |
-          echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
-
-      # However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
-      # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
-      # see if a reported failing test is actually ONLY failing on the `merge_commit`.
-      - name: Set `END_SHA`
-        if: ${{ env.process == 'true' && inputs.pr_number != '' }}
-        env:
-          merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
-        run: |
-          echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV
+      - name: Checkout to `start_sha`
+        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
+        run: git fetch && git checkout ${{ inputs.start_sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -178,20 +138,14 @@ jobs:
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        env:
-          job: ${{ inputs.job }}
-          run_idx: ${{ matrix.run_idx }}
-        run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        env:
-          job: ${{ inputs.job }}
-          run_idx: ${{ matrix.run_idx }}
        run: |
-          ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
-          cat "new_failures_with_bad_commit_${job}_${run_idx}.json"
+          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@ -205,8 +159,6 @@ jobs:
    if: needs.check_new_failures.outputs.process == 'true'
    runs-on:
      group: aws-g5-4xlarge-cache
-    outputs:
-      report: ${{ steps.set_output.outputs.report }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -224,28 +176,32 @@ jobs:

      - name: Check files
        working-directory: /transformers
-        env:
-          job: ${{ inputs.job }}
        run: |
          ls -la /transformers
-          ls -la "/transformers/new_failures_with_bad_commit_${job}"
+          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}

      # Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
      # to further reduce the false positives caused by flaky tests, which would require further processing to merge reports.
      - name: Merge files
        shell: bash
        working-directory: /transformers
-        env:
-          job: ${{ inputs.job }}
        run: |
-          cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json
+          cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json

      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+
+      - name: Process report
+        shell: bash
        working-directory: /transformers
        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
+          JOB_NAME: ${{ inputs.job }}
+          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
        run: |
-          git fetch origin "$commit_sha" && git checkout "$commit_sha"
+          python3 utils/process_bad_commit_report.py

      - name: Process report
        shell: bash
@ -262,37 +218,11 @@ jobs:
            echo EOF
          } >> "$GITHUB_ENV"

-      # The output is useful if a caller needs more processing, for example, we have a chain
-      # self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml),
-      # and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page.
-      - name: Show results & Set outputs
-        id: set_output
-        working-directory: /transformers
-        run: |
-          ls -l new_failures_with_bad_commit.json
-          cat new_failures_with_bad_commit.json
-
-          {
-            echo 'report<<EOF'
-            cat new_failures_with_bad_commit.json
-            echo ''  # Force a newline
-            echo EOF
-          } >> "$GITHUB_OUTPUT"
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: new_failures_with_bad_commit_${{ inputs.job }}
-          path: /transformers/new_failures_with_bad_commit.json
-
      - name: Prepare Slack report title
        working-directory: /transformers
-        env:
-          ci_event: ${{ inputs.ci_event }}
-          job: ${{ inputs.job }}
        run: |
          pip install slack_sdk
-          echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
+          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

      - name: Send processed report
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -1,22 +0,0 @@
---
-name: CodeQL Security Analysis
-
-on:
-  push:
-    branches: ["main", "fix_security_issue_*"]
-  # pull_request:
-  #   branches: ["main"]
-  workflow_dispatch:
-
-jobs:
-  codeql:
-    name: CodeQL Analysis
-    uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
-    permissions:
-      security-events: write
-      packages: read
-      actions: read
-      contents: read
-    with:
-      languages: '["actions"]'
-      queries: 'security-extended,security-and-quality'
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -39,9 +39,6 @@ on:
      PR_MERGE_COMMIT_SHA:
        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
-      PR_MERGE_COMMIT_BASE_SHA:
-        description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
      PR_HEAD_COMMIT_DATE:
        description: "The date of the head sha of the pull request branch in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
@ -77,7 +74,6 @@ jobs:
      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
-      PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
@ -126,7 +122,6 @@ jobs:
            core.setOutput('base_ref', pr.base.ref);
            core.setOutput('head_sha', pr.head.sha);
            core.setOutput('base_sha', pr.base.sha);
-            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
            core.setOutput('pr', pr);

@ -147,21 +142,16 @@ jobs:
              date: merge_commit.commit.committer.date
            });

-            console.log('PR Info:', {
-              pr_info: pr
-            });
-
      - name: Convert dates to timestamps
        id: get_timestamps
-        env:
-          head_commit_date: ${{ steps.pr_info.outputs.head_commit_date }}
-          merge_commit_date: ${{ steps.pr_info.outputs.merge_commit_date }}
        run: |
-          echo "$head_commit_date"
-          echo "$merge_commit_date"
+          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
+          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
+          echo $head_commit_date
+          echo $merge_commit_date
          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
-          echo "$head_commit_timestamp"
-          echo "$merge_commit_timestamp"
+          echo $head_commit_timestamp
+          echo $merge_commit_timestamp
          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
-          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
+          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -15,19 +15,13 @@ jobs:
    steps:
      - name: Get PR number
        shell: bash
-        env:
-          issue_number: ${{ github.event.issue.number }}
-          is_pull_request_issue: ${{ github.event.issue.pull_request != null }}
-          pr_number: ${{ github.event.pull_request.number }}
-          is_pull_request: ${{ github.event.pull_request != null }}
-          event_number: ${{ github.event.number }}
        run: |
-          if [[ "$issue_number" != "" && "$is_pull_request_issue" == "true" ]]; then
-            echo "PR_NUMBER=$issue_number" >> $GITHUB_ENV
-          elif [[ "$pr_number" != "" ]]; then
-            echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
-          elif [[ "$is_pull_request" == "true" ]]; then
-            echo "PR_NUMBER=$event_number" >> $GITHUB_ENV
+          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
          else
            echo "PR_NUMBER=" >> $GITHUB_ENV
          fi
@ -35,8 +29,8 @@ jobs:
      - name: Check PR number
        shell: bash
        run: |
-          echo "$PR_NUMBER"
+          echo "${{ env.PR_NUMBER }}"

      - name: Set PR number
        id: set_pr_number
-        run: echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_OUTPUT"
+        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -62,33 +62,25 @@ jobs:
    steps:
      - name: Echo input and matrix info
        shell: bash
-        env:
-          folder_slices: ${{ inputs.folder_slices }}
-          matrix_folders: ${{ matrix.folders }}
-          slice_data: ${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}
        run: |
-          echo "$folder_slices"
-          echo "$matrix_folders"
-          echo "$slice_data"
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
-        env:
-          matrix_folders_raw: ${{ matrix.folders }}
        run: |
-          echo "$matrix_folders_raw"
-          matrix_folders="${matrix_folders_raw/'models/'/'models_'}"
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: |
-          git fetch origin "$commit_sha" && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -123,17 +115,15 @@ jobs:
        id: set_machine_type
        working-directory: /transformers
        shell: bash
-        env:
-          input_machine_type: ${{ inputs.machine_type }}
        run: |
-          echo "$input_machine_type"
+          echo "${{ inputs.machine_type }}"

-          if [ "$input_machine_type" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "$input_machine_type" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type="$input_machine_type"
+            machine_type=${{ inputs.machine_type }}
          fi

          echo "$machine_type"
@ -142,21 +132,15 @@ jobs:

      - name: Create report directory if it doesn't exist
        shell: bash
-        env:
-          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          mkdir -p "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
-          echo "dummy" > "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/dummy.txt"
-          ls -la "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
+          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
+          ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

      - name: Run all tests on GPU
        working-directory: /transformers
-        env:
-          report_name_prefix: ${{ inputs.report_name_prefix }}
-          pytest_marker: ${{ inputs.pytest_marker }}
-          model: ${{ matrix.folders }}
        run: |
-          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports python3 -m pytest -rsfE -v -m '${pytest_marker}' --make-reports=${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports tests/${model}" test_outputs.txt
+          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
          ls -la
          # Extract the exit code from the output file
          EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
@ -167,25 +151,19 @@ jobs:
        # This step is only to show information on Github Actions log.
        # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
        continue-on-error: true
-        env:
-          report_name_prefix: ${{ inputs.report_name_prefix }}
-        run: cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/failures_short.txt"
+        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt

      - name: Captured information
        if: ${{ failure() }}
        continue-on-error: true
-        env:
-          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"
+          cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt

      - name: Copy test_outputs.txt
        if: ${{ always() }}
        continue-on-error: true
-        env:
-          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          cp /transformers/test_outputs.txt "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
+          cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
@ -196,7 +174,7 @@ jobs:

  collated_reports:
    name: Collated Reports
-    if: ${{ always() && inputs.runner_type != '' }}
+    if: ${{ always() }}
    needs: run_models_gpu
    uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
    with:
--- a/.github/workflows/pr_slow_ci_suggestion.yml
+++ b/.github/workflows/pr_slow_ci_suggestion.yml
@ -1,4 +1,4 @@
-name: PR slow CI - Suggestion
+name: PR slow CI
 on:
  pull_request_target:
    types: [opened, synchronize, reopened]
@ -23,28 +23,11 @@ jobs:
    outputs:
      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
    steps:
-      # This checkout to the main branch
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-
-      # We need to use `${{ ... }}` here to avoid `Argument list too long` error when a PR changes a lot of files.
-      # (We could also try to use artifact approach, but it's more involved).
-      # `CodeQL` doesn't identify any security issue here. Also `PR_FILES` is from `get-pr-info.yml` by using an api
-      # `github.rest.pulls.listFiles`, which is fine.
-      - name: Write pr_files file
-        run: |
-          cat > pr_files.txt << 'EOF'
-          ${{ needs.get-pr-info.outputs.PR_FILES }}
-          EOF
-
      - name: Get repository content
        id: repo_content
        uses: actions/github-script@v6
        with:
          script: |
-            const fs = require('node:fs');
-
            const { data: tests_dir } = await github.rest.repos.getContent({
              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
@ -66,10 +49,38 @@ jobs:
              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
            });

-            // Write to files instead of outputs
-            fs.writeFileSync('tests_dir.txt', JSON.stringify(tests_dir, null, 2));
-            fs.writeFileSync('tests_models_dir.txt', JSON.stringify(tests_models_dir, null, 2));
-            fs.writeFileSync('tests_quantization_dir.txt', JSON.stringify(tests_quantization_dir, null, 2));
+            core.setOutput('tests_dir', tests_dir);
+            core.setOutput('tests_models_dir', tests_models_dir);
+            core.setOutput('tests_quantization_dir', tests_quantization_dir);
+
+      # This checks out the main branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      - name: Write pr_files file
+        run: |
+          cat > pr_files.txt << 'EOF'
+          ${{ needs.get-pr-info.outputs.PR_FILES }}
+          EOF
+
+      - name: Write tests_dir file
+        run: |
+          cat > tests_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_dir }}
+          EOF
+
+      - name: Write tests_models_dir file
+        run: |
+          cat > tests_models_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_models_dir }}
+          EOF
+
+      - name: Write tests_quantization_dir file
+        run: |
+          cat > tests_quantization_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_quantization_dir }}
+          EOF

      - name: Run script to get jobs to run
        id: get_jobs
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -153,5 +153,5 @@ jobs:
      ci_event: push
      report_repo_id: hf-internal-testing/transformers_ci_push
      commit_sha: ${{ github.sha }}
-      subdirs: ${{ needs.get_modified_models.outputs.matrix }}
+      models: ${{ needs.get_modified_models.outputs.matrix }}
    secrets: inherit
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -23,34 +23,62 @@ env:
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1

-
 jobs:
  get-pr-number:
+    runs-on: ubuntu-22.04
    name: Get PR number
+    # For security: only allow team members to run
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
-    uses: ./.github/workflows/get-pr-number.yml
+    outputs:
+      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
+    steps:
+      - name: Get PR number
+        shell: bash
+        run: |
+          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
+          else
+            echo "PR_NUMBER=" >> $GITHUB_ENV
+          fi

-  get-pr-info:
-    name: Get PR commit SHA
+      - name: Check PR number
+        shell: bash
+        run: |
+          echo "${{ env.PR_NUMBER }}"
+
+      - name: Set PR number
+        id: set_pr_number
+        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
+
+  get-sha:
+    runs-on: ubuntu-22.04
    needs: get-pr-number
    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  check-timestamps:
-    name: Check timestamps (security check)
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
    outputs:
-      PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
-      PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
+      PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
+      PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
    steps:
-      - name: Verify `merge_commit` timestamp is older than the issue comment timestamp
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
+
+      - name: Get SHA (and verify timestamps against the issue comment date)
+        id: get_sha
        env:
+          PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
          COMMENT_DATE: ${{ github.event.comment.created_at }}
-          PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
        run: |
+            git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
+            git checkout refs/remotes/pull/$PR_NUMBER/head
+            echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
+            echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
+            git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
+            git checkout refs/remotes/pull/$PR_NUMBER/merge
+            echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
+            echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
+            PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
+            echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
            COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
            echo "COMMENT_DATE: $COMMENT_DATE"
            echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
@ -59,10 +87,13 @@ jobs:
              exit -1;
            fi

-  # use a python script to handle this complex logic.
+  # use a python script to handle this complex logic
+  # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
+  # case 2: `run-slow model_1, model_2`
  get-tests:
    runs-on: ubuntu-22.04
-    needs: [get-pr-number, check-timestamps]
+    needs: [get-pr-number, get-sha]
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
    outputs:
      models: ${{ steps.models_to_run.outputs.models }}
      quantizations: ${{ steps.models_to_run.outputs.quantizations }}
@ -70,11 +101,11 @@ jobs:
      - uses: actions/checkout@v4
        with:
          fetch-depth: "0"
-          ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge"
+          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"

      - name: Verify merge commit SHA
        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
+          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
        run: |
            PR_MERGE_SHA=$(git log -1 --format=%H)
            if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
@ -95,33 +126,11 @@ jobs:
      - name: Show models to test
        id: models_to_run
        run: |
-          echo "$models"
-          echo "models=$models" >> $GITHUB_OUTPUT
-          echo "$quantizations"
-          echo "quantizations=$quantizations" >> $GITHUB_OUTPUT
-
-  # Report back if we are not able to get the tests (for example, security check is failing)
-  report_error_earlier:
-    name: Report error earlier
-    if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }}
-    needs: [get-pr-number, get-pr-info, get-tests]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Reply to the comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          github_repository: ${{ github.repository }}
-          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-        run: |
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "repos/${github_repository}/issues/${pr_number}/comments" \
-            -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!"
+          # `models` / `quantizations` are read as shell variables (they are exported from the job env)
+          # instead of being spliced into the script text, so their content cannot be executed as shell code.
+          echo "$models"
+          echo "models=$models" >> "$GITHUB_ENV"
+          echo "models=$models" >> "$GITHUB_OUTPUT"
+          echo "$quantizations"
+          echo "quantizations=$quantizations" >> "$GITHUB_OUTPUT"

  reply_to_comment:
    name: Reply to the comment
@ -134,20 +143,20 @@ jobs:
      - name: Reply to the comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}'
-          github_repository: ${{ github.repository }}
-          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+          MODELS: ${{ needs.get-tests.outputs.models }}
+          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
+          # The comment body is taken from the BODY environment variable of this step rather than
+          # being interpolated into the script, so quotes or `$(...)` in the model list stay inert.
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "repos/${github_repository}/issues/${pr_number}/comments" \
-            -f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e "$BODY")"
+            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
+            -f "body=This comment contains run-slow, running the specified jobs: $BODY ..."

  create_run:
    name: Create run
-    needs: [check-timestamps, reply_to_comment]
+    if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
+    needs: [get-sha, get-tests, reply_to_comment]
    permissions:
      statuses: write
    runs-on: ubuntu-22.04
@ -159,196 +168,248 @@ jobs:
          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          github_repository: ${{ github.repository }}
-          pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "repos/${github_repository}/statuses/${pr_head_sha}" \
+            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"

-  model-ci:
-    name: Model CI
+  run_models_gpu:
+    name: Run all tests for the model
    if: ${{ needs.get-tests.outputs.models != '[]' }}
-    uses: ./.github/workflows/self-scheduled.yml
-    needs: [get-pr-number, check-timestamps, get-tests, create_run]
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-pr"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: PR Comment CI
-      report_repo_id: hf-internal-testing/transformers_pr_ci
-      commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
-      subdirs: ${{ needs.get-tests.outputs.models }}
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-    secrets: inherit
+    needs: [get-pr-number, get-sha, get-tests, create_run]
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+       group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"

-  quantization-ci:
-    name: Quantization CI
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Checkout to PR merge commit
+        working-directory: /transformers
+        run: |
+          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
+          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
+          git log -1 --format=%H
+
+      - name: Verify merge commit SHA
+        env:
+          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
+        working-directory: /transformers
+        run: |
+          # Compare the checked-out merge commit with the SHA verified in the `get-sha` job.
+          # Both operands are quoted: with an empty value, an unquoted `[ a != ]` is a syntax error,
+          # which `if` treats as false, so the security check would be skipped silently.
+          PR_MERGE_SHA=$(git log -1 --format=%H)
+          if [ "$PR_MERGE_SHA" != "$VERIFIED_PR_MERGE_SHA" ]; then
+            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
+            exit 1;
+          fi
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: |
+          export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
+          echo $CUDA_VISIBLE_DEVICES
+          python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Make sure report directory exists
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+
+  run_quantization_torch_gpu:
+    name: Run all tests for a quantization
    if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
-    uses: ./.github/workflows/self-scheduled.yml
-    needs: [get-pr-number, check-timestamps, get-tests, create_run]
-    with:
-      job: run_quantization_torch_gpu
-      slack_report_channel: "#transformers-ci-pr"
-      docker: huggingface/transformers-quantization-latest-gpu
-      ci_event: PR Comment CI
-      report_repo_id: hf-internal-testing/transformers_pr_ci
-      commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
-      subdirs: ${{ needs.get-tests.outputs.quantizations }}
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-    secrets: inherit
+    needs: [get-pr-number, get-sha, get-tests, create_run]
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-quantization-latest-gpu
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

-  report:
-    name: Check & Report
-    needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci]
+      - name: Checkout to PR merge commit
+        working-directory: /transformers
+        run: |
+          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
+          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
+          git log -1 --format=%H
+
+      - name: Verify merge commit SHA
+        env:
+          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
+        working-directory: /transformers
+        run: |
+          # Compare the checked-out merge commit with the SHA verified in the `get-sha` job.
+          # Both operands are quoted: with an empty value, an unquoted `[ a != ]` is a syntax error,
+          # which `if` treats as false, so the security check would be skipped silently.
+          PR_MERGE_SHA=$(git log -1 --format=%H)
+          if [ "$PR_MERGE_SHA" != "$VERIFIED_PR_MERGE_SHA" ]; then
+            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
+            exit 1;
+          fi
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run quantization tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Make sure report directory exists
+        shell: bash
+        run: |
+          # The directory name must be the same `run_quantization_torch_gpu` id that pytest `--make-reports`
+          # writes to and that the artifact upload step reads from; otherwise the upload path may not exist.
+          mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
+
+  update_run_status:
+    name: Update Check Run Status
+    needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
    permissions:
-      pull-requests: write
      statuses: write
    if: ${{ always() && needs.create_run.result == 'success' }}
    runs-on: ubuntu-22.04
+    env:
+      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
    steps:
-      - name: Show reports from jobs
-        env:
-          MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
-          QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
+      - name: Get `run_models_gpu` job status
        run: |
-          echo "$MODEL_REPORT"
-          echo "$QUANT_REPORT"
-
-      - name: Process and filter reports
-        env:
-          MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
-          QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
-        run: |
-          # Preprocess with Python
-          python3 << 'PYTHON_SCRIPT'
-          import json
-          import os
-          
-          def filter_and_format_report(data):
-            """
-            Filter out entries where commit is `None` (failing tests who status is not certain) and format as text
-            """
-            lines = []
-            
-            for model, model_result in data.items():
-                model_lines = []
-                for device, failures in model_result.items():
-                    
-                    # Filter out None commits and extract just the test names
-                    test_names = [
-                        failure['test'] 
-                        for failure in failures 
-                        if isinstance(failure, dict) and failure.get('commit') is not None
-                    ]
-
-                    # Add tests to model lines
-                    for idx, test_name in enumerate(test_names):
-                        if idx == 0:
-                            job_link = failures[idx]['job_link']
-                            model_lines.append(f"- [{model}]({job_link}):")
-          
-                        model_lines.append(f"    {test_name}")
-
-                # Only add model section if it has tests
-                if len(model_lines) > 0:
-                    lines.extend(model_lines)
-                    lines.append("")  # Empty line between models
-            
-            return "\n".join(lines).strip()
-          
-          # Load and filter reports
-          model_report_str = os.environ.get('MODEL_REPORT', '{}')
-          quant_report_str = os.environ.get('QUANT_REPORT', '{}')
-          
-          model_report = json.loads(model_report_str) if model_report_str else {}
-          quant_report = json.loads(quant_report_str) if quant_report_str else {}
-          
-          formatted_model = filter_and_format_report(model_report)
-          formatted_quant = filter_and_format_report(quant_report)
-          
-          # Write to files
-          with open('model_ci.txt', 'w') as f:
-              f.write(formatted_model)
-              if formatted_model:
-                  f.write('\n')
-          
-          with open('quantization_ci.txt', 'w') as f:
-              f.write(formatted_quant)
-              if formatted_quant:
-                  f.write('\n')
-          PYTHON_SCRIPT
-
-      - name: Post results as PR comment
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          github_repository: ${{ github.repository }}
-          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-          model_ci_result: ${{ needs.model-ci.result }}
-          quantization_ci_result: ${{ needs.quantization-ci.result }}
-        run: |
-          {
-            echo '## CI Results'
-            echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)"
-            echo ''
-
-            # Check if both jobs were skipped or cancelled
-            if [[ "$model_ci_result" == "skipped" || "$model_ci_result" == "cancelled" ]] && \
-               [[ "$quantization_ci_result" == "skipped" || "$quantization_ci_result" == "cancelled" ]]; then
-              echo '⚠️ No test being reported (jobs are skipped or cancelled)!'
-              echo "STATUS=error" >> $GITHUB_ENV
-
-            # Check if either file has content
-            elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then
-              echo "STATUS=failure" >> $GITHUB_ENV
-
-              # Check if model_ci.txt has content
-              if [ -s model_ci.txt ]; then
-                echo '### Model CI Report'
-                echo ''
-                echo '#### ❌ Failed tests'
-                echo ''
-                cat model_ci.txt
-                echo ''
-              fi
-              
-              # Check if quantization_ci.txt has content
-              if [ -s quantization_ci.txt ]; then
-                echo '### Quantization CI Report'
-                echo ''
-                echo '#### ❌ Failed tests'
-                echo ''
-                cat quantization_ci.txt
-                echo ''
-              fi
-            else
-              echo "STATUS=success" >> $GITHUB_ENV
-              echo '✅ No failing test specific to this PR 🎉 !'
-            fi
-          } > comment_body.txt
-
-          gh api \
-            --method POST \
-            -H "Accept: application/vnd.github+json" \
-            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "repos/${github_repository}/issues/${pr_number}/comments" \
-            -F body=@comment_body.txt
+          echo "${{ needs.run_models_gpu.result }}"
+          echo "${{ needs.run_quantization_torch_gpu.result }}"
+          echo $STATUS_OK
+          if [ "$STATUS_OK" = "true" ]; then
+            echo "STATUS=success" >> $GITHUB_ENV
+          else
+            echo "STATUS=failure" >> $GITHUB_ENV
+          fi

      - name: Update PR commit statuses
-        env:
-          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-          github_repository: ${{ github.repository }}
-          pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
-        # The env. variable `STATUS` used here is set in the previous step
        run: |
+          echo "${{ needs.run_models_gpu.result }}"
+          echo "${{ env.STATUS }}"
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            "repos/${github_repository}/statuses/${pr_head_sha}" \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=$STATUS" -f "description=Slow CI job" -f "context=pytest/custom-tests"
+            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
+            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
--- a/.github/workflows/self-nightly-caller.yml
+++ b/.github/workflows/self-nightly-caller.yml
@ -51,7 +51,6 @@ jobs:
      slack_report_channel: "#transformers-ci-past-future"
      docker: huggingface/transformers-all-latest-torch-nightly-gpu
      ci_event: Nightly CI
-      runner_type: "a10"
      report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
      commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)

 on:
  schedule:
-    - cron: "17 5 * * *"
+    - cron: "17 2 * * *"

 jobs:
  run_scheduled_amd_ci:
--- a/.github/workflows/self-scheduled-amd-mi355-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi355-caller.yml
@ -21,7 +21,7 @@ jobs:
      job: run_models_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
@ -33,7 +33,7 @@ jobs:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
@ -45,7 +45,7 @@ jobs:
      job: run_examples_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/transformers-pytorch-amd-gpu
+      docker: huggingface/testing-rocm7.0-preview
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -33,13 +33,10 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Setup
-        env:
-          prev_workflow_run_id: ${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}
-          other_workflow_run_id: ${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}
        run: |
          mkdir "setup_values"
-          echo "$prev_workflow_run_id" > "setup_values/prev_workflow_run_id.txt"
-          echo "$other_workflow_run_id" > "setup_values/other_workflow_run_id.txt"
+          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
+          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@ -121,15 +118,3 @@ jobs:
      report_repo_id: hf-internal-testing/transformers_daily_ci
      commit_sha: ${{ github.sha }}
    secrets: inherit
-
-  kernels-ci:
-    name: Kernels CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_kernels_gpu
-      slack_report_channel: "#transformers-ci-daily-kernels"
-      docker: huggingface/transformers-all-latest-gpu
-      ci_event: Daily CI
-      report_repo_id: hf-internal-testing/transformers_daily_ci
-      commit_sha: ${{ github.sha }}
-    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -34,20 +34,14 @@ on:
      runner_type:
        required: false
        type: string
-      subdirs:
+      models:
        default: ""
        required: false
        type: string
      pytest_marker:
        required: false
        type: string
-      pr_number:
-        required: false
-        type: string
-    outputs:
-      report:
-        description: "Content of the report of new failures"
-        value: ${{ jobs.check_new_failures.outputs.report }}
+

 env:
  HF_HOME: /mnt/cache
@ -60,6 +54,7 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1
+  NUM_SLICES: 2

 jobs:
  setup:
@ -80,11 +75,8 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
-          git fetch origin $commit_sha
-          git fetch && git checkout $commit_sha
+          git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Cleanup
        working-directory: /transformers
@ -101,17 +93,11 @@ jobs:
        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: /transformers/tests
-        env:
-          job: ${{ inputs.job }}
-          subdirs: ${{ inputs.subdirs }}
-          NUM_SLICES: 2
        run: |
-          if [ "$job" = "run_models_gpu" ]; then
-            python3 ../utils/split_model_tests.py --subdirs "$subdirs" --num_splits "$NUM_SLICES" > folder_slices.txt
-            echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
-            python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
-            echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
-          elif [ "$job" = "run_trainer_and_fsdp_gpu" ]; then
+          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
+            echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
+          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
          fi
@ -120,10 +106,8 @@ jobs:
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
        working-directory: /transformers/tests
-        env:
-          subdirs: ${{ inputs.subdirs || 'None' }}
        run: |
-          echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(os.environ["subdirs"]); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))); print(d)')" >> $GITHUB_OUTPUT
+          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT

      - name: NVIDIA-SMI
        run: |
@ -186,9 +170,7 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: git fetch && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -210,17 +192,15 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
-        env:
-          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "$matrix_machine_type"
+          echo "${{ matrix.machine_type }}"

-          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type="$matrix_machine_type"
+            machine_type=${{ matrix.machine_type }}
          fi

          echo "$machine_type"
@ -229,12 +209,12 @@ jobs:
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports="${machine_type}_run_pipelines_torch_gpu_test_reports" tests/pipelines
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat "/transformers/reports/${machine_type}_run_pipelines_torch_gpu_test_reports/failures_short.txt"
+        run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
@ -258,9 +238,7 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: git fetch && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -282,17 +260,15 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
-        env:
-          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "$matrix_machine_type"
+          echo "${{ matrix.machine_type }}"

-          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type="$matrix_machine_type"
+            machine_type=${{ matrix.machine_type }}
          fi

          echo "$machine_type"
@ -302,12 +278,12 @@ jobs:
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports="${machine_type}_run_examples_gpu_test_reports" examples/pytorch
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat "/transformers/reports/${machine_type}_run_examples_gpu_test_reports/failures_short.txt"
+        run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
@ -331,9 +307,7 @@ jobs:
    steps:
      - name: Update clone
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: git fetch && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
@ -355,7 +329,7 @@ jobs:
        working-directory: ${{ inputs.working-directory-prefix }}/
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
+          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
@ -365,7 +339,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -383,17 +357,15 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        shell: bash
-        env:
-          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "$matrix_machine_type"
+          echo "${{ matrix.machine_type }}"

-          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type="$matrix_machine_type"
+            machine_type=${{ matrix.machine_type }}
          fi

          echo "$machine_type"
@ -402,14 +374,12 @@ jobs:
      - name: Run all tests on GPU
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        run: |
-          python3 -m pytest -v --make-reports="${machine_type}_run_torch_cuda_extensions_gpu_test_reports" tests/deepspeed tests/extended
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        env:
-          working_directory_prefix: ${{ inputs.working-directory-prefix }}
-        run: cat "${working_directory_prefix}/transformers/reports/${machine_type}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt"
+        run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
@ -436,19 +406,16 @@ jobs:
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
-        env:
-          matrix_folders_raw: ${{ matrix.folders }}
        run: |
-          echo "$matrix_folders_raw"
-          matrix_folders="${matrix_folders_raw/'quantization/'/'quantization_'}"
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: git fetch && git checkout "$commit_sha"
+        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -470,17 +437,15 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
-        env:
-          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "$matrix_machine_type"
+          echo "${{ matrix.machine_type }}"

-          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type="$matrix_machine_type"
+            machine_type=${{ matrix.machine_type }}
          fi

          echo "$machine_type"
@ -488,96 +453,20 @@ jobs:

      - name: Run quantization tests on GPU
        working-directory: /transformers
-        env:
-          folders: ${{ matrix.folders }}
        run: |
-          python3 -m pytest -v --make-reports="${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports" tests/${folders}
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat "/transformers/reports/${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports/failures_short.txt"
+        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-
-  run_kernels_gpu:
-    if: ${{ inputs.job == 'run_kernels_gpu' }}
-    name: Kernel tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: ${{ inputs.docker }}
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        env:
-          commit_sha: ${{ inputs.commit_sha || github.sha }}
-        run: git fetch && git checkout "$commit_sha"
-
-      - name: Reinstall transformers in edit mode
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing]
-  
-      - name: Install kernels
-        working-directory: /transformers
-        run: python3 -m pip install -U kernels
-  
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-
-      - name: Environment
-        working-directory: /transformers
-        run: python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        env:
-          matrix_machine_type: ${{ matrix.machine_type }}
-        run: |
-          echo "$matrix_machine_type"
-
-          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type="$matrix_machine_type"
-          fi
-
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-    
-      - name: Run kernel tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -v --make-reports="${machine_type}_run_kernels_gpu_test_reports" tests/kernels/test_kernels.py
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat "/transformers/reports/${machine_type}_run_kernels_gpu_test_reports/failures_short.txt"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_kernels_gpu_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports

  run_extract_warnings:
    # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
@ -586,10 +475,11 @@ jobs:
    runs-on: ubuntu-22.04
    needs: [setup, run_models_gpu]
    steps:
-      # Checkout in order to run `utils/extract_warnings.py`. Avoid **explicit** checkout (i.e. don't specify `ref`) for
-      # security reason.
      - name: Checkout transformers
        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+          ref: ${{ inputs.commit_sha || github.sha }}

      - name: Install transformers
        run: pip install transformers
@ -609,12 +499,9 @@ jobs:
        working-directory: warnings_in_ci

      - name: Extract warnings in CI artifacts
-        env:
-          github_run_id: ${{ github.run_id }}
-          access_token: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: |
-          python3 utils/extract_warnings.py --workflow_run_id "$github_run_id" --output_dir warnings_in_ci --token "$access_token" --from_gh
-          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d); print(d)')"
+          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
+          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"

      - name: Upload artifact
        if: ${{ always() }}
@ -633,7 +520,6 @@ jobs:
      run_examples_gpu,
      run_torch_cuda_extensions_gpu,
      run_quantization_torch_gpu,
-      run_kernels_gpu,
      run_extract_warnings
    ]
    if: always() && !cancelled()
@ -653,17 +539,16 @@ jobs:
    secrets: inherit

  check_new_failures:
-    if: ${{ always() && needs.send_results.result == 'success' }}
+    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
    name: Check new failures
    needs: send_results
    uses: ./.github/workflows/check_failed_tests.yml
    with:
      docker: ${{ inputs.docker }}
-      commit_sha: ${{ inputs.commit_sha || github.sha }}
+      start_sha: ${{ inputs.commit_sha || github.sha }}
      job: ${{ inputs.job }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      ci_event: ${{ inputs.ci_event }}
      report_repo_id: ${{ inputs.report_repo_id }}
-      pr_number: ${{ inputs.pr_number }}

    secrets: inherit
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -41,16 +41,13 @@ jobs:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
-        env:
-          setup_status: ${{ inputs.setup_status }}
        run: |
-          echo "Setup status: $setup_status"
+          echo "Setup status: ${{ inputs.setup_status }}"

      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
-          # Security: checkout to the `main` branch for untrusted triggers (issue_comment, pull_request_target), otherwise use the specified ref
-          ref: ${{ (github.event_name == 'issue_comment' || github.event_name == 'pull_request_target') && 'main' || (inputs.commit_sha || github.sha) }}
+          ref: ${{ inputs.commit_sha || github.sha }}

      - uses: actions/download-artifact@v4

@ -84,8 +81,6 @@ jobs:
          CI_TEST_JOB: ${{ inputs.job }}
          SETUP_STATUS: ${{ inputs.setup_status }}
          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
-          quantization_matrix: ${{ inputs.quantization_matrix }}
-          folder_slices: ${{ inputs.folder_slices }}
        # We pass `needs.setup.outputs.matrix` as the argument. `notification_service.py` has to convert
        # `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
        # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
@ -94,10 +89,10 @@ jobs:
          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
-          if [ "$quantization_matrix" != "" ]; then
-            python utils/notification_service.py "$quantization_matrix"
+          if [ "${{ inputs.quantization_matrix }}" != "" ]; then
+            python utils/notification_service.py "${{ inputs.quantization_matrix }}"
          else
-            python utils/notification_service.py "$folder_slices"
+            python utils/notification_service.py "${{ inputs.folder_slices }}"
          fi

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -4,7 +4,7 @@ on:
  workflow_dispatch:
    inputs:
      runner_type:
-        description: 'Type of runner to test (a10)'
+        description: 'Type of runner to test (a10 or t4)'
        required: true
      docker_image:
        description: 'Name of the Docker image'
@ -36,10 +36,14 @@ jobs:
          NUM_GPUS: ${{ github.event.inputs.num_gpus }}
          RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
        run: |
-          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
-            echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
+          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
+            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
+          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
+            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
+          elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
+            echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
-            echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
+            echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
          else
            echo "RUNNER=" >> $GITHUB_ENV
          fi
@ -47,8 +51,8 @@ jobs:
      - name: Set runner to use
        id: set_runner
        run: |
-          echo "$RUNNER"
-          echo "RUNNER=$RUNNER" >> $GITHUB_OUTPUT
+          echo ${{ env.RUNNER }}
+          echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT

  ssh_runner:
    name: "SSH"
@ -57,13 +61,13 @@ jobs:
      group: ${{ needs.get_runner.outputs.RUNNER }}
    container:
      image: ${{ github.event.inputs.docker_image }}
+      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+
    steps:
      - name: Update clone
        working-directory: /transformers
-        env:
-          commit_sha: ${{ github.sha }}
        run: |
-          git fetch && git checkout "$commit_sha"
+          git fetch && git checkout ${{ github.sha }}

      - name: Cleanup
        working-directory: /transformers
@ -95,17 +99,14 @@ jobs:
      - name: Store Slack infos
        # The SSH session can be enabled dynamically if the workflow fails, so we store the Slack info here to be able to retrieve it during the waitforssh step.
        shell: bash
-        env:
-          user_slack_id: ${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}
-          default_slack_channel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
        run: |
-          echo "$github_actor"
-          if [ "$user_slack_id" != "" ]; then
-            echo "SLACKCHANNEL=$user_slack_id" >> $GITHUB_ENV
+          echo "${{ env.github_actor }}"
+          if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
+            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
          else
-            echo "SLACKCHANNEL=$default_slack_channel" >> $GITHUB_ENV
+            echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
          fi
-        
+
      - name: Tailscale # In order to be able to SSH when a test fails
        uses: huggingface/tailscale-action@main
        with:
--- a/1
+++ b/1
@ -45,7 +45,6 @@ repo-consistency:
 	python utils/check_modular_conversion.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
-	python utils/check_init_weights_data.py
 	python utils/check_inits.py
 	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@ -1,5 +1,6 @@
 gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
+torch>=2.4.0
 hf_xet
-pandas>=1.5.0
+pandas>=1.5.0
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@ -1,11 +1,8 @@
 import hashlib
-import itertools
 import json
 import logging
 from typing import Any

-from transformers.utils.import_utils import is_flash_attn_2_available
-

 KERNELIZATION_AVAILABLE = False
 try:
@ -21,22 +18,11 @@ logger = logging.getLogger(__name__)
 class BenchmarkConfig:
    """Configuration for a single benchmark scenario."""

-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),
-        ("flex_attention", None),
-    ]
-
-    all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
-
    def __init__(
        self,
        warmup_iterations: int = 5,
        measurement_iterations: int = 20,
        gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have observed it could heavily slow down benchmarks on AMD
-        continuous_batching: bool = False,
        batch_size: int = 1,
        sequence_length: int = 128,
        num_tokens_to_generate: int = 128,
@ -52,7 +38,6 @@ class BenchmarkConfig:
        self.warmup_iterations = warmup_iterations
        self.measurement_iterations = measurement_iterations
        self.gpu_monitoring = gpu_monitoring
-        self.continuous_batching = continuous_batching
        # Input parameters
        self.batch_size = batch_size
        self.sequence_length = sequence_length
@ -74,35 +59,12 @@ class BenchmarkConfig:
    def check_validity(self, skip_validity_check: bool = False) -> None:
        if skip_validity_check:
            return
-        # Check FA is installed
-        if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
-            logger.warning(
-                "Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
-            )
-            self.attn_implementation = "sdpa"
-            self.sdpa_backend = "flash_attention"
        # Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
        is_fa = self.attn_implementation == "flash_attention_2"
        is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
        if is_fa:
            logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
            self.compile_mode = None
-        # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
-        if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
-            default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
-            logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
-            self.sdpa_backend = default_backend
-        if self.continuous_batching:
-            if self.attn_implementation == "flex_attention":
-                logger.error(
-                    "disabling continuous batching because of invalid configuration: flex attention is not supported"
-                )
-                self.continuous_batching = False
-            elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
-                logger.warning(
-                    "when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
-                )
-                self.sdpa_backend = "math"

    @property
    def hash(self) -> str:
@ -118,7 +80,6 @@ class BenchmarkConfig:
            attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
            compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
            kernelize_str = "kernelized" if self.kernelize else "unkernelized"
-            continuous_batching_str = "cb" if self.continuous_batching else "generate"
            sep = "-"
        else:
            iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
@ -128,11 +89,8 @@ class BenchmarkConfig:
            attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
            compile_str = "compiled" if self.compile_mode is not None else "not compiled"
            kernelize_str = "kernelized" if self.kernelize else "not kernelized"
-            continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
            sep = ", "
-        return sep.join(
-            [iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
-        )
+        return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])

    def to_dict(self) -> dict[str, Any]:
        return {
@ -140,7 +98,6 @@ class BenchmarkConfig:
            "warmup_iterations": self.warmup_iterations,
            "measurement_iterations": self.measurement_iterations,
            "gpu_monitoring": self.gpu_monitoring,
-            "continuous_batching": self.continuous_batching,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_tokens_to_generate": self.num_tokens_to_generate,
@ -157,7 +114,6 @@ class BenchmarkConfig:
            warmup_iterations=data.get("warmup_iterations", 5),
            measurement_iterations=data.get("measurement_iterations", 20),
            gpu_monitoring=data.get("gpu_monitoring", False),
-            continuous_batching=data.get("continuous_batching", False),
            batch_size=data.get("batch_size", 1),
            sequence_length=data.get("sequence_length", 128),
            num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
@ -171,72 +127,88 @@ class BenchmarkConfig:
        )


-def adapt_configs(
-    configs: list[BenchmarkConfig],
-    warmup_iterations: int | list[int] = 5,
-    measurement_iterations: int | list[int] = 20,
-    batch_size: int | list[int] = 1,
-    sequence_length: int | list[int] = 128,
-    num_tokens_to_generate: int | list[int] = 128,
-    gpu_monitoring: bool | list[bool] = True,
+def cross_generate_configs(
+    attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
+    compiled_mode: list[str | None],
+    kernelized: list[bool],
+    warmup_iterations: int = 5,
+    measurement_iterations: int = 20,
+    batch_size: int = 1,
+    sequence_length: int = 128,
+    num_tokens_to_generate: int = 128,
+    gpu_monitoring: bool = True,
 ) -> list[BenchmarkConfig]:
-    parameters = (
-        x if isinstance(x, list) else [x]
-        for x in [
-            warmup_iterations,
-            measurement_iterations,
-            batch_size,
-            sequence_length,
-            num_tokens_to_generate,
-            gpu_monitoring,
-        ]
-    )
-    iterator = itertools.product(*parameters)
-
-    adapted_configs = []
-    for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
-        for config in configs:
-            config = config.to_dict()
-            config["warmup_iterations"] = warmup_iters
-            config["measurement_iterations"] = measurement_iters
-            config["batch_size"] = bs
-            config["sequence_length"] = seqlen
-            config["num_tokens_to_generate"] = ntok
-            config["gpu_monitoring"] = monitor
-            adapted_configs.append(BenchmarkConfig.from_dict(config))
-    return adapted_configs
-
-
-def get_config_by_level(level: int) -> list[BenchmarkConfig]:
+    # Create kwargs common to all configs
+    kwargs = {
+        "warmup_iterations": warmup_iterations,
+        "measurement_iterations": measurement_iterations,
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_tokens_to_generate": num_tokens_to_generate,
+        "gpu_monitoring": gpu_monitoring,
+    }
+    # Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
    configs = []
-    # Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
-    if level >= 3:
-        for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
-            # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
-            compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
-            for cm in compile_modes:
-                for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
-                    for cb_on in [False, True]:
-                        configs.append(
-                            BenchmarkConfig(
-                                attn_implementation=attn_implementation,
-                                sdpa_backend=sdpa_backend,
-                                compile_mode=cm,
-                                kernelize=kernelize_on,
-                                continuous_batching=cb_on,
-                            )
-                        )
-        return configs
-    # Otherwise, we add the configs for the given level
-    if level >= 0:
-        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
-    if level >= 1:
-        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
-        configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
-        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", continuous_batching=True))
-    if level >= 2:
-        configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
-        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
-        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
-        configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
+    for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
+        for cm in list(dict.fromkeys(compiled_mode)):
+            for kernelize_on in list(dict.fromkeys(kernelized)):
+                config = BenchmarkConfig(
+                    attn_implementation=attn_implementation,
+                    sdpa_backend=sdpa_backend,
+                    compile_mode=cm,
+                    kernelize=kernelize_on,
+                    **kwargs,
+                )
+                configs.append(config)
    return configs
+
+
+def generate_all_configs(
+    warmup_iterations: int = 5,
+    measurement_iterations: int = 20,
+    batch_size: int = 1,
+    sequence_length: int = 128,
+    num_tokens_to_generate: int = 128,
+    gpu_monitoring: bool = True,
+) -> list[BenchmarkConfig]:
+    all_attn_implementations = [
+        ("flash_attention_2", None),
+        ("eager", None),
+        ("sdpa", "math"),
+        ("sdpa", "flash_attention"),
+        ("flex_attention", None),
+    ]
+    return cross_generate_configs(
+        attn_impl_and_sdpa_backend=all_attn_implementations,
+        compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
+        kernelized=[False, KERNELIZATION_AVAILABLE],
+        warmup_iterations=warmup_iterations,
+        measurement_iterations=measurement_iterations,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        num_tokens_to_generate=num_tokens_to_generate,
+        gpu_monitoring=gpu_monitoring,
+    )
+
+
+def generate_main_configs(
+    warmup_iterations: int = 5,
+    measurement_iterations: int = 20,
+    batch_size: int = 1,
+    sequence_length: int = 128,
+    num_tokens_to_generate: int = 128,
+) -> list[BenchmarkConfig]:
+    # Create kwargs common to all configs
+    kwargs = {
+        "warmup_iterations": warmup_iterations,
+        "measurement_iterations": measurement_iterations,
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_tokens_to_generate": num_tokens_to_generate,
+    }
+    return [  # TODO: test max-autotune instead of default
+        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
+        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
+        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
+        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
+    ]
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@ -234,9 +234,8 @@ class BenchmarkRunner:
            self.logger.info(f"Running benchmark scenario: {config.name}")

            # Quick validation: try one measurement first to see if this scenario works
-            generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
            flush_memory()
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                max_new_tokens=1, gpu_monitor=None
            )
            if e2e_latency < 0:
@ -246,14 +245,14 @@ class BenchmarkRunner:
            # Warmup runs
            self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
            for _ in trange(config.warmup_iterations):
-                _ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
+                _ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
            self.logger.info("Warmup over.")

            # Measurement runs
            result = BenchmarkResult()
            self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
            for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
+                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
                    max_new_tokens=config.num_tokens_to_generate,
                    gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                )
@ -275,58 +274,6 @@ class BenchmarkRunner:
                "config": config,
            }

-    # TODO: refactor `generate_batch` to handle streaming so we can use it here
-    def time_generate_batch(
-        self,
-        max_new_tokens: int,
-        gpu_monitor: GPUMonitor | None = None,
-    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
-        if gpu_monitor is not None:
-            gpu_monitor.start()
-        config = GenerationConfig(
-            max_new_tokens=max_new_tokens,
-            eos_token_id=self.tokenizer.eos_token_id,
-            pad_token_id=self.tokenizer.pad_token_id,
-            do_sample=True,
-        )
-        manager = self.model.init_continuous_batching(config)
-        manager.start()
-        try:
-            first_req_results = []
-            timestamps = []
-            wall_time_0 = time.perf_counter()
-            inputs = self.inputs["input_ids"].tolist()
-            manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
-            first_req_id = None
-            num_requests = len(inputs)
-            finished_requests = 0
-            while finished_requests < num_requests:
-                # NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
-                result = manager.get_result()
-                if result:
-                    timestamps.append(time.perf_counter() - wall_time_0)
-                    if result.is_finished():
-                        finished_requests += 1
-                    if first_req_id is None:
-                        first_req_id = result.request_id
-                    if result.request_id == first_req_id:
-                        first_req_results.append(result)
-                else:
-                    if not manager.is_running():
-                        raise RuntimeError("Generation thread exited unexpectedly")
-            wall_time_1 = time.perf_counter()
-            gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
-            decoded_output = self.tokenizer.decode(
-                [res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
-            )
-            shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
-            e2e_latency = wall_time_1 - wall_time_0
-            return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
-        except Exception as e:
-            raise e
-        finally:
-            manager.stop()
-
    def time_generate(
        self,
        max_new_tokens: int,
@ -392,6 +339,12 @@ class BenchmarkRunner:

        n_configs = len(benchmark_configs)
        for i, config in enumerate(benchmark_configs):
+            # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
+            if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
+                default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
+                self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
+                config.sdpa_backend = default_backend
+
            # Skip if already run
            if config.hash in all_results:
                self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
@ -415,27 +368,21 @@ class BenchmarkRunner:
            self.cleanup()
            self.save_results(model_id, all_results, timestamp=timestamp)

-        if len(all_results) < 1:
-            raise RuntimeError("No benchmark was run succesfully")
-
        if pretty_print_summary:
            print()
            print("=" * 100)
            print(f"Finished benchmarks in {time.perf_counter() - start_time:.2f} seconds")
            print(f"Total number of benchmarks: {len(all_results)}")
-            print("First run metadata:")
-            first_key = list(all_results.keys())[0]
-            first_metadata = all_results[first_key]["metadata"].to_dict()
-            hardware_info = first_metadata.pop("hardware_info")
-            pretty_print_dict(first_metadata | hardware_info, tabs=1)
+            if len(all_results) > 0:
+                print("First run metadata:")
+                first_key = list(all_results.keys())[0]
+                first_metadata = all_results[first_key]["metadata"].to_dict()
+                hardware_info = first_metadata.pop("hardware_info")
+                pretty_print_dict(first_metadata | hardware_info, tabs=1)
            for result in all_results.values():
                print("=" * 100)
                print(f"Config: {result['config'].infer_name(compact=False)}\n")
-                result["measurements"].pprint(
-                    batch_size=result["config"].batch_size,
-                    num_generated_tokens=result["config"].num_tokens_to_generate,
-                    tabs=1,
-                )
+                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
            print("=" * 100)

        return (timestamp, all_results)
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@ -36,17 +36,16 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
    return stats


-def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
-    """Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
+def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
    keys = ["avg", "std", "min", "med", "max", "p95"]
    for key in keys:
-        max_length = max(len(stat[key]) for stat in stats.values())
-        for stat in stats.values():
+        max_length = max(len(stat[key]) for stat in stats)
+        for stat in stats:
            stat[key] = stat[key].ljust(max_length, " ")
-    return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}
+    return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]


-def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
+def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
    max_key_length = max([len(key) for key in data.keys()])
    for key, value in data.items():
        tabs_str = "  " * tabs
@ -142,19 +141,27 @@ class BenchmarkResult:
    def get_measured_itl(self) -> list[float]:
        return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def get_throughput(self, total_generated_tokens: int) -> list[float]:
-        return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]
+    def get_throughput(self, batch_size: int) -> float:
+        return [
+            batch_size * len(dt) / e2e_latency
+            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
+        ]

-    def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
-        measurements = {
-            "E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-            "Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-        }
-        itl_values = self.get_measured_itl()
-        if len(itl_values) > 0:
-            measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
+    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
+        stats_to_collate = [
+            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+        ]
        if batch_size > 0:
-            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
-            measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
-        dict_to_pprint = equalize_lengths_and_collate(measurements)
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
+            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
+        collated_stats = equalize_lengths_and_collate(stats_to_collate)
+        dict_to_pprint = {
+            "E2E Latency": collated_stats[0],
+            "Time to First Token": collated_stats[1],
+            "Inter-Token Latency": collated_stats[2],
+        }
+        if batch_size > 0:
+            dict_to_pprint["Throughput"] = collated_stats[3]
        pretty_print_dict(dict_to_pprint, tabs=tabs)
--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@ -2,5 +2,6 @@ numpy>=1.21.0
 psutil>=5.8.0
 gpustat>=1.0.0
 torch>=2.0.0
+transformers>=4.30.0
 datasets>=2.10.0
 huggingface_hub>=0.16.0
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@ -23,7 +23,7 @@ import logging
 import sys
 import uuid

-from framework.benchmark_config import adapt_configs, get_config_by_level
+from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
 from framework.benchmark_runner import BenchmarkRunner


@ -40,14 +40,7 @@ if __name__ == "__main__":
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

-    parser.add_argument(
-        "--level",
-        type=int,
-        default=1,
-        help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
-        " each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
-        " combinations of configs w/ all compile modes",
-    )
+    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--branch-name", type=str, help="Git branch name")
@ -80,34 +73,70 @@ if __name__ == "__main__":
    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
    logger.info(f"Output directory: {args.output_dir}")

-    # We cannot compute ITL if we don't have at least two measurements
-    if any(n <= 1 for n in args.num_tokens_to_generate):
-        raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
-
    # Error out if one of the arguments is not provided
    if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
        raise ValueError(
            "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
        )

-    # Get the configs for the given coverage level
-    configs = get_config_by_level(args.level)
-    # Adapt the configs to the given arguments
-    configs = adapt_configs(
-        configs,
-        args.warmup,
-        args.iterations,
-        args.batch_size,
-        args.sequence_length,
-        args.num_tokens_to_generate,
-        not args.no_gpu_monitoring,
-    )
+    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
+    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
+        if args.cross_generate:
+            benchmark_configs = generate_all_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+                gpu_monitoring=not args.no_gpu_monitoring,
+            )
+        else:
+            benchmark_configs = generate_main_configs(
+                warmup_iterations=args.warmup,
+                measurement_iterations=args.iterations,
+                batch_size=args.batch_size[0],
+                sequence_length=args.sequence_length[0],
+                num_tokens_to_generate=args.num_tokens_to_generate[0],
+            )

-    runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
+    # Otherwise, we benchmark across all combinations of dimensions
+    else:
+        main_config = generate_main_configs(
+            warmup_iterations=args.warmup,
+            measurement_iterations=args.iterations,
+            batch_size=args.batch_size[0],
+            sequence_length=args.sequence_length[0],
+            num_tokens_to_generate=args.num_tokens_to_generate[0],
+        )[0]
+        benchmark_configs = []
+        for num_tokens_to_generate in args.num_tokens_to_generate:
+            for sequence_length in args.sequence_length:
+                for batch_size in args.batch_size:
+                    cfg_dict = main_config.to_dict()
+                    cfg_dict["batch_size"] = batch_size
+                    cfg_dict["sequence_length"] = sequence_length
+                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
+                    cfg_dict.pop("name")
+                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
+
+    runner = BenchmarkRunner(
+        logger,
+        args.output_dir,
+        args.branch_name,
+        args.commit_id,
+        args.commit_message,
+    )
    timestamp, results = runner.run_benchmarks(
-        args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
+        args.model_id,
+        benchmark_configs,
+        args.num_tokens_to_profile,
+        pretty_print_summary=True,
    )

    dataset_id = args.push_result_to_dataset
    if dataset_id is not None and len(results) > 0:
-        runner.push_results_to_hub(dataset_id, results, timestamp)
+        runner.push_results_to_hub(
+            dataset_id,
+            results,
+            timestamp,
+        )
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -67,7 +67,7 @@ RUN set -e; \

 RUN python3 -m pip install --no-cache-dir -U timm

-RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
+RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"

 RUN python3 -m pip install --no-cache-dir pytesseract

--- a/docker/transformers-doc-builder/Dockerfile
+++ b/docker/transformers-doc-builder/Dockerfile
@ -10,7 +10,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
 # Torch needs to be installed before deepspeed
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]

-RUN python3 -m pip install --no-cache-dir --no-build-isolation torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
+RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

 # Test if the image could successfully build the doc. before publishing the image
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -39,7 +39,7 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
 # Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
 RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
    cd flash-attention && \
-    GPU_ARCHS="gfx942" python setup.py install  
-# GPU_ARCHS builds for MI300, MI325 but not MI355: we would need to add `;gfx950` but it takes too long to build.
+    GPU_ARCHS="gfx942;gfx950" python setup.py install  
+# GPU_ARCHS builds for MI300, MI325 and MI355

 RUN python3 -m pip install --no-cache-dir einops
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
 RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir

 # Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1

 ARG REF=main
 WORKDIR /
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -24,7 +24,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch';
 RUN echo torch=$VERSION
 # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
-RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir hqq
 RUN python3 -m pip install --no-cache-dir gguf

 # Add autoawq for quantization testing
-RUN python3 -m pip install --no-cache-dir --no-build-isolation autoawq[kernels]
+RUN python3 -m pip install --no-cache-dir autoawq[kernels]

 # Add quanto for quantization testing
 RUN python3 -m pip install --no-cache-dir optimum-quanto
@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
 RUN cd transformers && python3 setup.py develop

 # Add fp-quant for quantization testing
-RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"
+RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"

 # Low usage or incompatible lib, will enable later on

--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -508,16 +508,16 @@ BERT `_init_weights` Methode:
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel in
@ -533,9 +533,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Wenn Sie es auf
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -119,8 +119,6 @@
    title: Tools
  - local: transformers_as_backend
    title: Inference server backends
-  - local: continuous_batching
-    title: Continuous Batching
  title: Inference
 - isExpanded: false
  sections:
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -314,16 +314,16 @@ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreT
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
@ -339,9 +339,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 ### Convert checkpoints to Transformers
--- a/docs/source/en/continuous_batching.md
+++ b/docs/source/en/continuous_batching.md
@ -1,194 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Continuous Batching
-
-Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, specifically for workloads with many variable-length inputs.
-
-We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies
-
-CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
-
-If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
-
-## API Reference
-
-## Usage Examples
-
-The main way to use CB in transformers is via the `generate_batch` method.
-
-Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in python: `list[int]`. Since 
-
-For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
-
-### `generate_batch` example
-
-We have created a `ContinuousMixin` that is inherited by the `GenerationMixin` so that all auto regressive text models support CB.
-
-This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
-
-You can use it as follows:
-
-```py
-import datasets
-import torch
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.generation import GenerationConfig
-
-model = AutoModelForCausalLM.from_pretrained(
-    "Qwen/Qwen3-4B-Instruct-2507",
-    attn_implementation="spda_paged",
-    device_map="cuda",  # if you need cuda
-    dtype=torch.bfloat16,
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
-
-# prepare a batch of inputs
-dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
-dataset = dataset.select(range(args.samples))
-tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
-simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
-
-generation_config = GenerationConfig(
-    max_new_tokens=32,
-    use_cuda_graph=False,  # Not supported for simple version
-    eos_token_id=tokenizer.eos_token_id,
-    pad_token_id=tokenizer.pad_token_id,
-    do_sample=False,
-    max_batch_tokens=512,  # max number of tokens in a batch, this is just a default value you should tune based on your hardware
-)
-
-batch_outputs = model.generate_batch(
-    inputs=simple_batch_inputs,
-    generation_config=generation_config,
-)
-
-for request_id, output in batch_outputs.items():
-    generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
-    print(f"Request {request_id} output: {generated_text}")
-```
-
-### `ContinuousBatchingManager` example
-
-If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
-
-This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
-
-Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
-
-Note that the manager is thread safe!
-
-```py
-import datasets
-import torch
-
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from transformers.generation import GenerationConfig
-from transformers.generation.continuous_batching import RequestStatus
-
-model = AutoModelForCausalLM.from_pretrained(
-    "Qwen/Qwen3-4B-Instruct-2507",
-    attn_implementation="spda_paged",
-    device_map="cuda",  # if you need cuda
-    dtype=torch.bfloat16,
-)
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
-
-# prepare a batch of inputs
-dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
-dataset = dataset.select(range(args.samples))
-tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
-simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
-
-# initialize the manager, available method thanks to the `ContinuousMixin`
-manager = model.init_continuous_batching(generation_config=generation_config)
-
-# start the background thread
-manager.start()
-
-# this is for demonstration purposes only, in practice this is most useful to do concurrently
-for i, input in enumerate(simple_batch_inputs):
-    request_id = manager.add_request(input_ids=input, request_id=f"request_{i}")  # if you do not specify a request_id, one will be generated for you
-
-# Can be done in an other thread
-for id, request in manager.get_result():
-    generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
-    print(f"Request {id} output: {generated_text}")
-
-# you can also get results for a specific request id
-result = manager.get_result(request_id="request_5")  # this is blocking and will wait for the result to be ready
-
-# or get results for a request that is streaming
-manager.add_request(
-    input_ids=input,
-    request_id="streaming_request",
-    stream=True,
-)
-for chunk in manager.request_id_iter(request_id="streaming_request"):
-    generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
-    print(generated_text)
-    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
-    if chunk.status == RequestStatus.FINISHED:
-        break
-
-# stop the background thread before exiting the process
-manager.stop()
-```
-
-## Supported & Unsupported Features
-
-### Supported Features
-
- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged Attention Cache
- Sliding window attention
- Chat templates
-
-### Unsupported Features
-
-At the moment, the following features are not supported with CB. We plan to add support to the following:
-
- Prefix caching
- Beam search
- tool calling
-
-The others are unplanned, but depending on community requests we might consider adding them:
-
- MTP (multi token prediction)
- Medusa
-
-## Performance Considerations
-
-
-## Integration with Serving
-
-You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
-
-## Monitoring
-
-We have added `opentelemetry` support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `opentelemetry` extra when installing `transformers`:
-
-```sh
-# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
-pip install transformers[open-telemetry]
-```
-
-This will enable traces and metrics collection in CB. You will then have to setup the backend to collect and visualize the traces and metrics.
-
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -393,9 +393,3 @@ model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", quantization_config=quant_config, device_map="auto"
 )
 ```
-
-## Continuous Batching
-
-When serving LLMs for inference, you may have multiple requests arriving at different times. Continuous Batching (CB) is a technique that groups incoming requests into batches to maximize GPU utilization and throughput.
-
-See the [Continuous Batching](./continuous_batching) guide for more details on how to use CB in transformers.
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@ -158,24 +158,6 @@ print("Retrieval scores (query x image):")
 print(scores)
 ```

-You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.
-
-```python
-import torch
-from transformers import ColQwen2ForRetrieval, ColQwen2Processor
-from transformers.utils.import_utils import is_flash_attn_2_available
-
-model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf" # An existing compatible checkpoint
-
-model = ColQwen2ForRetrieval.from_pretrained(
-    model_name,
-    dtype=torch.bfloat16,
-    device_map="auto",
-    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
-)
-processor = ColQwen2Processor.from_pretrained(model_name)
-```
-
 ## Notes

 - [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
--- a/docs/source/en/model_doc/dinov3.md
+++ b/docs/source/en/model_doc/dinov3.md
@ -169,9 +169,6 @@ print("Pooled output shape:", pooled_output.shape)
 [[autodoc]] DINOv3ViTModel
    - forward

-## DINOv3ViTBackbone    
-[[autodoc]] DINOv3ViTBackbone
-
 ## DINOv3ConvNextModel

 [[autodoc]] DINOv3ConvNextModel
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
 from PIL import Image
 from transformers import AutoTokenizer
 from transformers.models.fuyu.processing_fuyu import FuyuProcessor
-from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast
+from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor


 tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
-image_processor = FuyuImageProcessorFast()
+image_processor = FuyuImageProcessor()


 processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
@ -118,11 +118,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
 [[autodoc]] FuyuImageProcessor
    - __call__

-## FuyuImageProcessor
-
-[[autodoc]] FuyuImageProcessorFast
-    - __call__
-
 ## FuyuProcessor

 [[autodoc]] FuyuProcessor
--- a/docs/source/en/model_doc/glpn.md
+++ b/docs/source/en/model_doc/glpn.md
@ -61,11 +61,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] GLPNImageProcessor
    - preprocess

-## GLPNImageProcessorFast
-
-[[autodoc]] GLPNImageProcessorFast
-    - preprocess
-
 ## GLPNModel

 [[autodoc]] GLPNModel
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -159,7 +159,7 @@ conversation3 = [

 conversations = [conversation1, conversation2, conversation3]
 inputs = processor.apply_chat_template(
-    conversations,
+    conversations,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@ -149,7 +149,7 @@ The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` m
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.
--- a/docs/source/en/quantization/fp_quant.md
+++ b/docs/source/en/quantization/fp_quant.md
@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"

 A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with  `pip install fp_quant`.

-Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
+Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.

 > [!TIP]
 > Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -187,7 +187,7 @@ from torch import nn
 from transformers import Trainer

 class CustomTrainer(Trainer):
-    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
+    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
--- a/docs/source/it/migration.md
+++ b/docs/source/it/migration.md
@ -170,7 +170,7 @@ Per quanto riguarda la classe `TrainingArguments`:
 - L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `eval_strategy`.

 Per quanto riguarda il modello Transfo-XL:
- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_word_embeddings`.
+- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_word_embeddings`.
 - Il metodo di modellazione `reset_length` di Transfo-XL diventa `reset_memory_length`.

 Per quanto riguarda le pipeline:
--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@ -406,16 +406,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 特定のモジュールに特別な初期化が必要な場合、カスタムスキームをさらに持つことができます。たとえば、
@ -431,9 +431,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 `_is_hf_initialized`フラグは、サブモジュールを一度だけ初期化することを確実にするために内部で使用されます。
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@ -348,16 +348,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.zero_()
-        module.weight.fill_(1.0)
+        module.bias.data.zero_()
+        module.weight.data.fill_(1.0)
 ```

 몇 가지 모듈에 대해 특별한 초기화가 필요한 경우 사용자 정의 방식을 사용할 수도 있습니다. 예를 들어, `Wav2Vec2ForPreTraining`에서 마지막 두 개의 선형 레이어는 일반적인 PyTorch `nn.Linear`의 초기화를 가져야 하지만, 다른 모든 레이어는 위와 같은 초기화를 사용해야 합니다. 이는 다음과 같이 코드화됩니다:
@ -371,9 +371,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.zero_()
+            module.bias.data.zero_()
 ```

 `_is_hf_initialized` 플래그는 서브모듈을 한 번만 초기화하도록 내부적으로 사용됩니다. `module.project_q` 및 `module.project_hid`에 대해 `True`로 설정함으로써, 우리가 수행한 사용자 정의 초기화가 이후에 덮어쓰이지 않도록 합니다. 즉, `_init_weights` 함수가 이들에게 적용되지 않습니다.
--- a/docs/source/ko/perf_infer_gpu_multi.md
+++ b/docs/source/ko/perf_infer_gpu_multi.md
@ -152,7 +152,7 @@ class ParallelInterface(MutableMapping):
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 배치 행렬 곱셈을 `forward` 패스에서 사용하여 `gate_up_proj` 모듈의 출력을 계산할 수 있습니다.
--- a/examples/legacy/README.md
+++ b/examples/legacy/README.md
@ -0,0 +1,21 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Legacy examples
+
+This folder contains examples which are not actively maintained (mostly contributed by the community).
+
+Using these examples together with a recent version of the library usually requires making small (sometimes big) adaptations to get the scripts working.
--- a/examples/legacy/benchmarking/README.md
+++ b/examples/legacy/benchmarking/README.md
@ -0,0 +1,26 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# 🤗 Benchmark results
+
+Here, you can find a list of the different benchmark results created by the community.
+
+If you would like to list benchmark results on your favorite models of the [model hub](https://huggingface.co/models) here, please open a Pull Request and add it below.
+
+| Benchmark description | Results | Environment info |      Author      |
+|:----------|:-------------|:-------------|------:|
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
+| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
--- a/examples/legacy/benchmarking/plot_csv_file.py
+++ b/examples/legacy/benchmarking/plot_csv_file.py
@ -0,0 +1,178 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import csv
+from collections import defaultdict
+from dataclasses import dataclass, field
+from typing import Optional
+
+import matplotlib.pyplot as plt
+import numpy as np
+from matplotlib.ticker import ScalarFormatter
+
+from transformers import HfArgumentParser
+
+
+def list_field(default=None, metadata=None):
+    """Build a dataclass field whose default factory hands back ``default``.
+
+    The value is supplied through ``default_factory`` rather than ``default``.
+    The very same object is returned on every call; it is not copied.
+    """
+
+    def _produce_default():
+        return default
+
+    return field(default_factory=_produce_default, metadata=metadata)
+
+
+@dataclass
+class PlotArguments:
+    """
+    Command-line options selecting the benchmark CSV file to plot and controlling how the figure is rendered.
+    """
+
+    # Path of the CSV file to read; it has no default, so it must be provided.
+    csv_file: str = field(
+        metadata={"help": "The csv file to plot."},
+    )
+    # When set, batch size is used for the x axis instead of sequence length.
+    plot_along_batch: bool = field(
+        default=False,
+        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
+    )
+    # Selects the time wording (instead of memory) for the title and y-axis label.
+    is_time: bool = field(
+        default=False,
+        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
+    )
+    # Log scaling is applied to both axes unless this flag is set.
+    no_log_scale: bool = field(
+        default=False,
+        metadata={"help": "Disable logarithmic scale when plotting"},
+    )
+    # Selects the training wording (instead of inference) for the title.
+    is_train: bool = field(
+        default=False,
+        metadata={
+            "help": "Whether the csv file has training results or inference results. Defaults to inference results."
+        },
+    )
+    # Output path for the figure; when left as None the plot is shown instead of saved.
+    figure_png_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
+    )
+    # Replacement legend/title names, indexed in the order models first appear in the CSV file.
+    short_model_names: Optional[list[str]] = list_field(
+        default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
+    )
+
+
+def can_convert_to_int(string):
+    """Tell whether ``int(string)`` succeeds without raising ``ValueError``."""
+    try:
+        int(string)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+def can_convert_to_float(string):
+    """Tell whether ``float(string)`` succeeds without raising ``ValueError``."""
+    try:
+        float(string)
+    except ValueError:
+        return False
+    else:
+        return True
+
+
+class Plot:
+    """Scatter/line plot of benchmark results read from a CSV file.
+
+    The CSV file must provide the columns ``model``, ``batch_size``,
+    ``sequence_length`` and ``result``. Rows whose ``result`` is neither an
+    integer nor a float are left out of the plotted data.
+    """
+
+    def __init__(self, args):
+        """Load ``args.csv_file`` into ``self.result_dict``, grouped by model name.
+
+        Args:
+            args: Object exposing the ``PlotArguments`` attributes read by this class.
+        """
+        self.args = args
+        self.result_dict = defaultdict(lambda: {"bsz": [], "seq_len": [], "result": {}})
+
+        with open(self.args.csv_file, newline="") as csv_file:
+            reader = csv.DictReader(csv_file)
+            for row in reader:
+                model_results = self.result_dict[row["model"]]
+                batch_size = int(row["batch_size"])
+                sequence_length = int(row["sequence_length"])
+                model_results["bsz"].append(batch_size)
+                model_results["seq_len"].append(sequence_length)
+
+                # Try int first so integer results are not widened to float.
+                if can_convert_to_int(row["result"]):
+                    model_results["result"][(batch_size, sequence_length)] = int(row["result"])
+                elif can_convert_to_float(row["result"]):
+                    model_results["result"][(batch_size, sequence_length)] = float(row["result"])
+
+    def plot(self):
+        """Draw one series per (model, inner-loop value), then save or show the figure.
+
+        The figure is written to ``args.figure_png_file`` when that is set and
+        displayed with ``plt.show()`` otherwise.
+        """
+        _, ax = plt.subplots()
+        title_str = "Time usage" if self.args.is_time else "Memory usage"
+        title_str += " for training" if self.args.is_train else " for inference"
+
+        if not self.args.no_log_scale:
+            # set logarithm scales
+            ax.set_xscale("log")
+            ax.set_yscale("log")
+
+        for axis in [ax.xaxis, ax.yaxis]:
+            axis.set_major_formatter(ScalarFormatter())
+
+        # The labels depend only on the CLI flags, so resolve them before the loops;
+        # this also keeps them defined when the CSV file holds no rows.
+        if self.args.plot_along_batch:
+            x_axis_label, inner_loop_label = "batch_size", "len"
+        else:
+            x_axis_label, inner_loop_label = "in #tokens", "bsz"
+
+        plotted_model_names = []
+        for model_name_idx, model_name in enumerate(self.result_dict.keys()):
+            batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
+            sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
+            results = self.result_dict[model_name]["result"]
+
+            if self.args.plot_along_batch:
+                x_axis_values, inner_loop_values = batch_sizes, sequence_lengths
+            else:
+                x_axis_values, inner_loop_values = sequence_lengths, batch_sizes
+
+            label_model_name = (
+                model_name if self.args.short_model_names is None else self.args.short_model_names[model_name_idx]
+            )
+
+            for inner_loop_value in inner_loop_values:
+                # Keep every x paired with its own y: a missing measurement drops
+                # that single point instead of shifting the points after it, and
+                # the x values of one series never leak into the next one.
+                points = []
+                for x in x_axis_values:
+                    # Keys of `results` are (batch_size, sequence_length).
+                    key = (x, inner_loop_value) if self.args.plot_along_batch else (inner_loop_value, x)
+                    if key in results:
+                        points.append((x, results[key]))
+
+                series_x = np.asarray([x for x, _ in points], dtype=int)
+                # Float dtype in both modes: an int dtype would truncate fractional
+                # (time) results.
+                series_y = np.asarray([y for _, y in points], dtype=np.float32)
+
+                plt.scatter(series_x, series_y, label=f"{label_model_name} - {inner_loop_label}: {inner_loop_value}")
+                plt.plot(series_x, series_y, "--")
+
+            plotted_model_names.append(str(label_model_name))
+
+        # Joining avoids slicing a trailing " vs." off the title, which would eat
+        # real title characters when no model was plotted.
+        if plotted_model_names:
+            title_str += " " + " vs. ".join(plotted_model_names)
+        y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
+
+        # plot
+        plt.title(title_str)
+        plt.xlabel(x_axis_label)
+        plt.ylabel(y_axis_label)
+        plt.legend()
+
+        if self.args.figure_png_file is not None:
+            plt.savefig(self.args.figure_png_file)
+        else:
+            plt.show()
+
+
+def main():
+    """Parse the command line into ``PlotArguments`` and render the plot."""
+    cli_parser = HfArgumentParser(PlotArguments)
+    parsed_dataclasses = cli_parser.parse_args_into_dataclasses()
+    Plot(args=parsed_dataclasses[0]).plot()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/benchmarking/requirements.txt
+++ b/examples/legacy/benchmarking/requirements.txt
@ -0,0 +1 @@
+torch >= 1.3
--- a/examples/legacy/benchmarking/run_benchmark.py
+++ b/examples/legacy/benchmarking/run_benchmark.py
@ -0,0 +1,47 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Benchmarking the library on inference and training"""
+
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+
+
+def main():
+    """Parse the benchmark arguments and run the PyTorch benchmark.
+
+    Raises:
+        ValueError: If argument parsing fails. When the parser's message carries
+            a list literal of argument strings (from its first ``[`` to the end),
+            the error is rewritten to point every deprecated ``--no_<name>`` flag
+            to ``--no-<name>`` and to repeat the remaining unknown arguments.
+            Any other ``ValueError`` is re-raised unchanged.
+    """
+    # Local import: only the error path below needs it.
+    import ast
+
+    parser = HfArgumentParser(PyTorchBenchmarkArguments)
+    try:
+        benchmark_args = parser.parse_args_into_dataclasses()[0]
+    except ValueError as e:
+        message = str(e)
+        # NOTE(review): assumes the parser's message ends with the repr of a list of
+        # argument strings -- confirm against HfArgumentParser. Taking everything from
+        # the first "[" (rather than the last space-separated token) keeps lists with
+        # several elements intact, since their repr contains spaces.
+        list_start = message.find("[")
+        unused_args = None
+        if list_start != -1:
+            try:
+                # literal_eval only accepts Python literals, unlike eval, so text
+                # from the exception message can never be executed as code.
+                unused_args = ast.literal_eval(message[list_start:])
+            except (ValueError, SyntaxError):
+                unused_args = None
+        if not isinstance(unused_args, list):
+            # Not the expected "unused arguments" error: surface it untouched.
+            raise
+
+        arg_error_msg = "Arg --no_{0} is no longer used, please use --no-{0} instead."
+        begin_error_msg = message[:list_start].rstrip()
+        full_error_msg = ""
+        wrong_args = []
+        for arg in unused_args:
+            # arg[2:] removes '--'
+            if arg[2:] in PyTorchBenchmarkArguments.deprecated_args:
+                # arg[5:] removes '--no_'
+                full_error_msg += arg_error_msg.format(arg[5:])
+            else:
+                wrong_args.append(arg)
+        if len(wrong_args) > 0:
+            full_error_msg = full_error_msg + begin_error_msg + str(wrong_args)
+        raise ValueError(full_error_msg)
+
+    benchmark = PyTorchBenchmark(args=benchmark_args)
+    benchmark.run()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/examples/legacy/multiple_choice/run_multiple_choice.py
@ -0,0 +1,232 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+from utils_multiple_choice import MultipleChoiceDataset, Split, processors
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForMultipleChoice,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    EvalPrediction,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+from transformers.trainer_utils import is_main_process
+
+
+logger = logging.getLogger(__name__)
+
+
+def simple_accuracy(preds, labels):
+    """Return the mean of the elementwise comparison `preds == labels`."""
+    is_correct = preds == labels
+    return is_correct.mean()
+
+
+@dataclass
+class ModelArguments:
+    """
+    Command-line options selecting the pretrained model, config and tokenizer to fine-tune.
+    """
+
+    # The only field without a default, so it must always be supplied.
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"},
+    )
+    config_name: Optional[str] = field(
+        metadata={"help": "Pretrained config name or path if not the same as model_name"},
+        default=None,
+    )
+    tokenizer_name: Optional[str] = field(
+        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
+        default=None,
+    )
+    cache_dir: Optional[str] = field(
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+        default=None,
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Command-line options describing the task and the data fed to the model for training and eval.
+    """
+
+    task_name: str = field(
+        metadata={"help": "The name of the task to train on: " + ", ".join(processors)},
+    )
+    data_dir: str = field(
+        metadata={"help": "Should contain the data files for the task."},
+    )
+    max_seq_length: int = field(
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+        default=128,
+    )
+    overwrite_cache: bool = field(
+        metadata={"help": "Overwrite the cached training and evaluation sets"},
+        default=False,
+    )
+
+
+def main():
+    """Fine-tune and/or evaluate a multiple-choice model according to command-line arguments.
+
+    Parses `ModelArguments`, `DataTrainingArguments` and `TrainingArguments`, builds the
+    config/tokenizer/model and the train/dev datasets, then trains when
+    `training_args.do_train` is set and evaluates when `training_args.do_eval` is set.
+
+    Returns:
+        dict: the metrics returned by `trainer.evaluate()`. It stays empty when evaluation is
+        not requested or when `trainer.is_world_master()` is false for this process.
+
+    Raises:
+        ValueError: if a `KeyError` is raised while looking up the task's processor or its labels.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Setup logging
+    # INFO when the local process index is -1 or 0, WARN for every other process.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_process_index,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.parallel_mode.value == "distributed"),
+        training_args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_process_index):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # The number of labels comes from the task's processor and is passed to the model config below.
+    try:
+        processor = processors[data_args.task_name]()
+        label_list = processor.get_labels()
+        num_labels = len(label_list)
+    except KeyError:
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    # config_name / tokenizer_name fall back to model_name_or_path when they are not given.
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    # `from_tf` is true whenever the model path contains ".ckpt".
+    model = AutoModelForMultipleChoice.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets
+    # Each dataset is only built when the matching do_train / do_eval flag is set; otherwise None.
+    train_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.train,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        MultipleChoiceDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            mode=Split.dev,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    def compute_metrics(p: EvalPrediction) -> dict:
+        # The predicted choice is the argmax over axis 1 of the predictions.
+        preds = np.argmax(p.predictions, axis=1)
+        return {"acc": simple_accuracy(preds, p.label_ids)}
+
+    # Data collator
+    # A padding collator (multiple of 8) is used only under fp16; otherwise None is passed to Trainer.
+    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        compute_metrics=compute_metrics,
+        data_collator=data_collator,
+    )
+
+    # Training
+    if training_args.do_train:
+        # The model path is forwarded only when it is a local directory.
+        # NOTE(review): `model_path` and `is_world_master` look like older Trainer API names;
+        # confirm they exist in the installed transformers version.
+        trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        result = trainer.evaluate()
+
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
+        # Metrics are logged, written to eval_results.txt and copied into `results`
+        # only when is_world_master() is true.
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key, value in result.items():
+                    logger.info("  %s = %s", key, value)
+                    writer.write("{} = {}\n".format(key, value))
+
+                results.update(result)
+
+    return results
+
+
+def _mp_fn(index):
+    """Entry point taking one positional argument; `index` is not used, `main()` is simply run."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+# Run the script when executed directly.
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@ -0,0 +1,483 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension"""
+
+import csv
+import glob
+import json
+import logging
+import os
+from dataclasses import dataclass
+from enum import Enum
+from typing import Optional
+
+import tqdm
+from filelock import FileLock
+
+from transformers import PreTrainedTokenizer, is_torch_available
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class InputExample:
+    """
+    One multiple choice example, before tokenization. Instances are immutable.
+
+    Args:
+        example_id: Unique id for the example.
+        question: str. Untokenized question text (used as the second sequence).
+        contexts: list of str. Untokenized context text (used as the first sequence),
+            one entry per choice.
+        endings: list of str. The answer options; must have the same length as `contexts`.
+        label: (Optional) str. The label of the example. Should be given for train and dev
+            examples, but not for test examples.
+    """
+
+    example_id: str
+    question: str
+    contexts: list[str]
+    endings: list[str]
+    label: Optional[str]
+
+
+@dataclass(frozen=True)
+class InputFeatures:
+    """
+    The encoded form of one example. Instances are immutable.
+    Field names are the same as the names of the corresponding model inputs.
+    """
+
+    example_id: str
+    # The nested fields below are lists of integer lists; the two masks/ids may be absent (None).
+    input_ids: list[list[int]]
+    attention_mask: Optional[list[list[int]]]
+    token_type_ids: Optional[list[list[int]]]
+    label: Optional[int]
+
+
+class Split(Enum):
+    """The data split to load; each member's value is its lowercase name."""
+
+    train = "train"
+    dev = "dev"
+    test = "test"
+
+
+# The dataset class is only defined when torch is available.
+if is_torch_available():
+    import torch
+    from torch.utils.data import Dataset
+
+    class MultipleChoiceDataset(Dataset):
+        """Dataset of `InputFeatures` for one task and split, cached on disk inside `data_dir`.
+
+        Args:
+            data_dir: directory holding the task's data files; the cache file is written here too.
+            tokenizer: tokenizer used to build the features; its class name is part of the cache name.
+            task: key into `processors` selecting the data processor.
+            max_seq_length: forwarded to `convert_examples_to_features`; also part of the cache name.
+            overwrite_cache: when true, features are rebuilt even if a cache file exists.
+            mode: which split to load (train, dev or test).
+        """
+
+        features: list[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            mode: Split = Split.train,
+        ):
+            processor = processors[task]()
+
+            # Cache name: cached_<split>_<tokenizer class>_<max_seq_length>_<task>
+            cached_features_file = os.path.join(
+                data_dir,
+                "cached_{}_{}_{}_{}".format(
+                    mode.value,
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
+            )
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    # NOTE(review): `weights_only=True` restricts what `torch.load` may unpickle;
+                    # confirm that a list of `InputFeatures` dataclasses round-trips through this call.
+                    self.features = torch.load(cached_features_file, weights_only=True)
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+                    label_list = processor.get_labels()
+                    # Any mode other than dev or test falls through to the train split.
+                    if mode == Split.dev:
+                        examples = processor.get_dev_examples(data_dir)
+                    elif mode == Split.test:
+                        examples = processor.get_test_examples(data_dir)
+                    else:
+                        examples = processor.get_train_examples(data_dir)
+                    # This message is emitted for every split, not only for training.
+                    logger.info("Training examples: %s", len(examples))
+                    self.features = convert_examples_to_features(
+                        examples,
+                        label_list,
+                        max_seq_length,
+                        tokenizer,
+                    )
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            """Return the number of features held by the dataset."""
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            """Return the features stored at index `i`."""
+            return self.features[i]
+
+
+class DataProcessor:
+    """Interface for multiple choice data set converters; every method must be overridden."""
+
+    def get_train_examples(self, data_dir):
+        """Return the collection of `InputExample`s for the train set."""
+        raise NotImplementedError
+
+    def get_dev_examples(self, data_dir):
+        """Return the collection of `InputExample`s for the dev set."""
+        raise NotImplementedError
+
+    def get_test_examples(self, data_dir):
+        """Return the collection of `InputExample`s for the test set."""
+        raise NotImplementedError
+
+    def get_labels(self):
+        """Return the list of labels for this data set."""
+        raise NotImplementedError
+
+
+class RaceProcessor(DataProcessor):
+    """Processor for the RACE data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._examples_for_split(data_dir, "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._examples_for_split(data_dir, "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} test")
+        return self._examples_for_split(data_dir, "test")
+
+    def get_labels(self):
+        """See base class."""
+        return [str(choice) for choice in range(4)]
+
+    def _examples_for_split(self, data_dir, split):
+        """Read `<split>/high` and then `<split>/middle` under `data_dir` and build examples from both."""
+        high_records = self._read_txt(os.path.join(data_dir, f"{split}/high"))
+        middle_records = self._read_txt(os.path.join(data_dir, f"{split}/middle"))
+        return self._create_examples(high_records + middle_records, split)
+
+    def _read_txt(self, input_dir):
+        """Load every file matching `<input_dir>/*txt` as JSON and store its path under "race_id"."""
+        records = []
+        for path in tqdm.tqdm(glob.glob(input_dir + "/*txt"), desc="read files"):
+            with open(path, encoding="utf-8") as fin:
+                record = json.load(fin)
+            record["race_id"] = path
+            records.append(record)
+        return records
+
+    def _create_examples(self, lines, set_type):
+        """Build one `InputExample` per question of every loaded article."""
+        examples = []
+        for data_raw in lines:
+            race_id = f"{set_type}-{data_raw['race_id']}"
+            article = data_raw["article"]
+            for idx, answer in enumerate(data_raw["answers"]):
+                # Answers are letters; "A" maps to "0", "B" to "1", and so on.
+                truth = str(ord(answer) - ord("A"))
+                question = data_raw["questions"][idx]
+                options = data_raw["options"][idx]
+
+                example = InputExample(
+                    example_id=race_id,
+                    question=question,
+                    contexts=[article] * 4,  # this is not efficient but convenient
+                    endings=[options[k] for k in range(4)],
+                    label=truth,
+                )
+                examples.append(example)
+        return examples
+
+
+class SynonymProcessor(DataProcessor):
+    """Processor for the Synonym data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        # The log message names the split actually being read.
+        logger.info(f"LOOKING AT {data_dir} test")
+
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3", "4"]
+
+    def _read_csv(self, input_file):
+        """Return all rows of the UTF-8 CSV file `input_file` as lists of strings."""
+        with open(input_file, encoding="utf-8") as f:
+            return list(csv.reader(f))
+
+    def _create_examples(self, lines: list[list[str]], type: str):
+        """Build one five-choice `InputExample` per CSV row.
+
+        Rows are read positionally: column 0 is the example id, column 1 the context
+        (repeated for every choice), columns 2-6 the five endings and column 7 the label.
+        Every row is converted; no header row is skipped. `type` is accepted but not used.
+        """
+
+        examples = [
+            InputExample(
+                example_id=line[0],
+                question="",  # the question is left empty for this task
+                contexts=[line[1], line[1], line[1], line[1], line[1]],
+                endings=[line[2], line[3], line[4], line[5], line[6]],
+                label=line[7],
+            )
+            for line in lines
+        ]
+
+        return examples
+
+
+class SwagProcessor(DataProcessor):
+    """Processor for the SWAG data set."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """Always raises: the test split is not supported by this processor.
+
+        Raises:
+            ValueError: unconditionally.
+        """
+        # The log message names the split actually requested.
+        logger.info(f"LOOKING AT {data_dir} test")
+        raise ValueError(
+            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
+            "setting!"
+        )
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_csv(self, input_file):
+        """Return all rows of the UTF-8 CSV file `input_file` as lists of strings."""
+        with open(input_file, encoding="utf-8") as f:
+            return list(csv.reader(f))
+
+    def _create_examples(self, lines: list[list[str]], type: str):
+        """Build one four-choice `InputExample` per CSV row, skipping the first (header) row.
+
+        Args:
+            lines: all CSV rows, header row first.
+            type: split name; for "train" the header's last column must be "label".
+
+        Raises:
+            ValueError: if `type` is "train" and the header row is missing, empty, or does
+                not end with "label".
+        """
+        # An empty file or empty header row is reported as a missing label column
+        # instead of failing with an IndexError.
+        if type == "train" and (not lines or not lines[0] or lines[0][-1] != "label"):
+            raise ValueError("For training, the input file must contain a label column.")
+
+        examples = [
+            InputExample(
+                example_id=line[2],
+                question=line[5],  # in the swag dataset, the
+                # common beginning of each
+                # choice is stored in "sent2".
+                contexts=[line[4], line[4], line[4], line[4]],
+                endings=[line[7], line[8], line[9], line[10]],
+                label=line[11],
+            )
+            for line in lines[1:]  # we skip the line with the column names
+        ]
+
+        return examples
+
+
+class ArcProcessor(DataProcessor):
+    """Processor for the ARC data set (request from allennlp)."""
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} train")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} dev")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
+
+    def get_test_examples(self, data_dir):
+        """See base class."""
+        logger.info(f"LOOKING AT {data_dir} test")
+        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
+
+    def get_labels(self):
+        """See base class."""
+        return ["0", "1", "2", "3"]
+
+    def _read_json(self, input_file):
+        """Return the raw lines of the UTF-8 file `input_file` (one JSON document per line)."""
+        with open(input_file, encoding="utf-8") as fin:
+            lines = fin.readlines()
+            return lines
+
+    def _create_examples(self, lines, type):
+        """Build `InputExample`s from JSON lines, keeping only questions with exactly four choices.
+
+        Args:
+            lines: iterable of JSON strings; blank lines are ignored.
+            type: split name; for "train" the result is additionally sanity-checked.
+
+        Returns:
+            list of `InputExample` whose label is "0".."3".
+
+        Raises:
+            AssertionError: if an answer key is not one of A-D / 1-4, or if `type` is
+                "train" and fewer than two examples were produced.
+        """
+
+        # There are two types of labels. They should be normalized
+        def normalize(truth):
+            # Compare against whole single-character keys. A substring test against "ABCD"
+            # would also accept "" and multi-character strings such as "AB".
+            if truth in ("A", "B", "C", "D"):
+                return ord(truth) - ord("A")
+            elif truth in ("1", "2", "3", "4"):
+                return int(truth) - 1
+            else:
+                logger.info("truth ERROR! %s", str(truth))
+                return None
+
+        examples = []
+        three_choice = 0
+        four_choice = 0
+        five_choice = 0
+        other_choices = 0
+        # we deleted example which has more than or less than four choices
+        for line in tqdm.tqdm(lines, desc="read arc data"):
+            if not line.strip():
+                # A blank line carries no JSON document.
+                continue
+            data_raw = json.loads(line.strip("\n"))
+            question_choices = data_raw["question"]
+            options = question_choices["choices"]
+            if len(options) == 3:
+                three_choice += 1
+                continue
+            elif len(options) == 5:
+                five_choice += 1
+                continue
+            elif len(options) != 4:
+                other_choices += 1
+                continue
+            four_choice += 1
+            truth = str(normalize(data_raw["answerKey"]))
+            assert truth != "None"
+            examples.append(
+                InputExample(
+                    example_id=data_raw["id"],
+                    question=question_choices["stem"],
+                    contexts=[option["para"].replace("_", "") for option in options],
+                    endings=[option["text"] for option in options],
+                    label=truth,
+                )
+            )
+
+        if type == "train":
+            assert len(examples) > 1
+            assert examples[0].label is not None
+        logger.info("len examples: %s", str(len(examples)))
+        logger.info("Three choices: %s", str(three_choice))
+        logger.info("Five choices: %s", str(five_choice))
+        logger.info("Other choices: %s", str(other_choices))
+        logger.info("four choices: %s", str(four_choice))
+
+        return examples
+
+
+def convert_examples_to_features(
+    examples: list[InputExample],
+    label_list: list[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+) -> list[InputFeatures]:
+    """
+    Encode every (context, question + ending) pair of each example into `InputFeatures`.
+
+    Args:
+        examples: the examples to convert.
+        label_list: all label strings; a label's position in this list becomes its integer id.
+        max_length: passed to the tokenizer as `max_length`, with padding to that length
+            and truncation enabled.
+        tokenizer: called on each `(text_a, text_b)` pair to produce the encoded inputs.
+
+    Returns:
+        One `InputFeatures` per example, holding one encoded sequence per choice. `label` is
+        `None` for an example that carries no label.
+
+    Raises:
+        KeyError: if an example has a label that is not in `label_list`.
+    """
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    # `total` is passed explicitly because enumerate() exposes no length for tqdm to read.
+    progress = tqdm.tqdm(enumerate(examples), total=len(examples), desc="convert examples to features")
+    for ex_index, example in progress:
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d", ex_index, len(examples))
+        choices_inputs = []
+        for context, ending in zip(example.contexts, example.endings):
+            text_a = context
+            if "_" in example.question:
+                # this is for cloze question
+                text_b = example.question.replace("_", ending)
+            else:
+                text_b = example.question + " " + ending
+
+            inputs = tokenizer(
+                text_a,
+                text_b,
+                add_special_tokens=True,
+                max_length=max_length,
+                padding="max_length",
+                truncation=True,
+                return_overflowing_tokens=True,
+            )
+            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+                logger.info(
+                    "Attention! you are cropping tokens (swag task is ok). "
+                    "If you are training ARC and RACE and you are popping question + options, "
+                    "you need to try to use a bigger max seq length!"
+                )
+
+            choices_inputs.append(inputs)
+
+        # `InputExample.label` is optional; an unlabeled example yields label None
+        # instead of a KeyError on `label_map[None]`.
+        label = label_map[example.label] if example.label is not None else None
+
+        input_ids = [x["input_ids"] for x in choices_inputs]
+        attention_mask = (
+            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
+        )
+        token_type_ids = (
+            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
+        )
+
+        features.append(
+            InputFeatures(
+                example_id=example.example_id,
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                label=label,
+            )
+        )
+
+    # Log the first two converted features as a sanity check.
+    for f in features[:2]:
+        logger.info("*** Example ***")
+        logger.info("feature: %s", f)
+
+    return features
+
+
+# Maps a task name to the processor class that reads its data files.
+processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
+# Number of labels per task, equal to the length of each processor's `get_labels()` list.
+# This must be a dict (`name: count`); a `{"race", 4, ...}` literal would build a set instead.
+MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race": 4, "swag": 4, "arc": 4, "syn": 5}
--- a/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/examples/legacy/pytorch-lightning/lightning_base.py
@ -0,0 +1,397 @@
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Any
+
+import pytorch_lightning as pl
+from pytorch_lightning.utilities import rank_zero_info
+
+from transformers import (
+    AutoConfig,
+    AutoModel,
+    AutoModelForPreTraining,
+    AutoModelForQuestionAnswering,
+    AutoModelForSeq2SeqLM,
+    AutoModelForSequenceClassification,
+    AutoModelForTokenClassification,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+    PreTrainedConfig,
+    PreTrainedTokenizer,
+    is_torch_available,
+)
+from transformers.optimization import (
+    Adafactor,
+    get_cosine_schedule_with_warmup,
+    get_cosine_with_hard_restarts_schedule_with_warmup,
+    get_linear_schedule_with_warmup,
+    get_polynomial_decay_schedule_with_warmup,
+)
+from transformers.utils.versions import require_version
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.getLogger(__name__)
+
+require_version("pytorch_lightning>=1.0.4")
+
+# Maps the `mode` argument of `BaseTransformer.__init__` to the auto-model class whose `from_pretrained`
+# is used to load the model. "summarization" and "translation" share the same seq2seq class.
+MODEL_MODES = {
+    "base": AutoModel,
+    "sequence-classification": AutoModelForSequenceClassification,
+    "question-answering": AutoModelForQuestionAnswering,
+    "pretraining": AutoModelForPreTraining,
+    "token-classification": AutoModelForTokenClassification,
+    "language-modeling": AutoModelWithLMHead,
+    "summarization": AutoModelForSeq2SeqLM,
+    "translation": AutoModelForSeq2SeqLM,
+}
+
+
+# Registry of the learning-rate schedules selectable through `--lr_scheduler`.
+# update this and the import above to support new schedulers from transformers.optimization
+arg_to_scheduler = dict(
+    linear=get_linear_schedule_with_warmup,
+    cosine=get_cosine_schedule_with_warmup,
+    cosine_w_restarts=get_cosine_with_hard_restarts_schedule_with_warmup,
+    polynomial=get_polynomial_decay_schedule_with_warmup,
+    # get_constant_schedule and get_constant_schedule_with_warmup are not supported for now
+)
+# Sorted scheduler names (argparse `choices`) and their "{a, b, ...}" rendering (argparse `metavar`).
+arg_to_scheduler_choices = sorted(arg_to_scheduler)
+arg_to_scheduler_metavar = "{%s}" % ", ".join(arg_to_scheduler_choices)
+
+
+class BaseTransformer(pl.LightningModule):
+    """
+    Shared LightningModule for the legacy examples: owns a config, a tokenizer and a model selected by `mode`.
+
+    Subclasses must implement `get_dataloader`. `test_step` delegates to `validation_step`, which this class
+    does not define, so subclasses are expected to provide it as well.
+    """
+
+    def __init__(
+        self,
+        hparams: argparse.Namespace,
+        num_labels=None,
+        mode="base",
+        config=None,
+        tokenizer=None,
+        model=None,
+        **config_kwargs,
+    ):
+        """
+        Initialize a model, tokenizer and config.
+
+        Args:
+            hparams: Parsed command-line arguments; stored through `save_hyperparameters`.
+            num_labels: When not None, forwarded to `AutoConfig.from_pretrained` as `num_labels`.
+            mode: Key of `MODEL_MODES` selecting the auto-model class.
+            config: Ready-made config; when None it is loaded from `config_name` or `model_name_or_path`.
+            tokenizer: Ready-made tokenizer; when None it is loaded from `tokenizer_name` or `model_name_or_path`.
+            model: Ready-made model; when None it is loaded from `model_name_or_path`.
+            **config_kwargs: Extra keyword arguments for `AutoConfig.from_pretrained` (used only if `config` is None).
+        """
+        super().__init__()
+        # TODO: move to self.save_hyperparameters()
+        # self.save_hyperparameters()
+        # can also expand arguments into trainer signature for easier reading
+
+        self.save_hyperparameters(hparams)
+        self.step_count = 0
+        self.output_dir = Path(self.hparams.output_dir)
+        # An empty `--cache_dir` (the argparse default) is normalized to None.
+        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
+        if config is None:
+            self.config = AutoConfig.from_pretrained(
+                self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
+                **({"num_labels": num_labels} if num_labels is not None else {}),
+                cache_dir=cache_dir,
+                **config_kwargs,
+            )
+        else:
+            self.config: PreTrainedConfig = config
+
+        # Optional dropout-style overrides: copied from hparams into the config only when set to a truthy value,
+        # and only if the config already exposes the attribute.
+        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
+        for p in extra_model_params:
+            if getattr(self.hparams, p, None):
+                assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
+                setattr(self.config, p, getattr(self.hparams, p))
+
+        if tokenizer is None:
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
+                cache_dir=cache_dir,
+            )
+        else:
+            self.tokenizer: PreTrainedTokenizer = tokenizer
+        self.model_type = MODEL_MODES[mode]
+        if model is None:
+            self.model = self.model_type.from_pretrained(
+                self.hparams.model_name_or_path,
+                # A ".ckpt" substring in the model path is treated as a TensorFlow checkpoint.
+                from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
+                config=self.config,
+                cache_dir=cache_dir,
+            )
+        else:
+            self.model = model
+
+    def load_hf_checkpoint(self, *args, **kwargs):
+        """Replace `self.model` with one loaded through `self.model_type.from_pretrained(*args, **kwargs)`."""
+        self.model = self.model_type.from_pretrained(*args, **kwargs)
+
+    def get_lr_scheduler(self):
+        """
+        Build the scheduler named by `hparams.lr_scheduler` for the optimizer stored in `self.opt`.
+
+        Returns:
+            A dict with the scheduler, stepped every training step (`interval="step"`, `frequency=1`).
+        """
+        get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
+        scheduler = get_schedule_func(
+            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
+        )
+        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
+        return scheduler
+
+    def configure_optimizers(self):
+        """Prepare optimizer and schedule (linear warmup and decay)"""
+        model = self.model
+        # Parameters whose name contains one of these substrings get no weight decay.
+        no_decay = ["bias", "LayerNorm.weight"]
+        optimizer_grouped_parameters = [
+            {
+                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+                "weight_decay": self.hparams.weight_decay,
+            },
+            {
+                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
+                "weight_decay": 0.0,
+            },
+        ]
+        if self.hparams.adafactor:
+            optimizer = Adafactor(
+                optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
+            )
+
+        else:
+            optimizer = torch.optim.AdamW(
+                optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
+            )
+        # Kept on the instance because `get_lr_scheduler` reads it.
+        self.opt = optimizer
+
+        scheduler = self.get_lr_scheduler()
+
+        return [optimizer], [scheduler]
+
+    def test_step(self, batch, batch_nb):
+        """Run a test batch through the subclass' `validation_step`."""
+        return self.validation_step(batch, batch_nb)
+
+    def test_epoch_end(self, outputs):
+        """Aggregate test outputs with the same logic as validation (`validation_epoch_end`)."""
+        return self.validation_epoch_end(outputs)
+
+    def total_steps(self) -> int:
+        """The number of total training steps that will be run. Used for lr scheduler purposes."""
+        num_devices = max(1, self.hparams.gpus)  # TODO: consider num_tpu_cores
+        effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
+        # NOTE(review): `/` is true division, so this actually returns a float despite the `int` annotation.
+        return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
+
+    def setup(self, mode):
+        """
+        Record `self.dataset_size` for the current stage.
+
+        For `mode == "test"` the size comes from the test dataloader; otherwise the training dataloader is
+        built once, cached in `self.train_loader`, and its dataset size is used.
+        """
+        if mode == "test":
+            self.dataset_size = len(self.test_dataloader().dataset)
+        else:
+            self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
+            self.dataset_size = len(self.train_dataloader().dataset)
+
+    def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
+        """Return the dataloader for the split `type_path`; must be implemented by subclasses."""
+        raise NotImplementedError("You must implement this for your task")
+
+    def train_dataloader(self):
+        """Return the training dataloader cached by `setup`."""
+        return self.train_loader
+
+    def val_dataloader(self):
+        """Build the (unshuffled) dataloader for the "dev" split."""
+        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)
+
+    def test_dataloader(self):
+        """Build the (unshuffled) dataloader for the "test" split."""
+        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)
+
+    def _feature_file(self, mode):
+        """
+        Return the path of the cached features file for `mode`.
+
+        The file lives in `hparams.data_dir` and is named `cached_<mode>_<model>_<max_seq_length>`, where
+        `<model>` is the last non-empty "/"-separated component of `model_name_or_path`.
+        """
+        return os.path.join(
+            self.hparams.data_dir,
+            "cached_{}_{}_{}".format(
+                mode,
+                list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
+                str(self.hparams.max_seq_length),
+            ),
+        )
+
+    @pl.utilities.rank_zero_only
+    def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
+        """Save the model and tokenizer to `<output_dir>/best_tfmr`, recording `step_count` in the model config."""
+        save_path = self.output_dir.joinpath("best_tfmr")
+        self.model.config.save_step = self.step_count
+        self.model.save_pretrained(save_path)
+        self.tokenizer.save_pretrained(save_path)
+
+    @staticmethod
+    def add_model_specific_args(parser, root_dir):
+        """
+        Register the model, optimizer and scheduler options on `parser`.
+
+        Args:
+            parser: The `argparse.ArgumentParser` to extend in place.
+            root_dir: Not used by this method.
+
+        Returns:
+            The same `parser`, matching what the subclass overrides return.
+        """
+        parser.add_argument(
+            "--model_name_or_path",
+            default=None,
+            type=str,
+            required=True,
+            help="Path to pretrained model or model identifier from huggingface.co/models",
+        )
+        parser.add_argument(
+            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+        )
+        parser.add_argument(
+            "--tokenizer_name",
+            default=None,
+            type=str,
+            help="Pretrained tokenizer name or path if not the same as model_name",
+        )
+        parser.add_argument(
+            "--cache_dir",
+            default="",
+            type=str,
+            help="Where do you want to store the pre-trained models downloaded from huggingface.co",
+        )
+        parser.add_argument(
+            "--encoder_layerdrop",
+            type=float,
+            help="Encoder layer dropout probability (Optional). Goes into model.config",
+        )
+        parser.add_argument(
+            "--decoder_layerdrop",
+            type=float,
+            help="Decoder layer dropout probability (Optional). Goes into model.config",
+        )
+        parser.add_argument(
+            "--dropout",
+            type=float,
+            help="Dropout probability (Optional). Goes into model.config",
+        )
+        parser.add_argument(
+            "--attention_dropout",
+            type=float,
+            help="Attention dropout probability (Optional). Goes into model.config",
+        )
+        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+        parser.add_argument(
+            "--lr_scheduler",
+            default="linear",
+            choices=arg_to_scheduler_choices,
+            metavar=arg_to_scheduler_metavar,
+            type=str,
+            help="Learning rate scheduler",
+        )
+        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+        parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
+        # Stored as `max_epochs`, the name `total_steps` reads.
+        parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
+        parser.add_argument("--train_batch_size", default=32, type=int)
+        parser.add_argument("--eval_batch_size", default=32, type=int)
+        parser.add_argument("--adafactor", action="store_true")
+
+        return parser
+
+
+class LoggingCallback(pl.Callback):
+    """Callback that logs learning rates after each batch and reports metrics after validation and test."""
+
+    def on_batch_end(self, trainer, pl_module):
+        """Log the learning rate of every parameter group of the first scheduler as `lr_group_<i>`."""
+        lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
+        # `get_last_lr()` is the supported way to read the current rates; `get_lr()` is meant to be called
+        # from within `step()` and warns otherwise.
+        lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_last_lr())}
+        pl_module.logger.log_metrics(lrs)
+
+    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
+        """Print every callback metric (except the "log" and "progress_bar" entries) on rank zero."""
+        rank_zero_info("***** Validation results *****")
+        metrics = trainer.callback_metrics
+        # Log results
+        for key in sorted(metrics):
+            if key not in ["log", "progress_bar"]:
+                rank_zero_info(f"{key} = {str(metrics[key])}\n")
+
+    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
+        """Print the test metrics and write them to `<output_dir>/test_results.txt`, one `key = value` per line."""
+        rank_zero_info("***** Test results *****")
+        metrics = trainer.callback_metrics
+        # Log and save results to file
+        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
+        with open(output_test_results_file, "w") as writer:
+            for key in sorted(metrics):
+                if key not in ["log", "progress_bar"]:
+                    rank_zero_info(f"{key} = {str(metrics[key])}\n")
+                    writer.write(f"{key} = {str(metrics[key])}\n")
+
+
+def add_generic_args(parser, root_dir) -> None:
+    """
+    Register the task-independent training options on `parser` (modified in place).
+
+    Args:
+        parser: The `argparse.ArgumentParser` receiving the options.
+        root_dir: Not used by this function.
+    """
+    #  To allow all pl args uncomment the following line
+    #  parser = pl.Trainer.add_argparse_args(parser)
+    register = parser.add_argument
+
+    # Output location and mixed-precision settings.
+    register(
+        "--output_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    register(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    fp16_opt_level_help = (
+        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
+        "See details at https://nvidia.github.io/apex/amp.html"
+    )
+    register("--fp16_opt_level", type=str, default="O2", help=fp16_opt_level_help)
+
+    # Options stored under a different attribute name (`dest`) than their flag.
+    register("--n_tpu_cores", dest="tpu_cores", type=int)
+    register("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
+
+    # Which phases to run.
+    register("--do_train", action="store_true", help="Whether to run training.")
+    register("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
+
+    register(
+        "--gradient_accumulation_steps",
+        dest="accumulate_grad_batches",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    register("--seed", type=int, default=42, help="random seed for initialization")
+    register(
+        "--data_dir",
+        type=str,
+        default=None,
+        required=True,
+        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
+    )
+
+
+def generic_train(
+    model: BaseTransformer,
+    args: argparse.Namespace,
+    early_stopping_callback=None,
+    logger=True,  # can pass WandbLogger() here
+    extra_callbacks=[],
+    checkpoint_callback=None,
+    logging_callback=None,
+    **extra_train_kwargs,
+):
+    """
+    Build a `pl.Trainer` from `args` and, when `args.do_train` is set, fit `model` with it.
+
+    Args:
+        model: The module to train.
+        args: Parsed command-line arguments (seed, output_dir, fp16, gpus, ...).
+        early_stopping_callback: Optional callback appended after `extra_callbacks`.
+        logger: Forwarded to the trainer as `logger`.
+        extra_callbacks: Additional callbacks; the passed-in list is never modified.
+        checkpoint_callback: Checkpoint callback; by default a `ModelCheckpoint` monitoring `val_loss`.
+        logging_callback: Logging callback; by default a `LoggingCallback`.
+        **extra_train_kwargs: Only the `accelerator` and `profiler` entries are read.
+
+    Returns:
+        The constructed `pl.Trainer`.
+    """
+    pl.seed_everything(args.seed)
+
+    # init model
+    odir = Path(model.hparams.output_dir)
+    # `parents=True` so a nested, not-yet-existing output directory does not abort the run.
+    odir.mkdir(parents=True, exist_ok=True)
+
+    # add custom checkpoints
+    if checkpoint_callback is None:
+        checkpoint_callback = pl.callbacks.ModelCheckpoint(
+            filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
+        )
+    if logging_callback is None:
+        logging_callback = LoggingCallback()
+
+    # Assemble a fresh list rather than appending to `extra_callbacks`: its default `[]` is shared between
+    # calls, so mutating it would accumulate early-stopping callbacks across invocations.
+    callbacks = [logging_callback, *(extra_callbacks or [])]
+    if early_stopping_callback:
+        callbacks.append(early_stopping_callback)
+
+    train_params = {}
+
+    # TODO: remove with PyTorch 1.6 since pl uses native amp
+    if args.fp16:
+        train_params["precision"] = 16
+        train_params["amp_level"] = args.fp16_opt_level
+
+    if args.gpus > 1:
+        train_params["distributed_backend"] = "ddp"
+
+    train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
+    train_params["accelerator"] = extra_train_kwargs.get("accelerator")
+    train_params["profiler"] = extra_train_kwargs.get("profiler")
+
+    trainer = pl.Trainer.from_argparse_args(
+        args,
+        weights_summary=None,
+        callbacks=callbacks,
+        logger=logger,
+        checkpoint_callback=checkpoint_callback,
+        **train_params,
+    )
+
+    if args.do_train:
+        trainer.fit(model)
+
+    return trainer
--- a/examples/legacy/pytorch-lightning/requirements.txt
+++ b/examples/legacy/pytorch-lightning/requirements.txt
@ -0,0 +1,21 @@
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.1.3
+fire
+pytest<8.0.1
+conllu
+sentencepiece != 0.1.92
+protobuf
+ray
--- a/examples/legacy/pytorch-lightning/run_glue.py
+++ b/examples/legacy/pytorch-lightning/run_glue.py
@ -0,0 +1,201 @@
+import argparse
+import glob
+import logging
+import os
+import time
+from argparse import Namespace
+
+import numpy as np
+import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
+from torch.utils.data import DataLoader, TensorDataset
+
+from transformers import glue_compute_metrics as compute_metrics
+from transformers import glue_convert_examples_to_features as convert_examples_to_features
+from transformers import glue_output_modes, glue_tasks_num_labels
+from transformers import glue_processors as processors
+
+
+logger = logging.getLogger(__name__)
+
+
+class GLUETransformer(BaseTransformer):
+    """
+    LightningModule fine-tuning a sequence-classification model on a GLUE task.
+
+    The task is selected with `hparams.task`, which indexes the GLUE processors, output modes and label counts.
+    """
+
+    mode = "sequence-classification"
+
+    def __init__(self, hparams):
+        """Resolve the task's output mode and label count, then let the base class build config/tokenizer/model."""
+        if isinstance(hparams, dict):
+            hparams = Namespace(**hparams)
+        hparams.glue_output_mode = glue_output_modes[hparams.task]
+        num_labels = glue_tasks_num_labels[hparams.task]
+
+        super().__init__(hparams, num_labels, self.mode)
+
+    def forward(self, **inputs):
+        """Forward the keyword inputs to the wrapped model."""
+        return self.model(**inputs)
+
+    def training_step(self, batch, batch_idx):
+        """
+        Compute the loss for one batch.
+
+        `batch` is indexed positionally as (input_ids, attention_mask, token_type_ids, labels), the column
+        order produced by `get_dataloader`.
+        """
+        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+
+        # distilbert/bart get no `token_type_ids` argument at all; other models get real segment ids only
+        # for bert/xlnet/albert and None otherwise.
+        if self.config.model_type not in ["distilbert", "bart"]:
+            inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
+
+        outputs = self(**inputs)
+        loss = outputs[0]
+
+        lr_scheduler = self.trainer.lr_schedulers[0]["scheduler"]
+        tensorboard_logs = {"loss": loss, "rate": lr_scheduler.get_last_lr()[-1]}
+        return {"loss": loss, "log": tensorboard_logs}
+
+    def prepare_data(self):
+        "Called to initialize data. Use the call to construct features"
+        args = self.hparams
+        processor = processors[args.task]()
+        self.labels = processor.get_labels()
+
+        for mode in ["train", "dev"]:
+            cached_features_file = self._feature_file(mode)
+            # An existing cache is reused unless `--overwrite_cache` is given; nothing is loaded here, the
+            # file is read later by `get_dataloader`.
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                logger.info("Loading features from cached file %s", cached_features_file)
+            else:
+                logger.info("Creating features from dataset file at %s", args.data_dir)
+                examples = (
+                    processor.get_dev_examples(args.data_dir)
+                    if mode == "dev"
+                    else processor.get_train_examples(args.data_dir)
+                )
+                features = convert_examples_to_features(
+                    examples,
+                    self.tokenizer,
+                    max_length=args.max_seq_length,
+                    label_list=self.labels,
+                    output_mode=args.glue_output_mode,
+                )
+                logger.info("Saving features into cached file %s", cached_features_file)
+                torch.save(features, cached_features_file)
+
+    def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
+        """
+        Load datasets. Called after prepare data.
+
+        Args:
+            mode: Split name; "test" is served from the "dev" cache.
+            batch_size: Batch size of the returned loader.
+            shuffle: Whether the loader shuffles the examples.
+
+        Returns:
+            A `DataLoader` over (input_ids, attention_mask, token_type_ids, labels) tensors.
+
+        Raises:
+            ValueError: If `hparams.glue_output_mode` is neither "classification" nor "regression".
+        """
+
+        # We test on dev set to compare to benchmarks without having to submit to GLUE server
+        mode = "dev" if mode == "test" else mode
+
+        cached_features_file = self._feature_file(mode)
+        logger.info("Loading features from cached file %s", cached_features_file)
+        # NOTE(review): `prepare_data` saves the feature objects themselves with `torch.save`; confirm that
+        # their type can be restored under `weights_only=True`.
+        features = torch.load(cached_features_file, weights_only=True)
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        if any(f.token_type_ids is None for f in features):
+            # Features without segment ids cannot be turned into a tensor. Use an all-zero placeholder so the
+            # dataset keeps its four columns (the same fallback idea as the NER example).
+            all_token_type_ids = torch.zeros_like(all_input_ids)
+        else:
+            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+
+        output_mode = self.hparams.glue_output_mode
+        if output_mode == "classification":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
+        elif output_mode == "regression":
+            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
+        else:
+            # Fail with a clear message instead of an unbound `all_labels` below.
+            raise ValueError(f"Unsupported GLUE output mode: {output_mode!r}")
+
+        return DataLoader(
+            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
+            batch_size=batch_size,
+            shuffle=shuffle,
+        )
+
+    def validation_step(self, batch, batch_idx):
+        """Evaluate one batch and return its loss, raw logits ("pred") and labels ("target") on CPU."""
+        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+
+        if self.config.model_type not in ["distilbert", "bart"]:
+            inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
+
+        outputs = self(**inputs)
+        tmp_eval_loss, logits = outputs[:2]
+        preds = logits.detach().cpu().numpy()
+        out_label_ids = inputs["labels"].detach().cpu().numpy()
+
+        return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
+
+    def _eval_end(self, outputs) -> tuple:
+        """
+        Aggregate per-batch outputs into the mean loss and the GLUE metrics.
+
+        Returns:
+            `(ret, preds_list, out_label_list)` where `ret` holds the metrics plus a "log" copy of them.
+            The two lists contain one empty list per example; they are not filled by this method.
+        """
+        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean().detach().cpu().item()
+        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
+
+        if self.hparams.glue_output_mode == "classification":
+            preds = np.argmax(preds, axis=1)
+        elif self.hparams.glue_output_mode == "regression":
+            preds = np.squeeze(preds)
+
+        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
+        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
+        preds_list = [[] for _ in range(out_label_ids.shape[0])]
+
+        results = {"val_loss": val_loss_mean, **compute_metrics(self.hparams.task, preds, out_label_ids)}
+
+        ret = dict(results.items())
+        ret["log"] = results
+        return ret, preds_list, out_label_list
+
+    def validation_epoch_end(self, outputs: list) -> dict:
+        """Return the validation loss together with the metrics for logging and the progress bar."""
+        ret, preds, targets = self._eval_end(outputs)
+        logs = ret["log"]
+        return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
+
+    def test_epoch_end(self, outputs) -> dict:
+        """Return the test loss (as `avg_test_loss`) together with the metrics for logging and the progress bar."""
+        ret, predictions, targets = self._eval_end(outputs)
+        logs = ret["log"]
+        # `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
+        return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
+
+    @staticmethod
+    def add_model_specific_args(parser, root_dir):
+        """Register the base model options plus the GLUE-specific ones on `parser` and return it."""
+        BaseTransformer.add_model_specific_args(parser, root_dir)
+        parser.add_argument(
+            "--max_seq_length",
+            default=128,
+            type=int,
+            help=(
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            ),
+        )
+
+        parser.add_argument(
+            "--task",
+            default="",
+            type=str,
+            required=True,
+            help="The GLUE task to run",
+        )
+        parser.add_argument(
+            "--gpus",
+            default=0,
+            type=int,
+            help="The number of GPUs allocated for this, it is by default 0 meaning none",
+        )
+
+        parser.add_argument(
+            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+        )
+
+        return parser
+
+
+def main():
+    """
+    Command-line entry point: parse the arguments, train a `GLUETransformer` and optionally evaluate it.
+
+    Returns:
+        The result of `trainer.test` when `--do_predict` is given, otherwise None.
+
+    Raises:
+        FileNotFoundError: If `--do_predict` is given but no checkpoint exists in the output directory.
+    """
+    parser = argparse.ArgumentParser()
+    add_generic_args(parser, os.getcwd())
+    parser = GLUETransformer.add_model_specific_args(parser, os.getcwd())
+    args = parser.parse_args()
+
+    # If output_dir not provided, a folder will be generated in pwd
+    # NOTE(review): `add_generic_args` registers `--output_dir` as required, so this branch only runs if that
+    # requirement is relaxed.
+    if args.output_dir is None:
+        args.output_dir = os.path.join(
+            "./results",
+            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
+        )
+        os.makedirs(args.output_dir)
+
+    model = GLUETransformer(args)
+    trainer = generic_train(model, args)
+
+    # Optionally, predict on dev set and write to output_dir
+    if args.do_predict:
+        checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
+        if not checkpoints:
+            # Report the missing checkpoint explicitly instead of failing with an IndexError below.
+            raise FileNotFoundError(
+                f"No 'checkpoint-epoch=*.ckpt' file found in {args.output_dir}; run with --do_train first."
+            )
+        # The lexicographically last checkpoint path is the one evaluated.
+        model = model.load_from_checkpoint(checkpoints[-1])
+        return trainer.test(model)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/pytorch-lightning/run_glue.sh
+++ b/examples/legacy/pytorch-lightning/run_glue.sh
@ -0,0 +1,34 @@
+# Fine-tune and evaluate a BERT model on the GLUE MRPC task with run_glue.py.
+# Variable expansions are quoted so that paths containing spaces (OUTPUT_DIR is derived from $PWD) survive.
+
+# Install example requirements
+pip install -r ../requirements.txt
+
+# Download glue data
+python3 ../../utils/download_glue_data.py
+
+export TASK=mrpc
+export DATA_DIR=./glue_data/MRPC/
+export MAX_LENGTH=128
+export LEARNING_RATE=2e-5
+export BERT_MODEL=bert-base-cased
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SEED=2
+export OUTPUT_DIR_NAME=mrpc-pl-bert
+export CURRENT_DIR="${PWD}"
+export OUTPUT_DIR="${CURRENT_DIR}/${OUTPUT_DIR_NAME}"
+
+# Make output directory if it doesn't exist
+mkdir -p "$OUTPUT_DIR"
+# Add parent directory to python path to access lightning_base.py
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+python3 run_glue.py --gpus 1 --data_dir "$DATA_DIR" \
+--task "$TASK" \
+--model_name_or_path "$BERT_MODEL" \
+--output_dir "$OUTPUT_DIR" \
+--max_seq_length  "$MAX_LENGTH" \
+--learning_rate "$LEARNING_RATE" \
+--num_train_epochs "$NUM_EPOCHS" \
+--train_batch_size "$BATCH_SIZE" \
+--seed "$SEED" \
+--do_train \
+--do_predict
--- a/examples/legacy/pytorch-lightning/run_ner.py
+++ b/examples/legacy/pytorch-lightning/run_ner.py
@ -0,0 +1,216 @@
+import argparse
+import glob
+import logging
+import os
+from argparse import Namespace
+from importlib import import_module
+
+import numpy as np
+import torch
+from lightning_base import BaseTransformer, add_generic_args, generic_train
+from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader, TensorDataset
+from utils_ner import TokenClassificationTask
+
+
+logger = logging.getLogger(__name__)
+
+
+class NERTransformer(BaseTransformer):
+    """
+    A training module for NER. See BaseTransformer for the core options.
+    """
+
+    mode = "token-classification"
+
+    def __init__(self, hparams):
+        """
+        Instantiate the token-classification task named by `hparams.task_type` and build the model.
+
+        Raises:
+            ValueError: If looking up or instantiating the task class in the `tasks` module raises
+                `AttributeError`.
+        """
+        if isinstance(hparams, dict):
+            hparams = Namespace(**hparams)
+        module = import_module("tasks")
+        try:
+            token_classification_task_clazz = getattr(module, hparams.task_type)
+            self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
+        except AttributeError:
+            raise ValueError(
+                f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
+                f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
+            )
+        self.labels = self.token_classification_task.get_labels(hparams.labels)
+        # Label id used for padded positions: the index that `CrossEntropyLoss` ignores.
+        self.pad_token_label_id = CrossEntropyLoss().ignore_index
+        super().__init__(hparams, len(self.labels), self.mode)
+
+    def forward(self, **inputs):
+        """Forward the keyword inputs to the wrapped model."""
+        return self.model(**inputs)
+
+    def training_step(self, batch, batch_num):
+        "Compute loss and log."
+        # `batch` columns follow `get_dataloader`: (input_ids, attention_mask, token_type_ids, label_ids).
+        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+        if self.config.model_type != "distilbert":
+            inputs["token_type_ids"] = (
+                batch[2] if self.config.model_type in ["bert", "xlnet"] else None
+            )  # XLM and RoBERTa don't use token_type_ids
+
+        outputs = self(**inputs)
+        loss = outputs[0]
+        # tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
+        return {"loss": loss}
+
+    def prepare_data(self):
+        "Called to initialize data. Use the call to construct features"
+        args = self.hparams
+        for mode in ["train", "dev", "test"]:
+            cached_features_file = self._feature_file(mode)
+            # An existing cache is reused unless `--overwrite_cache` is given.
+            if os.path.exists(cached_features_file) and not args.overwrite_cache:
+                logger.info("Loading features from cached file %s", cached_features_file)
+                features = torch.load(cached_features_file, weights_only=True)
+            else:
+                logger.info("Creating features from dataset file at %s", args.data_dir)
+                examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
+                # The xlnet-specific settings below move the CLS token to the end, use segment id 2 for it
+                # and pad on the left.
+                features = self.token_classification_task.convert_examples_to_features(
+                    examples,
+                    self.labels,
+                    args.max_seq_length,
+                    self.tokenizer,
+                    cls_token_at_end=bool(self.config.model_type == "xlnet"),
+                    cls_token=self.tokenizer.cls_token,
+                    cls_token_segment_id=2 if self.config.model_type == "xlnet" else 0,
+                    sep_token=self.tokenizer.sep_token,
+                    sep_token_extra=False,
+                    pad_on_left=bool(self.config.model_type == "xlnet"),
+                    pad_token=self.tokenizer.pad_token_id,
+                    pad_token_segment_id=self.tokenizer.pad_token_type_id,
+                    pad_token_label_id=self.pad_token_label_id,
+                )
+                logger.info("Saving features into cached file %s", cached_features_file)
+                torch.save(features, cached_features_file)
+
+    def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
+        """
+        Load datasets. Called after prepare data.
+
+        Args:
+            mode: Split name used to locate the cached features file.
+            batch_size: Batch size of the returned loader.
+            shuffle: Whether the loader shuffles the examples (the base class asks for it on the train split).
+
+        Returns:
+            A `DataLoader` over (input_ids, attention_mask, token_type_ids, label_ids) tensors.
+        """
+        cached_features_file = self._feature_file(mode)
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file, weights_only=True)
+        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        if features[0].token_type_ids is not None:
+            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        else:
+            all_token_type_ids = torch.tensor([0 for f in features], dtype=torch.long)
+            # HACK(we will not use this anymore soon)
+        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
+        return DataLoader(
+            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids),
+            batch_size=batch_size,
+            # Honor the caller's request; without this the training split is never shuffled.
+            shuffle=shuffle,
+        )
+
+    def validation_step(self, batch, batch_nb):
+        """Compute validation"""
+        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
+        if self.config.model_type != "distilbert":
+            inputs["token_type_ids"] = (
+                batch[2] if self.config.model_type in ["bert", "xlnet"] else None
+            )  # XLM and RoBERTa don't use token_type_ids
+        outputs = self(**inputs)
+        tmp_eval_loss, logits = outputs[:2]
+        preds = logits.detach().cpu().numpy()
+        out_label_ids = inputs["labels"].detach().cpu().numpy()
+        return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
+
+    def _eval_end(self, outputs):
+        "Evaluation called for both Val and Test"
+        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
+        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
+        # Pick the highest-scoring label along the last (third) axis of the logits.
+        preds = np.argmax(preds, axis=2)
+        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
+
+        label_map = dict(enumerate(self.labels))
+        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
+        preds_list = [[] for _ in range(out_label_ids.shape[0])]
+
+        # Convert ids back to label strings, skipping positions labelled with the padding id.
+        for i in range(out_label_ids.shape[0]):
+            for j in range(out_label_ids.shape[1]):
+                if out_label_ids[i, j] != self.pad_token_label_id:
+                    out_label_list[i].append(label_map[out_label_ids[i][j]])
+                    preds_list[i].append(label_map[preds[i][j]])
+
+        results = {
+            "val_loss": val_loss_mean,
+            "accuracy_score": accuracy_score(out_label_list, preds_list),
+            "precision": precision_score(out_label_list, preds_list),
+            "recall": recall_score(out_label_list, preds_list),
+            "f1": f1_score(out_label_list, preds_list),
+        }
+
+        ret = dict(results.items())
+        ret["log"] = results
+        return ret, preds_list, out_label_list
+
+    def validation_epoch_end(self, outputs):
+        """Return the validation loss together with the metrics for logging and the progress bar."""
+        # when stable
+        ret, preds, targets = self._eval_end(outputs)
+        logs = ret["log"]
+        return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
+
+    def test_epoch_end(self, outputs):
+        """Return the test loss (as `avg_test_loss`) together with the metrics for logging and the progress bar."""
+        # updating to test_epoch_end instead of deprecated test_end
+        ret, predictions, targets = self._eval_end(outputs)
+
+        # Converting to the dict required by pl
+        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master/\
+        # pytorch_lightning/trainer/logging.py#L139
+        logs = ret["log"]
+        # `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
+        return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
+
+    @staticmethod
+    def add_model_specific_args(parser, root_dir):
+        """Register the base model options plus the NER-specific ones on `parser` and return it."""
+        # Add NER specific options
+        BaseTransformer.add_model_specific_args(parser, root_dir)
+        parser.add_argument(
+            "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
+        )
+        parser.add_argument(
+            "--max_seq_length",
+            default=128,
+            type=int,
+            help=(
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            ),
+        )
+
+        parser.add_argument(
+            "--labels",
+            default="",
+            type=str,
+            help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
+        )
+        parser.add_argument(
+            "--gpus",
+            default=0,
+            type=int,
+            help="The number of GPUs allocated for this, it is by default 0 meaning none",
+        )
+
+        parser.add_argument(
+            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+        )
+
+        return parser
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the arguments, train an NERTransformer and optionally evaluate it.
+    parser = argparse.ArgumentParser()
+    add_generic_args(parser, os.getcwd())
+    parser = NERTransformer.add_model_specific_args(parser, os.getcwd())
+    args = parser.parse_args()
+    model = NERTransformer(args)
+    trainer = generic_train(model, args)
+
+    if args.do_predict:
+        # See https://github.com/huggingface/transformers/issues/3159
+        # pl use this default format to create a checkpoint:
+        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
+        # /pytorch_lightning/callbacks/model_checkpoint.py#L322
+        checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
+        if not checkpoints:
+            # Report the missing checkpoint explicitly instead of failing with an IndexError below.
+            raise FileNotFoundError(
+                f"No 'checkpoint-epoch=*.ckpt' file found in {args.output_dir}; run with --do_train first."
+            )
+        # The lexicographically last checkpoint path is the one evaluated.
+        model = model.load_from_checkpoint(checkpoints[-1])
+        trainer.test(model)
--- a/examples/legacy/pytorch-lightning/run_ner.sh
+++ b/examples/legacy/pytorch-lightning/run_ner.sh
@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+
+# for seqeval metrics import
+pip install -r ../requirements.txt
+
+## The relevant files are currently on a shared Google
+## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
+## Monitor for changes and eventually migrate to use the `datasets` library
+curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
+curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
+curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
+
+export MAX_LENGTH=128
+export BERT_MODEL=bert-base-multilingual-cased
+python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
+python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
+python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
+cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SEED=1
+
+export OUTPUT_DIR_NAME=germeval-model
+export CURRENT_DIR=${PWD}
+export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
+mkdir -p $OUTPUT_DIR
+
+# Add parent directory to python path to access lightning_base.py
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+python3 run_ner.py --data_dir ./ \
+--labels ./labels.txt \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--train_batch_size $BATCH_SIZE \
+--seed $SEED \
+--gpus 1 \
+--do_train \
+--do_predict
--- a/examples/legacy/pytorch-lightning/run_pos.sh
+++ b/examples/legacy/pytorch-lightning/run_pos.sh
@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+if ! [ -f ./dev.txt ]; then
+  echo "Download dev dataset...."
+  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
+fi
+
+if ! [ -f ./test.txt ]; then
+  echo "Download test dataset...."
+  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
+fi
+
+if ! [ -f ./train.txt ]; then
+  echo "Download train dataset...."
+  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
+fi
+
+export MAX_LENGTH=200
+export BERT_MODEL=bert-base-uncased
+export OUTPUT_DIR=postagger-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+
+
+# Add parent directory to python path to access lightning_base.py
+export PYTHONPATH="../":"${PYTHONPATH}"
+
+python3 run_ner.py --data_dir ./ \
+--task_type POS \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--train_batch_size $BATCH_SIZE \
+--seed $SEED \
+--gpus 1 \
+--do_train \
+--do_predict
--- a/examples/legacy/question-answering/README.md
+++ b/examples/legacy/question-answering/README.md
@ -0,0 +1,126 @@
+#### Fine-tuning BERT on SQuAD1.0 with relative position embeddings
+
+The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model 
+`google-bert/bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained 
+models which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as in the BERT model 
+training, but with different relative position embeddings. 
+
+* `zhiheng-huang/bert-base-uncased-embedding-relative-key`, trained from scratch with relative embedding proposed by 
+Shaw et al., [Self-Attention with Relative Position Representations](https://huggingface.co/papers/1803.02155)
+* `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4 
+in Huang et al. [Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
+* `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from model 
+`google-bert/bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al. 
+[Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
+
+
+##### Base models fine-tuning
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+    --model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
+    --dataset_name squad \
+    --do_train \
+    --do_eval \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 512 \
+    --doc_stride 128 \
+    --output_dir relative_squad \
+    --per_device_eval_batch_size=60 \
+    --per_device_train_batch_size=6
+```
+Training with the above command leads to the following results, improving the F1 score from 88.52 (default BERT) to 90.54.
+
+```bash
+'exact': 83.6802270577105, 'f1': 90.54772098174814
+```
+
+Changing `max_seq_length` from 512 to 384 in the above command leads to an F1 score of 90.34. Replacing the
+model `zhiheng-huang/bert-base-uncased-embedding-relative-key-query` with
+`zhiheng-huang/bert-base-uncased-embedding-relative-key` leads to an F1 score of 89.51. Training on a single GPU
+instead of 8 GPUs leads to an F1 score of 90.71.
+
+##### Large models fine-tuning
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+    --model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
+    --dataset_name squad \
+    --do_train \
+    --do_eval \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 512 \
+    --doc_stride 128 \
+    --output_dir relative_squad \
+    --per_gpu_eval_batch_size=6 \
+    --per_gpu_train_batch_size=2 \
+    --gradient_accumulation_steps 3
+```
+Training with the above command leads to an F1 score of 93.52, which is slightly better than the F1 score of 93.15 for
+`google-bert/bert-large-uncased-whole-word-masking`.
+
+#### Distributed training
+
+Here is an example using distributed training on 8 V100 GPUs and the BERT Whole Word Masking uncased model to reach an F1 score above 93 on SQuAD1.1:
+
+```bash
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
+    --dataset_name squad \
+    --do_train \
+    --do_eval \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
+    --per_device_eval_batch_size=3   \
+    --per_device_train_batch_size=3
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.15
+exact_match = 86.91
+```
+
+This fine-tuned model is available as a checkpoint under the reference
+[`google-bert/bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
+
+## Results
+
+A larger batch size may improve performance at the cost of more memory.
+
+##### Results for SQuAD1.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 85.45884578997162,
+"f1": 92.5974600601065,
+"total": 10570,
+"HasAns_exact": 85.45884578997162,
+"HasAns_f1": 92.59746006010651,
+"HasAns_total": 10570
+}
+```
+
+##### Results for SQuAD2.0 with the previously defined hyper-parameters:
+
+```python
+{
+"exact": 80.4177545691906,
+"f1": 84.07154997729623,
+"total": 11873,
+"HasAns_exact": 76.73751686909581,
+"HasAns_f1": 84.05558584352873,
+"HasAns_total": 5928,
+"NoAns_exact": 84.0874684608915,
+"NoAns_f1": 84.0874684608915,
+"NoAns_total": 5945
+}
+```
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@ -0,0 +1,824 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
+
+import argparse
+import glob
+import logging
+import os
+import random
+import timeit
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+import transformers
+from transformers import (
+    MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+    WEIGHTS_NAME,
+    AutoConfig,
+    AutoModelForQuestionAnswering,
+    AutoTokenizer,
+    get_linear_schedule_with_warmup,
+    squad_convert_examples_to_features,
+)
+from transformers.data.metrics.squad_metrics import (
+    compute_predictions_log_probs,
+    compute_predictions_logits,
+    squad_evaluate,
+)
+from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
+from transformers.trainer_utils import is_main_process
+
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+logger = logging.getLogger(__name__)
+
+MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def to_list(tensor):
+    return tensor.tolist()
+
+
+def train(args, train_dataset, model, tokenizer):
+    """Train the model"""
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+
+    # Check if saved optimizer or scheduler states exist
+    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
+        os.path.join(args.model_name_or_path, "scheduler.pt")
+    ):
+        # Load in optimizer and scheduler states
+        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"), weights_only=True))
+        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"), weights_only=True))
+
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 1
+    epochs_trained = 0
+    steps_trained_in_current_epoch = 0
+    # Check if continuing training from a checkpoint
+    if os.path.exists(args.model_name_or_path):
+        try:
+            # set global_step to global_step of last saved checkpoint from model path
+            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
+            global_step = int(checkpoint_suffix)
+            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
+            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
+
+            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
+            logger.info("  Continuing training from epoch %d", epochs_trained)
+            logger.info("  Continuing training from global step %d", global_step)
+            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
+        except ValueError:
+            logger.info("  Starting fine-tuning.")
+
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(
+        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
+    )
+    # Added here for reproducibility
+    set_seed(args)
+
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            # Skip past any already trained steps if resuming training
+            if steps_trained_in_current_epoch > 0:
+                steps_trained_in_current_epoch -= 1
+                continue
+
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": batch[2],
+                "start_positions": batch[3],
+                "end_positions": batch[4],
+            }
+
+            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
+                del inputs["token_type_ids"]
+
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
+                if args.version_2_with_negative:
+                    inputs.update({"is_impossible": batch[7]})
+                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
+                    inputs.update(
+                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
+                    )
+
+            outputs = model(**inputs)
+            # model outputs are always tuple in transformers (see doc)
+            loss = outputs[0]
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                # Log metrics
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Only evaluate when single GPU otherwise metrics may not average well
+                    if args.local_rank == -1 and args.evaluate_during_training:
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar(f"eval_{key}", value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                # Save model checkpoint
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                    # Take care of distributed/parallel training
+                    model_to_save = model.module if hasattr(model, "module") else model
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_pretrained(output_dir)
+
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
+                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
+                    logger.info("Saving optimizer and scheduler states to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # multi-gpu evaluate
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
+        model = torch.nn.DataParallel(model)
+
+    # Eval!
+    logger.info(f"***** Running evaluation {prefix} *****")
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+
+    all_results = []
+    start_time = timeit.default_timer()
+
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                "token_type_ids": batch[2],
+            }
+
+            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
+                del inputs["token_type_ids"]
+
+            feature_indices = batch[3]
+
+            # XLNet and XLM use more arguments for their predictions
+            if args.model_type in ["xlnet", "xlm"]:
+                inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
+                # for lang_id-sensitive xlm models
+                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
+                    inputs.update(
+                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
+                    )
+            outputs = model(**inputs)
+
+        for i, feature_index in enumerate(feature_indices):
+            eval_feature = features[feature_index.item()]
+            unique_id = int(eval_feature.unique_id)
+
+            output = [to_list(output[i]) for output in outputs.to_tuple()]
+
+            # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
+            # models only use two.
+            if len(output) >= 5:
+                start_logits = output[0]
+                start_top_index = output[1]
+                end_logits = output[2]
+                end_top_index = output[3]
+                cls_logits = output[4]
+
+                result = SquadResult(
+                    unique_id,
+                    start_logits,
+                    end_logits,
+                    start_top_index=start_top_index,
+                    end_top_index=end_top_index,
+                    cls_logits=cls_logits,
+                )
+
+            else:
+                start_logits, end_logits = output
+                result = SquadResult(unique_id, start_logits, end_logits)
+
+            all_results.append(result)
+
+    evalTime = timeit.default_timer() - start_time
+    logger.info("  Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json")
+    output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json")
+
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, f"null_odds_{prefix}.json")
+    else:
+        output_null_log_odds_file = None
+
+    # XLNet and XLM use a more complex post-processing procedure
+    if args.model_type in ["xlnet", "xlm"]:
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
+
+        predictions = compute_predictions_log_probs(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            start_n_top,
+            end_n_top,
+            args.version_2_with_negative,
+            tokenizer,
+            args.verbose_logging,
+        )
+    else:
+        predictions = compute_predictions_logits(
+            examples,
+            features,
+            all_results,
+            args.n_best_size,
+            args.max_answer_length,
+            args.do_lower_case,
+            output_prediction_file,
+            output_nbest_file,
+            output_null_log_odds_file,
+            args.verbose_logging,
+            args.version_2_with_negative,
+            args.null_score_diff_threshold,
+            tokenizer,
+        )
+
+    # Compute the F1 and exact scores.
+    results = squad_evaluate(examples, predictions)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    """Return the SQuAD dataset for training or evaluation, using an on-disk feature cache.
+
+    The cache file lives in ``args.data_dir`` (or the current directory) and is named after the
+    split, the last path component of ``args.model_name_or_path`` and ``args.max_seq_length``.
+    When it is missing, or ``args.overwrite_cache`` is set, the examples are read either through
+    ``tensorflow_datasets`` or through the SQuAD processors and converted to features.
+
+    Args:
+        args: Parsed command-line namespace.
+        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
+        evaluate: Selects the dev split (and ``args.predict_file``) instead of the train split.
+        output_examples: When true, the examples and features are returned as well.
+
+    Returns:
+        ``dataset``, or the tuple ``(dataset, examples, features)`` when ``output_examples`` is true.
+
+    Raises:
+        ImportError: If the ``tensorflow_datasets`` path is taken and the package is not installed.
+    """
+    if args.local_rank not in [-1, 0] and not evaluate:
+        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+        torch.distributed.barrier()
+
+    # Load data features from cache or dataset file
+    input_dir = args.data_dir if args.data_dir else "."
+    cached_features_file = os.path.join(
+        input_dir,
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            # Last non-empty path component of the model name or path.
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
+
+    # Init features and dataset from cache if it exists
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        # NOTE(review): the cache written below stores `features`, `dataset` and `examples` objects,
+        # not only tensors; `weights_only=True` may refuse to unpickle them - TODO confirm.
+        features_and_dataset = torch.load(cached_features_file, weights_only=True)
+        features, dataset, examples = (
+            features_and_dataset["features"],
+            features_and_dataset["dataset"],
+            features_and_dataset["examples"],
+        )
+    else:
+        logger.info("Creating features from dataset file at %s", input_dir)
+
+        # Without a data dir and without the file for the requested split, fall back to tensorflow_datasets.
+        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
+            try:
+                import tensorflow_datasets as tfds
+            except ImportError:
+                raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
+
+            if args.version_2_with_negative:
+                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")
+
+            tfds_examples = tfds.load("squad")
+            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+        else:
+            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
+            if evaluate:
+                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
+            else:
+                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
+
+        features, dataset = squad_convert_examples_to_features(
+            examples=examples,
+            tokenizer=tokenizer,
+            max_seq_length=args.max_seq_length,
+            doc_stride=args.doc_stride,
+            max_query_length=args.max_query_length,
+            is_training=not evaluate,
+            return_dataset="pt",
+            threads=args.threads,
+        )
+
+        # Only the main process (local_rank -1 or 0) writes the cache file.
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+        torch.distributed.barrier()
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--model_type",
+        default=None,
+        type=str,
+        required=True,
+        help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        help="The input data dir. Should contain the .json files for the task."
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--train_file",
+        default=None,
+        type=str,
+        help="The input training file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        help="The input evaluation file. If a data dir is specified, will look for the file there"
+        + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
+    )
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--cache_dir",
+        default="",
+        type=str,
+        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
+    )
+
+    parser.add_argument(
+        "--version_2_with_negative",
+        action="store_true",
+        help="If true, the SQuAD examples contain some that do not have an answer.",
+    )
+    parser.add_argument(
+        "--null_score_diff_threshold",
+        type=float,
+        default=0.0,
+        help="If null_score - best_non_null is greater than the threshold predict null.",
+    )
+
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help=(
+            "The maximum total input sequence length after WordPiece tokenization. Sequences "
+            "longer than this will be truncated, and sequences shorter than this will be padded."
+        ),
+    )
+    parser.add_argument(
+        "--doc_stride",
+        default=128,
+        type=int,
+        help="When splitting up a long document into chunks, how much stride to take between chunks.",
+    )
+    parser.add_argument(
+        "--max_query_length",
+        default=64,
+        type=int,
+        help=(
+            "The maximum number of tokens for the question. Questions longer than this will "
+            "be truncated to this length."
+        ),
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+    parser.add_argument(
+        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument(
+        "--n_best_size",
+        default=20,
+        type=int,
+        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
+    )
+    parser.add_argument(
+        "--max_answer_length",
+        default=30,
+        type=int,
+        help=(
+            "The maximum length of an answer that can be generated. This is needed because the start "
+            "and end predictions are not conditioned on one another."
+        ),
+    )
+    parser.add_argument(
+        "--verbose_logging",
+        action="store_true",
+        help=(
+            "If true, all of the warnings related to data processing will be printed. "
+            "A number of warnings are expected for a normal SQuAD evaluation."
+        ),
+    )
+    parser.add_argument(
+        "--lang_id",
+        default=0,
+        type=int,
+        help=(
+            "language id of input for language-specific xlm models (see"
+            " tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
+        ),
+    )
+
+    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help=(
+            "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
+            "See details at https://nvidia.github.io/apex/amp.html"
+        ),
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+
+    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
+    args = parser.parse_args()
+
+    if args.doc_stride >= args.max_seq_length - args.max_query_length:
+        logger.warning(
+            "WARNING - You've set a doc stride which may be superior to the document length in some "
+            "examples. This could result in errors when building features from the examples. Please reduce the doc "
+            "stride or increase the maximum length to ensure the features are correctly built."
+        )
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()
+
+    args.model_type = args.model_type.lower()
+    config = AutoConfig.from_pretrained(
+        args.config_name if args.config_name else args.model_name_or_path,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+        do_lower_case=args.do_lower_case,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+    )
+    model = AutoModelForQuestionAnswering.from_pretrained(
+        args.model_name_or_path,
+        from_tf=bool(".ckpt" in args.model_name_or_path),
+        config=config,
+        cache_dir=args.cache_dir if args.cache_dir else None,
+    )
+
+    if args.local_rank == 0:
+        # Make sure only the first process in distributed training will download model & vocab
+        torch.distributed.barrier()
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
+    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
+    # remove the need for this code, but it is still valid.
+    if args.fp16:
+        try:
+            import apex
+
+            apex.amp.register_half_function(torch, "einsum")
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Save the trained model and the tokenizer
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        # Take care of distributed/parallel training
+        model_to_save = model.module if hasattr(model, "module") else model
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir)  # , force_download=True)
+
+        # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+        # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
+        model.to(args.device)
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        if args.do_train:
+            logger.info("Loading checkpoints saved during training for evaluation")
+            checkpoints = [args.output_dir]
+            if args.eval_all_checkpoints:
+                checkpoints = [
+                    os.path.dirname(c)
+                    for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+                ]
+
+        else:
+            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
+            checkpoints = [args.model_name_or_path]
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)  # , force_download=True)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
+            results.update(result)
+
+    logger.info(f"Results: {results}")
+
+    return results
+
+
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/question-answering/run_squad_trainer.py
+++ b/examples/legacy/question-answering/run_squad_trainer.py
@ -0,0 +1,174 @@
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fine-tuning the library models for question-answering."""
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForQuestionAnswering,
+    AutoTokenizer,
+    DataCollatorWithPadding,
+    HfArgumentParser,
+    SquadDataset,
+    Trainer,
+    TrainingArguments,
+)
+from transformers import SquadDataTrainingArguments as DataTrainingArguments
+from transformers.trainer_utils import is_main_process
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+
+    This dataclass is handed to `HfArgumentParser` in `main()`; each field stores its help text under
+    `metadata["help"]`.
+    """
+
+    # No default is given, so a value must always be supplied.
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    # When left as None, main() falls back to `model_name_or_path` for the config.
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    # When left as None, main() falls back to `model_name_or_path` for the tokenizer.
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    # NOTE(review): main() in this file never reads this field; it always passes `use_fast=False` when
+    # loading the tokenizer.
+    use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
+    # If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
+    # or just modify its tokenizer_config.json.
+    # Forwarded as `cache_dir` to the config, tokenizer, model and dataset constructors in main().
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+
+
+def main():
+    """Fine-tune a question-answering model on SQuAD with `Trainer`.
+
+    Arguments come from a single JSON file (when the script's only argument ends in `.json`) or from the
+    command line, and are split into `ModelArguments`, `DataTrainingArguments` and `TrainingArguments`.
+
+    The training dataset is built only when `do_train` is set and the evaluation dataset only when `do_eval`
+    is set. This function trains and saves the model; it makes no explicit evaluation call of its own.
+    After training, the tokenizer is saved next to the model by the world process zero.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_process_index,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.parallel_mode.value == "distributed"),
+        training_args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_process_index):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Prepare Question-Answering task
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+    )
+    model = AutoModelForQuestionAnswering.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets
+    is_language_sensitive = hasattr(model.config, "lang2id")
+    train_dataset = (
+        SquadDataset(
+            data_args, tokenizer=tokenizer, is_language_sensitive=is_language_sensitive, cache_dir=model_args.cache_dir
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        SquadDataset(
+            data_args,
+            tokenizer=tokenizer,
+            mode="dev",
+            is_language_sensitive=is_language_sensitive,
+            cache_dir=model_args.cache_dir,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    # Data collator: only overridden for fp16 runs, where batches are padded to a multiple of 8.
+    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=data_collator,
+    )
+
+    # Training
+    if training_args.do_train:
+        # `resume_from_checkpoint` is the supported keyword; `model_path` is its deprecated alias.
+        # A checkpoint is only passed when `model_name_or_path` is a local directory.
+        trainer.train(
+            resume_from_checkpoint=(
+                model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+            )
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        # `is_world_process_zero()` is the supported replacement for the deprecated `is_world_master()`.
+        if trainer.is_world_process_zero():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+
+def _mp_fn(index):
+    """Entry point for xla_spawn (TPUs): runs `main()`; `index` is accepted but not used."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+# Run main() only when this file is executed as a script, not when it is imported.
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/run_camembert.py
+++ b/examples/legacy/run_camembert.py
@ -0,0 +1,47 @@
+#!/usr/bin/env python
+import torch
+
+from transformers import CamembertForMaskedLM, CamembertTokenizer
+
+
+def fill_mask(masked_input, model, tokenizer, topk=5):
+    """Return the `topk` most likely completions for the single mask token in `masked_input`.
+
+    Args:
+        masked_input: Text containing exactly one occurrence of `tokenizer.mask_token`.
+        model: Callable whose first output, indexed as `[0, position, :]`, holds the per-token vocabulary scores.
+        tokenizer: Object providing `encode`, `convert_ids_to_tokens`, `mask_token` and `mask_token_id`.
+        topk: Number of candidates to return.
+
+    Returns:
+        A list of `(filled_text, probability, predicted_token)` tuples, ordered from most to least likely.
+
+    Raises:
+        AssertionError: If `masked_input` does not contain exactly one mask token.
+    """
+    # Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
+    masked_token = tokenizer.mask_token
+    assert masked_input.count(masked_token) == 1, f"Expected exactly one {masked_token} in the input"
+    input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+    # Inference only, so skip autograd bookkeeping.
+    with torch.no_grad():
+        logits = model(input_ids)[0]  # The prediction scores are the first element of the output tuple
+    masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
+    prob = logits[0, masked_index, :].softmax(dim=0)
+    values, indices = prob.topk(k=topk, dim=0)
+
+    # When the mask is preceded by a space, that space is replaced together with the mask, because the
+    # predicted token brings its own leading space once "\u2581" has been converted.
+    target = f" {masked_token}" if f" {masked_token}" in masked_input else masked_token
+    topk_filled_outputs = []
+    # Pair each score with its token id directly. Joining the tokens with " " and splitting again would
+    # misalign scores and tokens as soon as one token contains a space.
+    for score, token_id in zip(values.tolist(), indices.tolist()):
+        predicted_token = tokenizer.convert_ids_to_tokens(token_id).replace("\u2581", " ")
+        topk_filled_outputs.append((masked_input.replace(target, predicted_token), score, predicted_token))
+    return topk_filled_outputs
+
+
+# Load the tokenizer and the masked-LM model from the "almanach/camembert-base" checkpoint.
+tokenizer = CamembertTokenizer.from_pretrained("almanach/camembert-base")
+model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")
+# Put the model in evaluation mode before running the demo prediction.
+model.eval()
+
+# Demo: print the three best completions for a sentence containing exactly one <mask> placeholder.
+masked_input = "Le camembert est <mask> :)"
+print(fill_mask(masked_input, model, tokenizer, topk=3))
--- a/examples/legacy/run_chinese_ref.py
+++ b/examples/legacy/run_chinese_ref.py
@ -0,0 +1,147 @@
+#!/usr/bin/env python
+import argparse
+import json
+
+from ltp import LTP
+
+from transformers import BertTokenizer
+
+
+def _is_chinese_char(cp):
+    """Return True when the integer codepoint `cp` falls in one of the CJK ideograph ranges below."""
+    # A "chinese character" here means anything in the CJK Unicode block:
+    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+    #
+    # Despite its name, that block does NOT hold all Japanese and Korean characters: modern Korean Hangul,
+    # Japanese Hiragana and Katakana live in other blocks. Those alphabets write space-separated words, so
+    # they get no special treatment and are handled like every other language.
+    cjk_ranges = (
+        (0x4E00, 0x9FFF),
+        (0x3400, 0x4DBF),
+        (0x20000, 0x2A6DF),
+        (0x2A700, 0x2B73F),
+        (0x2B740, 0x2B81F),
+        (0x2B820, 0x2CEAF),
+        (0xF900, 0xFAFF),
+        (0x2F800, 0x2FA1F),
+    )
+    return any(low <= cp <= high for low, high in cjk_ranges)
+
+
+def is_chinese(word: str):
+    """Return 1 when every character of `word` is a CJK character, else 0 (an empty word yields 1)."""
+    # Typical inputs look like '180', '身高' or '神'.
+    return int(all(_is_chinese_char(ord(ch)) for ch in word))
+
+
+def get_chinese_word(tokens: list[str]):
+    """Return the distinct tokens that are longer than one character and made only of CJK characters."""
+    # Deduplicate through a set first, then hand the result back as a list.
+    return list({token for token in tokens if len(token) > 1 and is_chinese(token)})
+
+
+def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set[str]):
+    """Prefix with "##" every non-initial token of a run of tokens that spells a known Chinese word.
+
+    Scanning left to right, the longest run of at least two tokens starting at the current position whose
+    concatenation is in `chinese_word_set` is treated as one whole word.
+
+    Args:
+        bert_tokens: Tokens of one sentence. The list is modified in place.
+        chinese_word_set: Known multi-character words. Any collection of strings works; it is copied into a
+            set internally.
+
+    Returns:
+        The same `bert_tokens` list object, after modification.
+    """
+    if not chinese_word_set:
+        return bert_tokens
+    # The caller in this file passes a list; a set keeps the membership test below O(1).
+    known_words = set(chinese_word_set)
+    max_word_len = max(len(w) for w in known_words)
+
+    bert_word = bert_tokens
+    start, end = 0, len(bert_word)
+    while start < end:
+        single_word = True
+        if is_chinese(bert_word[start]):
+            longest = min(end - start, max_word_len)
+            # Try the longest candidate first; a run of length 1 is never a whole word.
+            for i in range(longest, 1, -1):
+                whole_word = "".join(bert_word[start : start + i])
+                if whole_word in known_words:
+                    for j in range(start + 1, start + i):
+                        bert_word[j] = "##" + bert_word[j]
+                    start = start + i
+                    single_word = False
+                    break
+        if single_word:
+            start += 1
+    return bert_word
+
+
+def prepare_ref(lines: list[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
+    """Compute, for every line, the positions of Chinese sub-word tokens that continue a whole word.
+
+    Lines are processed in batches of 100, first through `ltp_tokenizer.seg` and then through
+    `bert_tokenizer`. The first element returned by `seg` is iterated as one token list per line
+    (presumably the segmentation result; verify against the LTP API).
+
+    Returns:
+        One list per line holding the indices of tokens that start with "##" and whose remainder is a single
+        CJK character.
+    """
+    batch_size = 100
+    batch_starts = range(0, len(lines), batch_size)
+
+    # Whole words found by LTP, one list per line.
+    ltp_res = []
+    for begin in batch_starts:
+        segmented = ltp_tokenizer.seg(lines[begin : begin + batch_size])[0]
+        ltp_res.extend([get_chinese_word(sentence) for sentence in segmented])
+    assert len(ltp_res) == len(lines)
+
+    # BERT input ids, one list per line.
+    bert_res = []
+    for begin in batch_starts:
+        encoded = bert_tokenizer(
+            lines[begin : begin + batch_size], add_special_tokens=True, truncation=True, max_length=512
+        )
+        bert_res.extend(encoded["input_ids"])
+    assert len(bert_res) == len(lines)
+
+    ref_ids = []
+    for input_ids, chinese_word in zip(bert_res, ltp_res):
+        input_tokens = [bert_tokenizer._convert_id_to_token(token_id) for token_id in input_ids]
+        input_tokens = add_sub_symbol(input_tokens, chinese_word)
+        # Only positions of Chinese sub-words starting with "##" are kept, i.e. tokens that are part of a
+        # whole word.
+        ref_ids.append(
+            [
+                position
+                for position, token in enumerate(input_tokens)
+                if token[:2] == "##" and len(token[2:]) == 1 and _is_chinese_char(ord(token[2:]))
+            ]
+        )
+
+    assert len(ref_ids) == len(bert_res)
+
+    return ref_ids
+
+
+def main(args):
+    """Read `args.file_name`, compute the whole-word reference ids and write them to `args.save_path`.
+
+    The output file holds one JSON-encoded list per input line that was kept.
+    """
+    # For Chinese (Ro)Bert, the best result is from : RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
+    # If we want to fine-tune these model, we have to use same tokenizer : LTP (https://github.com/HIT-SCIR/ltp)
+    with open(args.file_name, encoding="utf-8") as source:
+        raw_lines = source.readlines()
+    # Drop empty and whitespace-only lines to avoid delimiters like '\u2029'.
+    sentences = [raw.strip() for raw in raw_lines if len(raw) > 0 and not raw.isspace()]
+    ltp_tokenizer = LTP(args.ltp)  # faster in GPU device
+    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
+
+    ref_ids = prepare_ref(sentences, ltp_tokenizer, bert_tokenizer)
+
+    with open(args.save_path, "w", encoding="utf-8") as target:
+        serialized = [json.dumps(ref) + "\n" for ref in ref_ids]
+        target.writelines(serialized)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="prepare_chinese_ref")
+    # Every option is a plain string with a default: (flag, default value, help text).
+    string_options = (
+        ("--file_name", "./resources/chinese-demo.txt", "file need process, same as training data in lm"),
+        ("--ltp", "./resources/ltp", "resources for LTP tokenizer, usually a path"),
+        ("--bert", "./resources/robert", "resources for Bert tokenizer"),
+        ("--save_path", "./resources/ref.txt", "path to save res"),
+    )
+    for flag, default_value, help_text in string_options:
+        parser.add_argument(flag, type=str, default=default_value, help=help_text)
+
+    args = parser.parse_args()
+    main(args)
--- a/examples/legacy/run_language_modeling.py
+++ b/examples/legacy/run_language_modeling.py
@ -0,0 +1,363 @@
+#!/usr/bin/env python
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, CTRL, BERT, RoBERTa, XLNet).
+GPT, GPT-2 and CTRL are fine-tuned using a causal language modeling (CLM) loss. BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss. XLNet is fine-tuned using a permutation language modeling (PLM) loss.
+"""
+
+import logging
+import math
+import os
+from dataclasses import dataclass, field
+from glob import glob
+from typing import Optional
+
+from torch.utils.data import ConcatDataset
+
+import transformers
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_WITH_LM_HEAD_MAPPING,
+    AutoConfig,
+    AutoModelWithLMHead,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    DataCollatorForPermutationLanguageModeling,
+    DataCollatorForWholeWordMask,
+    HfArgumentParser,
+    LineByLineTextDataset,
+    LineByLineWithRefDataset,
+    PreTrainedTokenizer,
+    TextDataset,
+    Trainer,
+    TrainingArguments,
+    set_seed,
+)
+from transformers.trainer_utils import is_main_process
+
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+
+    This dataclass is handed to `HfArgumentParser` in `main()`; each field stores its help text under
+    `metadata["help"]`. Every field defaults to None.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The model checkpoint for weights initialization. Leave None if you want to train a model from"
+                " scratch."
+            )
+        },
+    )
+    # The help text lists the model types collected in the module-level MODEL_TYPES tuple.
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Each field stores its command-line help text under `metadata["help"]`. Every field has a default, so the
+    class can be instantiated without arguments.
+    """
+
+    # --- Input files -------------------------------------------------------------------------------------
+    train_data_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a text file)."}
+    )
+    # A glob pattern; when set, `get_dataset` uses it instead of `train_data_file`.
+    train_data_files: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": (
+                "The input training data files (multiple files in glob format). "
+                "Very often splitting large files to smaller files can prevent tokenizer going out of memory"
+            )
+        },
+    )
+    eval_data_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word mask in Chinese."},
+    )
+    eval_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input eval ref data file for whole word mask in Chinese."},
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+
+    # --- Training objective ------------------------------------------------------------------------------
+    mlm: bool = field(
+        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
+    )
+    whole_word_mask: bool = field(default=False, metadata={"help": "Whether or not to use whole word mask."})
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+    plm_probability: float = field(
+        default=1 / 6,
+        metadata={
+            "help": (
+                "Ratio of length of a span of masked tokens to surrounding context length for permutation language"
+                " modeling."
+            )
+        },
+    )
+    max_span_length: int = field(
+        default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
+    )
+
+    # --- Tokenized data layout and caching ---------------------------------------------------------------
+    block_size: int = field(
+        default=-1,
+        metadata={
+            "help": (
+                "Optional input sequence length after tokenization. "
+                "The training dataset will be truncated in block of this size for training. "
+                "Default to the model max input length for single sentence inputs (take into account special tokens)."
+            )
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def get_dataset(
+    args: DataTrainingArguments,
+    tokenizer: PreTrainedTokenizer,
+    evaluate: bool = False,
+    cache_dir: Optional[str] = None,
+):
+    """Build the training or evaluation dataset described by `args`.
+
+    Args:
+        args: Data arguments selecting the input files and the dataset flavour.
+        tokenizer: Tokenizer handed to the dataset constructors.
+        evaluate: When True, build the dataset from `args.eval_data_file` / `args.eval_ref_file`.
+        cache_dir: Forwarded to `TextDataset` only; the line-by-line datasets do not receive it.
+
+    Returns:
+        The evaluation dataset when `evaluate` is True. Otherwise a `ConcatDataset` over every file matching
+        `args.train_data_files` when that pattern is set, else the dataset for `args.train_data_file`.
+
+    Raises:
+        ValueError: If a ref file is given in line-by-line mode without both `whole_word_mask` and `mlm`, or
+            if `args.train_data_files` matches no file.
+    """
+
+    def _dataset(file_path, ref_path=None):
+        # `ref_path` is only honoured in line-by-line mode; the `TextDataset` branch ignores it.
+        if args.line_by_line:
+            if ref_path is not None:
+                if not args.whole_word_mask or not args.mlm:
+                    raise ValueError("You need to set whole word masking and mlm to True for Chinese Whole Word Mask")
+                return LineByLineWithRefDataset(
+                    tokenizer=tokenizer,
+                    file_path=file_path,
+                    block_size=args.block_size,
+                    ref_path=ref_path,
+                )
+
+            return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
+        else:
+            return TextDataset(
+                tokenizer=tokenizer,
+                file_path=file_path,
+                block_size=args.block_size,
+                overwrite_cache=args.overwrite_cache,
+                cache_dir=cache_dir,
+            )
+
+    if evaluate:
+        return _dataset(args.eval_data_file, args.eval_ref_file)
+    elif args.train_data_files:
+        # glob() returns matches in arbitrary order; sort them so the concatenation order is reproducible.
+        train_files = sorted(glob(args.train_data_files))
+        if not train_files:
+            # Fail with a clear message instead of an assertion from inside ConcatDataset.
+            raise ValueError(f"No training file matches the pattern {args.train_data_files!r}.")
+        # Files matched by the pattern are loaded without a ref file.
+        return ConcatDataset([_dataset(f) for f in train_files])
+    else:
+        return _dataset(args.train_data_file, args.train_ref_file)
+
+
+def main():
+    """Parse the CLI arguments, build config/tokenizer/model, then train and/or evaluate a language model.
+
+    Returns:
+        dict: ``{"perplexity": ...}`` when ``--do_eval`` is set, otherwise an empty dict.
+
+    Raises:
+        ValueError: If ``--do_eval`` is given without ``--eval_data_file``, if neither a tokenizer name nor a
+            model path is given, or if a bert/roberta/distilbert/camembert model is used without ``--mlm``.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Fail early, before any model is loaded, when evaluation is requested without data.
+    if data_args.eval_data_file is None and training_args.do_eval:
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
+
+    # Setup logging
+    # INFO when local_process_index is -1 or 0, WARN for every other process.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_process_index,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.parallel_mode.value == "distributed"),
+        training_args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_process_index):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    # Config precedence: explicit --config_name, then the model path, then a fresh config for --model_type.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    # Tokenizer precedence: explicit --tokenizer_name, then the model path; there is no from-scratch fallback.
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another"
+            " script, save it,and load it from here, using --tokenizer_name"
+        )
+
+    if model_args.model_name_or_path:
+        # A path containing ".ckpt" is passed to from_pretrained with from_tf=True.
+        model = AutoModelWithLMHead.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelWithLMHead.from_config(config)
+
+    # Resize the embedding matrix to the tokenizer's length.
+    model.resize_token_embeddings(len(tokenizer))
+
+    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
+        raise ValueError(
+            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
+            "--mlm flag (masked language modeling)."
+        )
+
+    # NOTE(review): `tokenizer.max_len` looks like an older attribute name; confirm it exists in the installed
+    # transformers version.
+    if data_args.block_size <= 0:
+        data_args.block_size = tokenizer.max_len
+        # Our input block size will be the max possible for the model
+    else:
+        data_args.block_size = min(data_args.block_size, tokenizer.max_len)
+
+    # Get datasets
+
+    # Each dataset is only built when the matching --do_train / --do_eval flag is set.
+    train_dataset = (
+        get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
+    )
+    eval_dataset = (
+        get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
+        if training_args.do_eval
+        else None
+    )
+    # Collator choice: permutation LM for xlnet, whole-word masking when both flags are set, plain (M)LM otherwise.
+    if config.model_type == "xlnet":
+        data_collator = DataCollatorForPermutationLanguageModeling(
+            tokenizer=tokenizer,
+            plm_probability=data_args.plm_probability,
+            max_span_length=data_args.max_span_length,
+        )
+    else:
+        if data_args.mlm and data_args.whole_word_mask:
+            data_collator = DataCollatorForWholeWordMask(
+                tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
+            )
+        else:
+            data_collator = DataCollatorForLanguageModeling(
+                tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
+            )
+
+    # Initialize our Trainer
+    # NOTE(review): `prediction_loss_only` as a Trainer keyword, `train(model_path=...)` and `is_world_master()`
+    # below may be older API spellings; confirm against the installed transformers version.
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        prediction_loss_only=True,
+    )
+
+    # Training
+    if training_args.do_train:
+        # Only a local directory is forwarded as model_path; any other value becomes None.
+        model_path = (
+            model_args.model_name_or_path
+            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
+            else None
+        )
+        trainer.train(model_path=model_path)
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        eval_output = trainer.evaluate()
+
+        # Perplexity is the exponential of the evaluation loss.
+        perplexity = math.exp(eval_output["eval_loss"])
+        result = {"perplexity": perplexity}
+
+        # The results file is written only when `trainer.is_world_master()` is true.
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                logger.info("***** Eval results *****")
+                for key in sorted(result.keys()):
+                    logger.info("  %s = %s", key, str(result[key]))
+                    writer.write("{} = {}\n".format(key, str(result[key])))
+
+        results.update(result)
+
+    return results
+
+
+def _mp_fn(index):
+    """Per-process entry point: ignores ``index`` and runs ``main()``."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+# Run the script when executed directly (not when imported).
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@ -0,0 +1,319 @@
+#!/usr/bin/env python
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" OpenAI GPT model fine-tuning script.
+    Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
+    Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
+
+    This script with default values fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset:
+        python run_openai_gpt.py \
+          --model_name openai-community/openai-gpt \
+          --do_train \
+          --do_eval \
+          --train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
+          --eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
+          --output_dir ../log \
+          --train_batch_size 16 \
+"""
+
+import argparse
+import csv
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from tqdm import tqdm, trange
+
+from transformers import (
+    CONFIG_NAME,
+    WEIGHTS_NAME,
+    OpenAIGPTDoubleHeadsModel,
+    OpenAIGPTTokenizer,
+    get_linear_schedule_with_warmup,
+)
+
+
+# Configure logging once at import time: timestamped records at INFO level.
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
+# Module-level logger used by the rest of this script.
+logger = logging.getLogger(__name__)
+
+
+def accuracy(out, labels):
+    """Count the rows of ``out`` whose arg-max over axis 1 equals the matching entry of ``labels``.
+
+    The result is a number of correct predictions, not a ratio.
+    """
+    is_correct = np.argmax(out, axis=1) == labels
+    return np.sum(is_correct)
+
+
+def load_rocstories_dataset(dataset_path):
+    """Read a RocStories cloze CSV into a list of tuples (story, 1st continuation, 2nd continuation, label).
+
+    The first row is skipped as a header. For every other row, columns 1-4 are joined with single spaces to
+    form the story, columns 5 and 6 are the two continuations, and the last column minus one is the label.
+    A file with no rows at all yields an empty list.
+    """
+    # `newline=""` is what the csv module asks for, so newlines embedded in quoted fields are parsed correctly.
+    with open(dataset_path, encoding="utf_8", newline="") as csv_file:
+        reader = csv.reader(csv_file)
+        next(reader, None)  # skip the header row without raising on an empty file
+        return [(" ".join(row[1:5]), row[5], row[6], int(row[-1]) - 1) for row in tqdm(reader)]
+
+
+def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
+    """Turn datasets of (story, 1st continuation, 2nd continuation, label) tuples into tensors.
+
+    For each dataset, four tensors are produced, in this order:
+      - input_ids, shape (n_batch, 2, input_len): for each alternative,
+        [start_token] + story[:cap_length] + [delimiter_token] + cont[:cap_length] + [clf_token], zero-padded
+      - mc_token_ids, shape (n_batch, 2): index of the last token of each sequence (the clf_token position)
+      - lm_labels, shape (n_batch, 2, input_len): same tokens as input_ids, with -100 in the padded positions
+      - mc_labels, shape (n_batch,): the label of each example
+
+    Returns a list with one such 4-tuple of tensors per input dataset.
+    """
+    tensor_datasets = []
+    for dataset in encoded_datasets:
+        n_examples = len(dataset)
+        input_ids = np.zeros((n_examples, 2, input_len), dtype=np.int64)
+        mc_token_ids = np.zeros((n_examples, 2), dtype=np.int64)
+        lm_labels = np.full((n_examples, 2, input_len), fill_value=-100, dtype=np.int64)
+        mc_labels = np.zeros((n_examples,), dtype=np.int64)
+
+        for row, (story, cont1, cont2, mc_label) in enumerate(dataset):
+            # Both alternatives share the same prefix; only the continuation differs.
+            shared_prefix = [start_token] + story[:cap_length] + [delimiter_token]
+            sequences = [shared_prefix + cont[:cap_length] + [clf_token] for cont in (cont1, cont2)]
+            for alternative, sequence in enumerate(sequences):
+                seq_len = len(sequence)
+                input_ids[row, alternative, :seq_len] = sequence
+                mc_token_ids[row, alternative] = seq_len - 1
+                lm_labels[row, alternative, :seq_len] = sequence
+            mc_labels[row] = mc_label
+
+        arrays = (input_ids, mc_token_ids, lm_labels, mc_labels)
+        tensor_datasets.append(tuple(torch.tensor(array) for array in arrays))
+    return tensor_datasets
+
+
+def main():
+    """Fine-tune and/or evaluate the OpenAI GPT double-heads model on RocStories from the command line.
+
+    Parses the arguments, loads the tokenizer and model (adding three special tokens), encodes both CSV
+    datasets, optionally trains and saves the model to ``--output_dir``, and optionally evaluates and writes
+    ``eval_results.txt`` there. Both dataset files are loaded regardless of which of ``--do_train`` /
+    ``--do_eval`` is set.
+
+    Raises:
+        ValueError: If neither ``--do_train`` nor ``--do_eval`` is given.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_name", type=str, default="openai-community/openai-gpt", help="pretrained model name")
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+    parser.add_argument("--train_dataset", type=str, default="")
+    parser.add_argument("--eval_dataset", type=str, default="")
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--num_train_epochs", type=int, default=3)
+    parser.add_argument("--train_batch_size", type=int, default=8)
+    parser.add_argument("--eval_batch_size", type=int, default=16)
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", type=int, default=1)
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help=(
+            "If > 0: set total number of training                         steps to perform. Override num_train_epochs."
+        ),
+    )
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before                        performing a backward/update pass.",
+    )
+    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
+    parser.add_argument("--weight_decay", type=float, default=0.01)
+    parser.add_argument("--lm_coef", type=float, default=0.9)
+    parser.add_argument("--n_valid", type=int, default=374)
+
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    print(args)
+    # NOTE(review): --max_grad_norm, --lr_schedule and --n_valid are parsed but never read in this function.
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+    n_gpu = torch.cuda.device_count()
+    logger.info(f"device: {device}, n_gpu {n_gpu}")
+
+    if not args.do_train and not args.do_eval:
+        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
+
+    # `exist_ok=True` avoids the check-then-create race of a separate `os.path.exists` test.
+    os.makedirs(args.output_dir, exist_ok=True)
+
+    # Load tokenizer and model
+    # This loading functions also add new tokens and embeddings called `special tokens`
+    # These new embeddings will be fine-tuned on the RocStories dataset
+    special_tokens = ["_start_", "_delimiter_", "_classify_"]
+    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
+    tokenizer.add_tokens(special_tokens)
+    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
+    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
+    model.resize_token_embeddings(len(tokenizer))
+    model.to(device)
+
+    # Load and encode the datasets
+    def tokenize_and_encode(obj):
+        """Tokenize and encode a nested object"""
+        if isinstance(obj, str):
+            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
+        elif isinstance(obj, int):
+            return obj
+        return [tokenize_and_encode(o) for o in obj]
+
+    logger.info("Encoding dataset...")
+    train_dataset = load_rocstories_dataset(args.train_dataset)
+    eval_dataset = load_rocstories_dataset(args.eval_dataset)
+    datasets = (train_dataset, eval_dataset)
+    encoded_datasets = tokenize_and_encode(datasets)
+
+    # Compute the max input length for the Transformer
+    # Story and continuation are each capped so that, with the 3 special tokens, they fit in n_positions.
+    max_length = model.config.n_positions // 2 - 2
+    input_length = max(
+        len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
+        for dataset in encoded_datasets
+        for story, cont1, cont2, _ in dataset
+    )
+    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model
+
+    # Prepare inputs tensors and dataloaders
+    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
+    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
+
+    train_data = TensorDataset(*train_tensor_dataset)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    eval_data = TensorDataset(*eval_tensor_dataset)
+    eval_sampler = SequentialSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Prepare optimizer
+    if args.do_train:
+        # t_total counts optimizer updates, i.e. batches divided by the accumulation factor.
+        if args.max_steps > 0:
+            t_total = args.max_steps
+            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+        else:
+            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+        param_optimizer = list(model.named_parameters())
+        # Parameters whose name contains one of these substrings get no weight decay.
+        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
+        optimizer_grouped_parameters = [
+            {
+                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
+                "weight_decay": args.weight_decay,
+            },
+            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+        ]
+        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+        scheduler = get_linear_schedule_with_warmup(
+            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+        )
+
+    if args.do_train:
+        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
+        model.train()
+        optimizer.zero_grad()
+        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
+            # Reset per epoch, so the reported train loss covers the last epoch only.
+            tr_loss = 0
+            nb_tr_steps = 0
+            tqdm_bar = tqdm(train_dataloader, desc="Training")
+            for step, batch in enumerate(tqdm_bar):
+                batch = tuple(t.to(device) for t in batch)
+                input_ids, mc_token_ids, lm_labels, mc_labels = batch
+                # NOTE(review): the `lm_labels` keyword and tuple-style output indexing follow an older model
+                # API; confirm they match the installed transformers version.
+                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
+                loss = args.lm_coef * losses[0] + losses[1]
+                loss_value = loss.item()  # unscaled loss, used for reporting
+                # Scale so that gradients summed over the accumulation window average the batch losses.
+                (loss / args.gradient_accumulation_steps).backward()
+                # Update only once per accumulation window, matching how t_total was computed above.
+                if (step + 1) % args.gradient_accumulation_steps == 0:
+                    optimizer.step()
+                    scheduler.step()
+                    optimizer.zero_grad()
+                tr_loss += loss_value
+                exp_average_loss = loss_value if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss_value
+                nb_tr_steps += 1
+                # `get_last_lr()` is the supported accessor for the most recently applied learning rate.
+                tqdm_bar.desc = f"Training loss: {exp_average_loss:.2e} lr: {scheduler.get_last_lr()[0]:.2e}"
+
+    # Save a trained model
+    if args.do_train:
+        # Save a trained model, configuration and tokenizer
+        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
+        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+        model_to_save.config.to_json_file(output_config_file)
+        tokenizer.save_vocabulary(args.output_dir)
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
+        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
+        model.to(device)
+
+    if args.do_eval:
+        model.eval()
+        eval_loss, eval_accuracy = 0, 0
+        nb_eval_steps, nb_eval_examples = 0, 0
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            batch = tuple(t.to(device) for t in batch)
+            input_ids, mc_token_ids, lm_labels, mc_labels = batch
+            with torch.no_grad():
+                _, mc_loss, _, mc_logits = model(
+                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
+                )
+
+            mc_logits = mc_logits.detach().cpu().numpy()
+            mc_labels = mc_labels.to("cpu").numpy()
+            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
+
+            eval_loss += mc_loss.mean().item()
+            eval_accuracy += tmp_eval_accuracy
+
+            nb_eval_examples += input_ids.size(0)
+            nb_eval_steps += 1
+
+        # Loss is averaged per batch, accuracy per example.
+        eval_loss = eval_loss / nb_eval_steps
+        eval_accuracy = eval_accuracy / nb_eval_examples
+        train_loss = tr_loss / nb_tr_steps if args.do_train else None
+        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
+
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("{} = {}\n".format(key, str(result[key])))
+
+
+# Run the script when executed directly (not when imported).
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/run_swag.py
+++ b/examples/legacy/run_swag.py
@ -0,0 +1,706 @@
+#!/usr/bin/env python
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BERT finetuning runner.
+Finetuning the library models for multiple choice on SWAG (Bert).
+"""
+
+import argparse
+import csv
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+
+import transformers
+from transformers import (
+    WEIGHTS_NAME,
+    AutoConfig,
+    AutoModelForMultipleChoice,
+    AutoTokenizer,
+    get_linear_schedule_with_warmup,
+)
+from transformers.trainer_utils import is_main_process
+
+
+# Use torch's bundled TensorBoard writer when it can be imported, otherwise fall back to tensorboardX.
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:
+    from tensorboardX import SummaryWriter
+
+
+# Module-level logger used by the rest of this script.
+logger = logging.getLogger(__name__)
+
+
+class SwagExample:
+    """One SWAG example: an id, a context sentence, a shared ending start, four endings and an optional label."""
+
+    def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
+        self.swag_id = swag_id
+        self.context_sentence = context_sentence
+        self.start_ending = start_ending
+        # The four candidate endings are kept together, in order.
+        self.endings = [ending_0, ending_1, ending_2, ending_3]
+        self.label = label
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        # Comma-separated "name: value" pairs; the label is appended only when it is set.
+        description = ", ".join(
+            (
+                f"swag_id: {self.swag_id}",
+                f"context_sentence: {self.context_sentence}",
+                f"start_ending: {self.start_ending}",
+                f"ending_0: {self.endings[0]}",
+                f"ending_1: {self.endings[1]}",
+                f"ending_2: {self.endings[2]}",
+                f"ending_3: {self.endings[3]}",
+            )
+        )
+        if self.label is None:
+            return description
+        return ", ".join((description, f"label: {self.label}"))
+
+
+class InputFeatures:
+    """Features of one example: its id, one feature dict per choice, and its label."""
+
+    def __init__(self, example_id, choices_features, label):
+        # Each incoming entry is a 4-tuple whose first element is dropped; the other three are stored
+        # under the keys "input_ids", "input_mask" and "segment_ids".
+        per_choice = []
+        for _, input_ids, input_mask, segment_ids in choices_features:
+            per_choice.append({"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids})
+
+        self.example_id = example_id
+        self.choices_features = per_choice
+        self.label = label
+
+
+def read_swag_examples(input_file, is_training=True):
+    """Read a SWAG CSV file into a list of `SwagExample`.
+
+    The first row is treated as the header and skipped. For every other row, column 2 is the id, column 4
+    the context sentence, column 5 the common start of the endings, columns 7-10 the four endings and, when
+    `is_training` is True, column 11 the integer label.
+
+    Raises:
+        ValueError: If the file contains no rows, or if `is_training` is True and the last header column is
+            not "label".
+    """
+    # `newline=""` is what the csv module asks for, so newlines embedded in quoted fields are parsed correctly.
+    with open(input_file, encoding="utf-8", newline="") as f:
+        lines = list(csv.reader(f))
+
+    # Report an empty file explicitly instead of failing with an IndexError on the header lookup.
+    if not lines:
+        raise ValueError(f"The input file {input_file} is empty.")
+
+    header = lines[0]
+    if is_training and (not header or header[-1] != "label"):
+        raise ValueError("For training, the input file must contain a label column.")
+
+    examples = [
+        SwagExample(
+            swag_id=line[2],
+            context_sentence=line[4],
+            start_ending=line[5],  # in the swag dataset, the
+            # common beginning of each
+            # choice is stored in "sent2".
+            ending_0=line[7],
+            ending_1=line[8],
+            ending_2=line[9],
+            ending_3=line[10],
+            label=int(line[11]) if is_training else None,
+        )
+        for line in lines[1:]  # we skip the line with the column names
+    ]
+
+    return examples
+
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
+    """Convert SWAG examples into a list of `InputFeatures`, one per example.
+
+    Swag is a multiple choice task. Following the formatting proposed in "Improving Language Understanding
+    by Generative Pre-Training" and suggested in https://github.com/google-research/bert/issues/38, every
+    example becomes four inputs, one per choice:
+
+        [CLS] context [SEP] start_ending + choice_k [SEP]
+
+    Each input is truncated and zero-padded to exactly `max_seq_length`. The first five examples are logged.
+    The model outputs a single value per input; the final decision is a softmax over the four outputs.
+    """
+    features = []
+    for example_index, example in tqdm(enumerate(examples)):
+        context_tokens = tokenizer.tokenize(example.context_sentence)
+        start_ending_tokens = tokenizer.tokenize(example.start_ending)
+
+        choices_features = []
+        for ending in example.endings:
+            # Work on a copy of the context so each choice can shrink it independently.
+            choice_context = context_tokens[:]
+            ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
+            # Truncates both lists in place; "- 3" leaves room for [CLS], [SEP], [SEP].
+            _truncate_seq_pair(choice_context, ending_tokens, max_seq_length - 3)
+
+            tokens = ["[CLS]"] + choice_context + ["[SEP]"] + ending_tokens + ["[SEP]"]
+            # Segment 0 covers [CLS] + context + [SEP]; segment 1 covers the ending + final [SEP].
+            segment_ids = [0] * (len(choice_context) + 2) + [1] * (len(ending_tokens) + 1)
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+            input_mask = [1] * len(input_ids)
+
+            # Zero-pad all three sequences up to the sequence length.
+            zero_pad = [0] * (max_seq_length - len(input_ids))
+            for sequence in (input_ids, input_mask, segment_ids):
+                sequence += zero_pad
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            choices_features.append((tokens, input_ids, input_mask, segment_ids))
+
+        label = example.label
+        should_log = example_index < 5
+        if should_log:
+            logger.info("*** Example ***")
+            logger.info(f"swag_id: {example.swag_id}")
+            for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
+                logger.info(f"choice: {choice_idx}")
+                logger.info("tokens: {}".format(" ".join(tokens)))
+                logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
+                logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
+                logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
+        if should_log and is_training:
+            logger.info(f"label: {label}")
+
+        features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
+
+    return features
+
+
+def _truncate_seq_pair(tokens_a, tokens_b, max_length):
+    """Truncate a pair of token lists in place until their combined length is at most `max_length`.
+
+    One token at a time is removed from the end of the longer list (from `tokens_b` on a tie). Trimming the
+    longer sequence makes more sense than cutting an equal share from both: in a very short sequence each
+    token is likely to carry more information than in a long one.
+    """
+    while len(tokens_a) + len(tokens_b) > max_length:
+        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
+        longer.pop()
+
+
+def accuracy(out, labels):
+    """Count the rows of ``out`` whose arg-max over axis 1 equals the matching entry of ``labels``.
+
+    The result is a number of correct predictions, not a ratio.
+    """
+    is_correct = np.argmax(out, axis=1) == labels
+    return np.sum(is_correct)
+
+
+def select_field(features, field):
+    """Collect `field` from every choice of every feature, as one inner list per feature."""
+    selected = []
+    for feature in features:
+        per_choice = []
+        for choice in feature.choices_features:
+            per_choice.append(choice[field])
+        selected.append(per_choice)
+    return selected
+
+
+def set_seed(args):
+    """Seed Python's `random`, NumPy and torch with `args.seed`; also seed all CUDA devices when `args.n_gpu > 0`."""
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    """Load SWAG features, from the cache file when possible, and wrap them in a `TensorDataset`.
+
+    Args:
+        args: Parsed arguments; reads `local_rank`, `predict_file`, `train_file`, `model_name_or_path`,
+            `max_seq_length` and `overwrite_cache`.
+        tokenizer: Tokenizer used to build the features when they are not read from the cache.
+        evaluate: When True, use `args.predict_file` (cache prefix "dev"); otherwise `args.train_file`.
+        output_examples: When True, bypass the cache and also return the raw examples and features.
+
+    Returns:
+        The dataset, or `(dataset, examples, features)` when `output_examples` is True.
+    """
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    # The cache lives next to the input file and is keyed by split, last path component of the model name
+    # and max sequence length.
+    cached_features_file = os.path.join(
+        os.path.dirname(input_file),
+        "cached_{}_{}_{}".format(
+            "dev" if evaluate else "train",
+            list(filter(None, args.model_name_or_path.split("/"))).pop(),
+            str(args.max_seq_length),
+        ),
+    )
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        # The cache holds pickled `InputFeatures` objects, which `weights_only=True` rejects unless the class
+        # is allowlisted. The hook is looked up dynamically because older torch releases do not provide it.
+        add_safe_globals = getattr(torch.serialization, "add_safe_globals", None)
+        if add_safe_globals is not None:
+            add_safe_globals([InputFeatures])
+        features = torch.load(cached_features_file, weights_only=True)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        # NOTE(review): examples are always read with the default is_training=True, so the evaluation file
+        # must also carry a label column.
+        examples = read_swag_examples(input_file)
+        features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
+
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
+    all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
+    all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
+    all_label = torch.tensor([f.label for f in features], dtype=torch.long)
+
+    # Training and evaluation use the same tensor layout.
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+
+
+def train(args, train_dataset, model, tokenizer):
+    """Fine-tune `model` on `train_dataset` and return `(global_step, average_loss)`.
+
+    Args:
+        args: Parsed command-line namespace. Optimisation, logging, checkpointing and distributed
+            settings are read from it; `train_batch_size` (and `num_train_epochs` when
+            `max_steps > 0`) are written back to it.
+        train_dataset: Dataset whose batches are `(input_ids, attention_mask, token_type_ids, labels)`.
+        model: Model to train; wrapped in `DataParallel` / `DistributedDataParallel` when requested.
+        tokenizer: Tokenizer whose vocabulary is saved next to every checkpoint.
+
+    Returns:
+        The number of optimizer updates performed and the accumulated loss divided by that number
+        (the divisor is clamped to 1 so that a run without any update does not divide by zero).
+
+    Raises:
+        ImportError: If `args.fp16` is set but NVIDIA apex is not installed.
+    """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        # Clamp to 1: a dataloader shorter than the accumulation window would otherwise divide by zero.
+        updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
+        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+            "weight_decay": args.weight_decay,
+        },
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
+    ]
+    optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
+    )
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(
+            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
+        )
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {
+                "input_ids": batch[0],
+                "attention_mask": batch[1],
+                # 'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
+                "token_type_ids": batch[2],
+                "labels": batch[3],
+            }
+            # if args.model_type in ['xlnet', 'xlm']:
+            #     inputs.update({'cls_index': batch[5],
+            #                    'p_mask':       batch[6]})
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if (
+                        args.local_rank == -1 and args.evaluate_during_training
+                    ):  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar(f"eval_{key}", value, global_step)
+                    # get_last_lr() is the supported way to read the rate computed by the last scheduler.step()
+                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
+                    model_to_save = (
+                        model.module if hasattr(model, "module") else model
+                    )  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    tokenizer.save_vocabulary(output_dir)
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            # Stop once exactly `max_steps` updates have run; the scheduler was built for that many steps.
+            if args.max_steps > 0 and global_step >= args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step >= args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    # Guard against a run in which no optimizer update happened (global_step == 0).
+    return global_step, tr_loss / max(global_step, 1)
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """Run `model` over the prediction file and report the mean loss and accuracy.
+
+    The evaluation features are rebuilt through `load_and_cache_examples`. The metrics are logged,
+    written to `eval_results.txt` inside `args.output_dir`, and returned as a dict with the keys
+    `eval_loss` and `eval_accuracy`. `prefix` only appears in the log header.
+    """
+    dataset, _examples, _features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if args.local_rank in [-1, 0] and not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    if args.local_rank == -1:
+        sampler = SequentialSampler(dataset)
+    else:
+        sampler = DistributedSampler(dataset)
+    loader = DataLoader(dataset, sampler=sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info(f"***** Running evaluation {prefix} *****")
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+
+    loss_sum, correct = 0, 0
+    num_batches, num_examples = 0, 0
+
+    for batch in tqdm(loader, desc="Evaluating"):
+        model.eval()
+        on_device = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {
+                "input_ids": on_device[0],
+                "attention_mask": on_device[1],
+                "token_type_ids": on_device[2],
+                "labels": on_device[3],
+            }
+            outputs = model(**inputs)
+            batch_loss, logits = outputs[:2]
+            loss_sum += batch_loss.mean().item()
+
+        predictions = logits.detach().cpu().numpy()
+        gold = inputs["labels"].to("cpu").numpy()
+        correct += accuracy(predictions, gold)
+
+        num_batches += 1
+        num_examples += inputs["input_ids"].size(0)
+
+    # Loss is averaged per batch, accuracy per example.
+    result = {"eval_loss": loss_sum / num_batches, "eval_accuracy": correct / num_examples}
+
+    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results *****")
+        for name in sorted(result):
+            value_text = str(result[name])
+            logger.info("%s = %s", name, value_text)
+            writer.write("{} = {}\n".format(name, value_text))
+
+    return result
+
+
+def main():
+    """Command-line entry point: fine-tune and/or evaluate a multiple-choice model on SWAG csv files.
+
+    Parses the arguments, optionally attaches a ptvsd debugger, sets up the device / distributed
+    backend, logging and seeds, loads the config, tokenizer and model, trains when `--do_train`
+    is given, saves the model and tokenizer to `--output_dir` on the main process, and evaluates
+    when `--do_eval` is given.
+
+    Returns:
+        A dict of evaluation metrics (empty when no evaluation was run). When several checkpoints
+        are evaluated, every key is suffixed with `_<step>`.
+    """
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
+    )
+    parser.add_argument(
+        "--predict_file",
+        default=None,
+        type=str,
+        required=True,
+        help="SWAG csv for predictions. E.g., val.csv or test.csv",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default=None,
+        type=str,
+        required=True,
+        help="Path to pretrained model or model identifier from huggingface.co/models",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model checkpoints and predictions will be written.",
+    )
+
+    # Other parameters
+    parser.add_argument(
+        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
+    )
+    parser.add_argument(
+        "--tokenizer_name",
+        default="",
+        type=str,
+        help="Pretrained tokenizer name or path if not the same as model_name",
+    )
+    parser.add_argument(
+        "--max_seq_length",
+        default=384,
+        type=int,
+        help=(
+            "The maximum total input sequence length after tokenization. Sequences "
+            "longer than this will be truncated, and sequences shorter than this will be padded."
+        ),
+    )
+    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
+    parser.add_argument(
+        "--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
+    )
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
+    parser.add_argument(
+        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
+    )
+    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
+    parser.add_argument(
+        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
+    parser.add_argument(
+        "--eval_all_checkpoints",
+        action="store_true",
+        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
+    )
+    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
+    parser.add_argument(
+        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
+    )
+    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument(
+        "--fp16",
+        action="store_true",
+        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
+    )
+    parser.add_argument(
+        "--fp16_opt_level",
+        type=str,
+        default="O1",
+        help=(
+            "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
+            "See details at https://nvidia.github.io/apex/amp.html"
+        ),
+    )
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        # --server_port is parsed as a string; the socket address needs an integer port.
+        ptvsd.enable_attach(address=(args.server_ip, int(args.server_port)), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        args.local_rank,
+        device,
+        args.n_gpu,
+        bool(args.local_rank != -1),
+        args.fp16,
+    )
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(args.local_rank):
+        transformers.utils.logging.set_verbosity_info()
+        transformers.utils.logging.enable_default_handler()
+        transformers.utils.logging.enable_explicit_format()
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+    )
+    model = AutoModelForMultipleChoice.from_pretrained(
+        args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
+    )
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Save the trained model and the tokenizer
+    # NOTE(review): this block also runs when --do_train is not set, so the unmodified pretrained
+    # model is written to output_dir in eval-only runs - confirm this is intended.
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = (
+            model.module if hasattr(model, "module") else model
+        )  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
+        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
+        model.to(args.device)
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        if args.do_train:
+            checkpoints = [args.output_dir]
+        else:
+            # if do_train is False and do_eval is true, load model directly from pretrained.
+            checkpoints = [args.model_name_or_path]
+
+        if args.eval_all_checkpoints:
+            checkpoints = [
+                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
+            ]
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            # The step suffix is only used when more than one checkpoint is evaluated.
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
+            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
+            results.update(result)
+
+    logger.info(f"Results: {results}")
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@ -0,0 +1,143 @@
+#!/usr/bin/env python
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Transformer XL model evaluation script.
+Adapted from https://github.com/kimiyoung/transformer-xl.
+In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
+
+This script with default values evaluates a pretrained Transformer-XL on WikiText 103
+"""
+
+import argparse
+import logging
+import math
+import time
+
+import torch
+
+from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
+
+
+# Configure root logging at INFO level with a timestamped format, then create this module's logger.
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+)
+logger = logging.getLogger(__name__)
+
+
+def main():
+    """Command-line entry point: evaluate a pretrained Transformer-XL language model.
+
+    Parses the arguments, optionally attaches a ptvsd debugger, loads the pre-processed corpus and
+    the model named by `--model_name`, evaluates the split(s) selected with `--split`, and logs
+    the loss and perplexity of every evaluated split.
+
+    Raises:
+        ValueError: If an evaluation iterator yields no tokens, so no average loss can be computed.
+    """
+    parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
+    parser.add_argument("--model_name", type=str, default="transfo-xl/transfo-xl-wt103", help="pretrained model name")
+    parser.add_argument(
+        "--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
+    )
+    parser.add_argument("--batch_size", type=int, default=10, help="batch size")
+    parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
+    parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
+    parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
+    parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
+    parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUDA is available")
+    parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
+    parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
+    parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
+    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
+    args = parser.parse_args()
+    assert args.ext_len >= 0, "extended context length must be non-negative"
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+
+        print("Waiting for debugger attach")
+        # --server_port is parsed as a string; the socket address needs an integer port.
+        ptvsd.enable_attach(address=(args.server_ip, int(args.server_port)), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    logger.info(f"device: {device}")
+
+    # Load a pre-processed dataset
+    # You can also build the corpus yourself using TransfoXLCorpus methods
+    # The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
+    # and tokenizing the dataset
+    # The pre-processed corpus is a conversion (using the conversion script)
+    corpus = TransfoXLCorpus.from_pretrained(args.model_name)
+
+    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+    te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
+
+    # Load a pre-trained model
+    model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
+    model.to(device)
+
+    logger.info(
+        "Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
+            args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
+        )
+    )
+
+    model.reset_memory_length(args.mem_len)
+    if args.clamp_len > 0:
+        model.clamp_len = args.clamp_len
+    if args.same_length:
+        model.same_length = True
+
+    ###############################################################################
+    # Evaluation code
+    ###############################################################################
+    def evaluate(eval_iter):
+        """Return the token-weighted average loss of the model over `eval_iter`."""
+        # Turn on evaluation mode which disables dropout.
+        model.eval()
+        total_len, total_loss = 0, 0.0
+        num_segments = 0
+        start_time = time.time()
+        with torch.no_grad():
+            mems = None
+            for data, target, seq_len in eval_iter:
+                ret = model(data, lm_labels=target, mems=mems)
+                # The returned memories are fed back into the next segment.
+                loss, _, mems = ret
+                loss = loss.mean()
+                total_loss += seq_len * loss.item()
+                total_len += seq_len
+                num_segments += 1
+            total_time = time.time() - start_time
+        # An empty iterator used to fail with an unbound loop variable / division by zero.
+        if num_segments == 0 or total_len == 0:
+            raise ValueError("The evaluation iterator yielded no tokens; cannot compute an average loss.")
+        logger.info(f"Time : {total_time:.2f}s, {1000 * total_time / num_segments:.2f}ms/segment")
+        return total_loss / total_len
+
+    # Run on test data.
+    if args.split == "all":
+        test_loss = evaluate(te_iter)
+        valid_loss = evaluate(va_iter)
+    elif args.split == "valid":
+        valid_loss = evaluate(va_iter)
+        test_loss = None
+    elif args.split == "test":
+        test_loss = evaluate(te_iter)
+        valid_loss = None
+
+    def format_log(loss, split):
+        """Format the loss and its perplexity (`exp(loss)`) for one split."""
+        log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
+        return log_str
+
+    log_str = ""
+    if valid_loss is not None:
+        log_str += format_log(valid_loss, "valid")
+    if test_loss is not None:
+        log_str += format_log(test_loss, "test")
+
+    logger.info("=" * 100)
+    logger.info(log_str)
+    logger.info("=" * 100)
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@ -0,0 +1,327 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Sequence-to-Sequence Training and Evaluation
+
+This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
+For deprecated `bertabs` instructions, see https://github.com/huggingface/transformers-research-projects/blob/main/bertabs/README.md.
+
+### Supported Architectures
+
+- `BartForConditionalGeneration`
+- `MarianMTModel`
+- `PegasusForConditionalGeneration`
+- `MBartForConditionalGeneration`
+- `FSMTForConditionalGeneration`
+- `T5ForConditionalGeneration`
+
+### Download the Datasets
+
+#### XSUM
+
+```bash
+cd examples/legacy/seq2seq
+wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
+tar -xzvf xsum.tar.gz
+export XSUM_DIR=${PWD}/xsum
+```
+this should make a directory called `xsum/` with files like `test.source`.
+To use your own data, copy that file format. Each article to be summarized is on its own line.
+
+#### CNN/DailyMail
+
+```bash
+cd examples/legacy/seq2seq
+wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
+tar -xzvf cnn_dm_v2.tgz  # empty lines removed
+mv cnn_cln cnn_dm
+export CNN_DIR=${PWD}/cnn_dm
+```
+this should make a directory called `cnn_dm/` with 6 files.
+
+#### WMT16 English-Romanian Translation Data
+
+download with this command:
+```bash
+wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
+tar -xzvf wmt_en_ro.tar.gz
+export ENRO_DIR=${PWD}/wmt_en_ro
+```
+this should make a directory called `wmt_en_ro/` with 6 files.
+
+#### WMT English-German
+
+```bash
+wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
+tar -xzvf wmt_en_de.tgz
+export DATA_DIR=${PWD}/wmt_en_de
+```
+
+#### FSMT datasets (wmt)
+
+Refer to the scripts starting with `eval_` under:
+https://github.com/huggingface/transformers/tree/main/scripts/fsmt
+
+#### Pegasus (multiple datasets)
+
+Multiple eval datasets are available for download from:
+https://github.com/stas00/porting/tree/master/datasets/pegasus
+
+
+#### Your Data
+
+If you are using your own data, it must be formatted as one directory with 6 files:
+```
+train.source
+train.target
+val.source
+val.target
+test.source
+test.target
+```
+The `.source` files are the input, the `.target` files are the desired output.
+
+### Tips and Tricks
+
+General Tips:
+- since you need to run from `examples/legacy/seq2seq`, and likely need to modify code, the easiest workflow is fork transformers, clone your fork, and run `pip install -e .` before you get started.
+- try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size.  (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
+
+- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
+Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr')`.
+- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
+- This warning can be safely ignored:
+    > "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
+- Both finetuning and eval are 30% faster with `--fp16`.
+- Read scripts before you run them!
+
+Summarization Tips:
+- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
+- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
+- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher rouge scores. To get accurate rouge scores, you should rerun calculate_rouge on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`
+- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
+- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
+- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
+(It rarely makes sense to start from `bart-large` unless you are researching finetuning methods).
+
+**Update 2018-07-18**
+Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
+Future work/help wanted: A new dataset to support multilingual tasks.
+
+
+### Fine-tuning using Seq2SeqTrainer
+To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Except the `Trainer`-related `TrainingArguments`, it shares the same argument names as that of `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
+
+To see all the possible command line options, run:
+
+```bash
+python finetune_trainer.py --help
+```
+
+For multi-gpu training use `torchrun` (the replacement for `torch.distributed.launch`), e.g. with 2 gpus:
+```bash
+torchrun --nproc_per_node=2  finetune_trainer.py ...
+```
+
+**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
+
+All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
+
+#### TPU Training
+`Seq2SeqTrainer` supports TPU training with a few caveats:
+1. As `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate loss, and do not set `--do_predict` and `--predict_with_generate`.
+2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
+
+We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
+
+`builtin_trainer/finetune_tpu.sh` script provides minimal arguments needed for TPU training.
+
+The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on TPU V3-8 and should complete one epoch in ~5-6 mins.
+
+```bash
+./builtin_trainer/train_distil_marian_enro_tpu.sh
+```
+
+## Evaluation Commands
+
+To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.
+If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
+
+For t5, you need to specify --task translation_{src}_to_{tgt} as follows:
+```bash
+export DATA_DIR=wmt_en_ro
+./run_eval.py google-t5/t5-base \
+    $DATA_DIR/val.source t5_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path enro_bleu.json \
+    --task translation_en_to_ro \
+    --n_obs 100 \
+    --device cuda \
+    --fp16 \
+    --bs 32
+```
+
+This command works for MBART, although the BLEU score is suspiciously low.
+```bash
+export DATA_DIR=wmt_en_ro
+./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path enro_bleu.json \
+    --task translation \
+    --n_obs 100 \
+    --device cuda \
+    --fp16 \
+    --bs 32
+```
+
+Summarization (xsum will be very similar):
+```bash
+export DATA_DIR=cnn_dm
+./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
+    --reference_path $DATA_DIR/val.target \
+    --score_path cnn_rouge.json \
+    --task summarization \
+    --n_obs 100 \
+    --device cuda \
+    --max_length 56 \
+    --fp16 \
+    --bs 32
+```
+
+### Multi-GPU Evaluation
+Here is a command to run xsum evaluation on 8 GPUs. It is more than linearly faster than `run_eval.py` in some cases
+because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
+`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
+
+```bash
+torchrun --nproc_per_node=8  run_distributed_eval.py \
+    --model_name sshleifer/distilbart-large-xsum-12-3  \
+    --save_dir xsum_generations \
+    --data_dir xsum \
+    --fp16  # you can pass generate kwargs like num_beams here, just like run_eval.py
+```
+
+Contributions that implement this command for other distributed hardware setups are welcome!
+
+#### Single-GPU Eval: Tips and Tricks
+
+When using `run_eval.py`, the following features can be useful:
+
+* if you are running the script multiple times and want to make it easier to track what arguments produced that output, use `--dump-args`. Along with the results it will also dump any custom params that were passed to the script. For example if you used: `--num_beams 8 --early_stopping true`, the output will be:
+   ```json
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
+   ```
+
+   `--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful to pass things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. But also if you pass `--info` without a value it will fallback to the current date/time string, e.g. `2020-09-13 18:44:43`.
+
+   If using `--dump-args --info`, the output will be:
+
+   ```json
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
+   ```
+
+   If using `--dump-args --info "pair=en-ru chkpt=best"`, the output will be:
+
+   ```json
+   {'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
+   ```
+
+
+* if you need to perform a parametric search in order to find the hyperparameters that lead to the highest BLEU score, let `run_eval_search.py` do the searching for you.
+
+   The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to ``run_eval.py`` as additional args.
+
+   The format for the `--search` value is a simple string with hparams and colon separated values to try, e.g.:
+   ```
+    --search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
+   ```
+   which will generate `12` `(2*3*2)` searches, one for each combination of the hparam values. For example, the search string above will invoke `run_eval.py` repeatedly with:
+
+   ```
+    --num_beams 5 --length_penalty 0.8 --early_stopping true
+    --num_beams 5 --length_penalty 0.8 --early_stopping false
+    [...]
+    --num_beams 10 --length_penalty 1.2 --early_stopping false
+   ```
+
+   On completion, this function prints a markdown table of the results sorted by the best BLEU score and the winning arguments.
+
+```
+bleu  | num_beams | length_penalty | early_stopping
+----- | --------- | -------------- | --------------
+26.71 |         5 |            1.1 |              1
+26.66 |         5 |            0.9 |              1
+26.66 |         5 |            0.9 |              0
+26.41 |         5 |            1.1 |              0
+21.94 |         1 |            0.9 |              1
+21.94 |         1 |            0.9 |              0
+21.94 |         1 |            1.1 |              1
+21.94 |         1 |            1.1 |              0
+
+Best score args:
+stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
+```
+
+If you pass `--info "some experiment-specific info"` it will get printed before the results table - this is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
+
+
+### Contributing
+- follow the standard contributing guidelines and code of conduct.
+- add tests to `test_seq2seq_examples.py`
+- To run only the seq2seq tests, you must be in the root of the repository and run:
+```bash
+pytest examples/legacy/seq2seq/
+```
+
+### Converting pytorch-lightning checkpoints
+pytorch-lightning `--do_predict` often fails. After you are done training, the best way to evaluate your model is to convert it.
+
+This should be done for you, with a file called `{save_dir}/best_tfmr`.
+
+If that file doesn't exist but you have a lightning `.ckpt` file, you can run
+```bash
+python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT  randomly_initialized_hf_model_path save_dir/best_tfmr
+```
+Then either `run_eval` or `run_distributed_eval` with `save_dir/best_tfmr` (see previous sections)
+
+
+# Experimental Features
+These features are harder to use and not always useful.
+
+###  Dynamic Batch Size for MT
+`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
+This feature can only be used:
+- with fairseq installed
+- on 1 GPU
+- without sortish sampler
+- after calling `./save_len_file.py $tok $data_dir`
+
+For example,
+```bash
+./save_len_file.py Helsinki-NLP/opus-mt-en-ro  wmt_en_ro
+./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
+```
+splits `wmt_en_ro/train` into 11,197 uneven length batches and can finish 1 epoch in 8 minutes on a v100.
+
+For comparison,
+```bash
+./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
+```
+uses 12,723 batches of length 48 and takes slightly more time: 9.5 minutes.
+
+The feature is still experimental, because:
+- We can make it much more robust if we have memory mapped/preprocessed datasets.
+- The speedup over sortish sampler is not that large at the moment.
--- a/examples/legacy/seq2seq/init.py
+++ b/examples/legacy/seq2seq/init.py
@ -0,0 +1,5 @@
+import os
+import sys
+
+
+sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
--- a/examples/legacy/seq2seq/convert_model_to_fp16.py
+++ b/examples/legacy/seq2seq/convert_model_to_fp16.py
@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Union
+
+import fire
+import torch
+from tqdm import tqdm
+
+
+def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None:
+    """Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space."""
+    state_dict = torch.load(src_path, map_location=map_location, weights_only=True)
+    for k, v in tqdm(state_dict.items()):
+        if not isinstance(v, torch.Tensor):
+            raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
+        state_dict[k] = v.half()
+    if save_path is None:  # overwrite src_path
+        save_path = src_path
+    torch.save(state_dict, save_path)
+
+
+if __name__ == "__main__":
+    fire.Fire(convert)
--- a/examples/legacy/seq2seq/download_wmt.py
+++ b/examples/legacy/seq2seq/download_wmt.py
@ -0,0 +1,67 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import fire
+from tqdm import tqdm
+
+
+def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir=None) -> None:
+    """Download a dataset using the datasets package and save it to the format expected by finetune.py
+    Format of save_dir: train.source, train.target, val.source, val.target, test.source, test.target.
+
+    Args:
+        src_lang: <str> source language
+        tgt_lang: <str> target language
+        dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import datasets; print([d.id for d in datasets.list_datasets() if "wmt" in d.id])`
+        save_dir: <str>, where to save the datasets, defaults to f'{dataset}-{src_lang}-{tgt_lang}'
+
+    Usage:
+        >>> download_wmt_dataset('ro', 'en', dataset='wmt16') # saves to wmt16-ro-en
+    """
+    try:
+        import datasets
+    except (ModuleNotFoundError, ImportError):
+        raise ImportError("run pip install datasets")
+    pair = f"{src_lang}-{tgt_lang}"
+    print(f"Converting {dataset}-{pair}")
+    ds = datasets.load_dataset(dataset, pair)
+    if save_dir is None:
+        save_dir = f"{dataset}-{pair}"
+    save_dir = Path(save_dir)
+    save_dir.mkdir(exist_ok=True)
+
+    for split in ds:
+        print(f"Splitting {split} with {ds[split].num_rows} records")
+
+        # to save to val.source, val.target like summary datasets
+        fn = "val" if split == "validation" else split
+        src_path = save_dir.joinpath(f"{fn}.source")
+        tgt_path = save_dir.joinpath(f"{fn}.target")
+        src_fp = src_path.open("w+")
+        tgt_fp = tgt_path.open("w+")
+
+        # reader is the bottleneck so writing one record at a time doesn't slow things down
+        for x in tqdm(ds[split]):
+            ex = x["translation"]
+            src_fp.write(ex[src_lang] + "\n")
+            tgt_fp.write(ex[tgt_lang] + "\n")
+
+    print(f"Saved {dataset} dataset to {save_dir}")
+
+
+if __name__ == "__main__":
+    fire.Fire(download_wmt_dataset)
--- a/src/transformers/pipelines/deprecated/init.py
+++ b/src/transformers/pipelines/deprecated/init.py
@ -1,5 +1,4 @@
-# coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2020 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -13,4 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
+# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
+# run ./finetune.sh --help to see all the possible options
+python finetune_trainer.py \
+    --learning_rate=3e-5 \
+    --fp16 \
+    --do_train --do_eval --do_predict \
+    --eval_strategy steps \
+    --predict_with_generate \
+    --n_val 1000 \
+    "$@"
--- a/examples/legacy/seq2seq/finetune_tpu.sh
+++ b/examples/legacy/seq2seq/finetune_tpu.sh
@ -0,0 +1,26 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+export TPU_NUM_CORES=8
+
+# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
+# run ./finetune_tpu.sh --help to see all the possible options
+# To specify the number of cores to use, use the TPU_NUM_DEVICES environment variable
+python xla_spawn.py finetune_trainer.py \
+    --learning_rate=3e-5 \
+    --do_train --do_eval \
+    --eval_strategy steps \
+    --prediction_loss_only \
+    --n_val 1000 \
+    "$@"
--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
@ -0,0 +1,370 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from seq2seq_trainer import Seq2SeqTrainer
+from seq2seq_training_args import Seq2SeqTrainingArguments
+
+import transformers
+from transformers import (
+    AutoConfig,
+    AutoModelForSeq2SeqLM,
+    AutoTokenizer,
+    HfArgumentParser,
+    MBartTokenizer,
+    MBartTokenizerFast,
+    set_seed,
+)
+from transformers.trainer_utils import is_main_process
+from transformers.training_args import ParallelMode
+from utils import (
+    Seq2SeqDataCollator,
+    Seq2SeqDataset,
+    assert_all_frozen,
+    build_compute_metrics_fn,
+    freeze_embeds,
+    freeze_params,
+    lmap,
+    save_json,
+    use_task_specific_params,
+    write_txt_file,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None,
+        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
+    )
+    freeze_encoder: bool = field(default=False, metadata={"help": "Whether tp freeze the encoder."})
+    freeze_embeds: bool = field(default=False, metadata={"help": "Whether  to freeze the embeddings."})
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+
+    Only `data_dir` is required; every other field has a default.
+    """
+
+    # NOTE(review): the help text mentions .tsv files, while the README in this directory
+    # describes `{train,val,test}.source` / `.target` files; confirm which is current.
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    task: Optional[str] = field(
+        default="summarization",
+        metadata={"help": "Task name, summarization (or summarization_{dataset} for pegasus) or translation"},
+    )
+    # Length limits, one for the source and one per split for the target.
+    max_source_length: Optional[int] = field(
+        default=1024,
+        metadata={
+            "help": (
+                "The maximum total input sequence length after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    max_target_length: Optional[int] = field(
+        default=128,
+        metadata={
+            "help": (
+                "The maximum total sequence length for target text after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    val_max_target_length: Optional[int] = field(
+        default=142,
+        metadata={
+            "help": (
+                "The maximum total sequence length for validation target text after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded. "
+                "This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
+                "during ``evaluate`` and ``predict``."
+            )
+        },
+    )
+    test_max_target_length: Optional[int] = field(
+        default=142,
+        metadata={
+            "help": (
+                "The maximum total sequence length for test target text after tokenization. Sequences longer "
+                "than this will be truncated, sequences shorter will be padded."
+            )
+        },
+    )
+    # Per-split example caps; per the help text, -1 means "use all".
+    n_train: Optional[int] = field(default=-1, metadata={"help": "# training examples. -1 means use all."})
+    n_val: Optional[int] = field(default=-1, metadata={"help": "# validation examples. -1 means use all."})
+    n_test: Optional[int] = field(default=-1, metadata={"help": "# test examples. -1 means use all."})
+    # Language ids for translation; both default to None.
+    src_lang: Optional[str] = field(default=None, metadata={"help": "Source language id for translation."})
+    tgt_lang: Optional[str] = field(default=None, metadata={"help": "Target language id for translation."})
+    eval_beams: Optional[int] = field(default=None, metadata={"help": "# num_beams to use for evaluation."})
+    ignore_pad_token_for_loss: bool = field(
+        default=True,
+        metadata={"help": "If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined."},
+    )
+
+
+def handle_metrics(split, metrics, output_dir):
+    """
+    Log and save metrics
+
+    Args:
+    - split: one of train, val, test
+    - metrics: metrics dict
+    - output_dir: where to save the metrics
+    """
+
+    logger.info(f"***** {split} metrics *****")
+    for key in sorted(metrics.keys()):
+        logger.info(f"  {key} = {metrics[key]}")
+    save_json(metrics, os.path.join(output_dir, f"{split}_results.json"))
+
+
+def main():
+    """Parse the arguments, build the model and datasets, then train / evaluate / predict as requested.
+
+    Arguments come either from a single `.json` file path or from the command line, and are
+    split into `ModelArguments`, `DataTrainingArguments` and `Seq2SeqTrainingArguments`.
+    The train, eval and predict phases run only when `do_train`, `do_eval` and `do_predict`
+    are set, respectively.
+
+    Returns:
+        dict: the metrics of every phase that ran on the world-zero process, merged together.
+    """
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
+
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    # Setup logging
+    # INFO level only where local_process_index is -1 or 0; other processes log at WARN.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_process_index,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
+        training_args.fp16,
+    )
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    if is_main_process(training_args.local_process_index):
+        transformers.utils.logging.set_verbosity_info()
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Copy these settings from the training args onto the model config, but only when they
+    # are set to a truthy value; the config must already define the attribute.
+    extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
+    for p in extra_model_params:
+        if getattr(training_args, p, None):
+            assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
+            setattr(config, p, getattr(training_args, p))
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    model = AutoModelForSeq2SeqLM.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=".ckpt" in model_args.model_name_or_path,
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # use task specific params
+    use_task_specific_params(model, data_args.task)
+
+    # set num_beams for evaluation
+    if data_args.eval_beams is None:
+        data_args.eval_beams = model.config.num_beams
+
+    # set decoder_start_token_id for MBart
+    if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
+        assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
+            "mBart requires --tgt_lang and --src_lang"
+        )
+        # The slow tokenizer is looked up through `lang_code_to_id`, the fast one through
+        # `convert_tokens_to_ids`.
+        if isinstance(tokenizer, MBartTokenizer):
+            model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
+        else:
+            model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.tgt_lang)
+
+    if model_args.freeze_embeds:
+        freeze_embeds(model)
+    if model_args.freeze_encoder:
+        freeze_params(model.get_encoder())
+        assert_all_frozen(model.get_encoder())
+
+    dataset_class = Seq2SeqDataset
+
+    # Get datasets
+    # Each split is built only when its phase is enabled; otherwise the dataset is None.
+    train_dataset = (
+        dataset_class(
+            tokenizer,
+            type_path="train",
+            data_dir=data_args.data_dir,
+            n_obs=data_args.n_train,
+            max_target_length=data_args.max_target_length,
+            max_source_length=data_args.max_source_length,
+            prefix=model.config.prefix or "",
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        dataset_class(
+            tokenizer,
+            type_path="val",
+            data_dir=data_args.data_dir,
+            n_obs=data_args.n_val,
+            max_target_length=data_args.val_max_target_length,
+            max_source_length=data_args.max_source_length,
+            prefix=model.config.prefix or "",
+        )
+        if training_args.do_eval
+        else None
+    )
+    test_dataset = (
+        dataset_class(
+            tokenizer,
+            type_path="test",
+            data_dir=data_args.data_dir,
+            n_obs=data_args.n_test,
+            max_target_length=data_args.test_max_target_length,
+            max_source_length=data_args.max_source_length,
+            prefix=model.config.prefix or "",
+        )
+        if training_args.do_predict
+        else None
+    )
+
+    # Initialize our Trainer
+    # The metrics function is only built when predict_with_generate is set.
+    compute_metrics_fn = (
+        build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None
+    )
+    trainer = Seq2SeqTrainer(
+        model=model,
+        args=training_args,
+        data_args=data_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=Seq2SeqDataCollator(tokenizer, data_args, model.config.decoder_start_token_id),
+        compute_metrics=compute_metrics_fn,
+        processing_class=tokenizer,
+    )
+
+    # Collects the metrics of every phase; written out and returned at the end.
+    all_metrics = {}
+    # Training
+    if training_args.do_train:
+        logger.info("*** Train ***")
+
+        # A local directory given as model_name_or_path is passed on as `model_path`; otherwise None.
+        train_result = trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        metrics = train_result.metrics
+        metrics["train_n_objs"] = data_args.n_train
+
+        trainer.save_model()  # this also saves the tokenizer
+
+        if trainer.is_world_process_zero():
+            handle_metrics("train", metrics, training_args.output_dir)
+            all_metrics.update(metrics)
+
+            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
+            trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
+
+            # For convenience, we also re-save the tokenizer to the same directory,
+            # so that you can share your model easily on huggingface.co/models =)
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        metrics = trainer.evaluate(metric_key_prefix="val")
+        metrics["val_n_objs"] = data_args.n_val
+        metrics["val_loss"] = round(metrics["val_loss"], 4)
+
+        if trainer.is_world_process_zero():
+            handle_metrics("val", metrics, training_args.output_dir)
+            all_metrics.update(metrics)
+
+    if training_args.do_predict:
+        logger.info("*** Predict ***")
+
+        test_output = trainer.predict(test_dataset=test_dataset, metric_key_prefix="test")
+        metrics = test_output.metrics
+        metrics["test_n_objs"] = data_args.n_test
+
+        if trainer.is_world_process_zero():
+            metrics["test_loss"] = round(metrics["test_loss"], 4)
+            handle_metrics("test", metrics, training_args.output_dir)
+            all_metrics.update(metrics)
+
+            # Decoded generations are written one file per run, only when generation was enabled.
+            if training_args.predict_with_generate:
+                test_preds = tokenizer.batch_decode(
+                    test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
+                )
+                test_preds = lmap(str.strip, test_preds)
+                write_txt_file(test_preds, os.path.join(training_args.output_dir, "test_generations.txt"))
+
+    if trainer.is_world_process_zero():
+        save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))
+
+    return all_metrics
+
+
+def _mp_fn(index):
+    """Run `main()`; the `index` argument is accepted but not used."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/seq2seq/minify_dataset.py
+++ b/examples/legacy/seq2seq/minify_dataset.py
@ -0,0 +1,34 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+import fire
+
+
+def minify(src_dir: str, dest_dir: str, n: int):
+    """Write first n lines of each file f in src_dir to dest_dir/f
+
+    Args:
+        src_dir: Directory whose files are truncated.
+        dest_dir: Directory receiving the truncated copies. It is created when missing;
+            its parent directory must already exist.
+        n: Maximum number of lines kept from each file.
+
+    Trailing whitespace is stripped from every kept line and the lines are joined with a
+    single newline, so the written file has no trailing newline. The path of each written
+    file is printed.
+    """
+    src_dir = Path(src_dir)
+    dest_dir = Path(dest_dir)
+    dest_dir.mkdir(exist_ok=True)
+    for path in src_dir.iterdir():
+        if not path.is_file():
+            # Sub-directories cannot be opened as text files; skip them instead of crashing.
+            continue
+        # Context managers close both handles deterministically instead of relying on
+        # garbage collection.
+        with path.open() as src_file:
+            new = [x.rstrip() for x in src_file.readlines()[:n]]
+        dest_path = dest_dir.joinpath(path.name)
+        print(dest_path)
+        with dest_path.open("w") as dest_file:
+            dest_file.write("\n".join(new))
+
+
+# Expose minify as a command-line interface through python-fire when run as a script.
+if __name__ == "__main__":
+    fire.Fire(minify)
--- a/examples/legacy/seq2seq/old_test_calculate_rouge.py
+++ b/examples/legacy/seq2seq/old_test_calculate_rouge.py
@ -0,0 +1,109 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+from pathlib import Path
+
+import pandas as pd
+from rouge_cli import calculate_rouge_path
+
+from utils import calculate_rouge
+
+
+# Three fixture texts passed as the first argument of calculate_rouge in the tests below.
+PRED = [
+    'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of the'
+    ' final seconds on board Flight 9525. The Germanwings co-pilot says he had a "previous episode of severe'
+    " depression\" German airline confirms it knew of Andreas Lubitz's depression years before he took control.",
+    "The Palestinian Authority officially becomes the 123rd member of the International Criminal Court. The formal"
+    " accession was marked with a ceremony at The Hague, in the Netherlands. The Palestinians signed the ICC's"
+    " founding Rome Statute in January. Israel and the United States opposed the Palestinians' efforts to join the"
+    " body.",
+    "Amnesty International releases its annual report on the death penalty. The report catalogs the use of"
+    " state-sanctioned killing as a punitive measure across the globe. At least 607 people were executed around the"
+    " world in 2014, compared to 778 in 2013. The U.S. remains one of the worst offenders for imposing capital"
+    " punishment.",
+]
+
+# Three fixture texts passed as the second argument of calculate_rouge, aligned index-by-index with PRED.
+TGT = [
+    'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .'
+    ' Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz'
+    " had informed his Lufthansa training school of an episode of severe depression, airline says .",
+    "Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June ."
+    " Israel and the United States opposed the move, which could open the door to war crimes investigations against"
+    " Israelis .",
+    "Amnesty's annual death penalty report catalogs encouraging signs, but setbacks in numbers of those sentenced to"
+    " death . Organization claims that governments around the world are using the threat of terrorism to advance"
+    " executions . The number of executions worldwide has gone down by almost 22% compared with 2013, but death"
+    " sentences up by 28% .",
+]
+
+
+def test_disaggregated_scores_are_determinstic():
+    """The mean rouge2 F-measure must not depend on which other rouge keys are requested."""
+    both_keys = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2", "rougeL"])
+    assert isinstance(both_keys, defaultdict)
+    only_rouge2 = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2"])
+    mean_f_both_keys = pd.DataFrame(both_keys["rouge2"]).fmeasure.mean()
+    mean_f_only_rouge2 = pd.DataFrame(only_rouge2["rouge2"]).fmeasure.mean()
+    assert mean_f_both_keys == mean_f_only_rouge2
+
+
+def test_newline_cnn_improvement():
+    """rougeLsum computed with newline_sep=True must be higher than with newline_sep=False."""
+    key = "rougeLsum"
+    with_sep = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=[key])[key]
+    without_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=[key])[key]
+    assert with_sep > without_sep
+
+
+def test_newline_irrelevant_for_other_metrics():
+    """rouge1, rouge2 and rougeL must give equal results for both newline_sep settings."""
+    keys = ["rouge1", "rouge2", "rougeL"]
+    with_sep = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=keys)
+    without_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=keys)
+    assert with_sep == without_sep
+
+
+def test_single_sent_scores_dont_depend_on_newline_sep():
+    """For the short inputs below, calculate_rouge must return equal results for both newline_sep settings."""
+    pred = [
+        "Her older sister, Margot Frank, died in 1945, a month earlier than previously thought.",
+        'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .',
+    ]
+    tgt = [
+        "Margot Frank, died in 1945, a month earlier than previously thought.",
+        'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of'
+        " the final seconds on board Flight 9525.",
+    ]
+    with_sep = calculate_rouge(pred, tgt, newline_sep=True)
+    without_sep = calculate_rouge(pred, tgt, newline_sep=False)
+    assert with_sep == without_sep
+
+
+def test_pegasus_newline():
+    """On a prediction containing "<n>", rougeLsum with the default newline_sep must beat newline_sep=False."""
+    key = "rougeLsum"
+    pred = [
+        """" "a person who has such a video needs to immediately give it to the investigators," prosecutor says .<n> "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
+    ]
+    tgt = [
+        """ Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports . Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz had informed his Lufthansa training school of an episode of severe depression, airline says ."""
+    ]
+
+    score_without_sep = calculate_rouge(pred, tgt, rouge_keys=[key], newline_sep=False)[key]
+    score_default = calculate_rouge(pred, tgt, rouge_keys=[key])[key]
+    assert score_default > score_without_sep
+
+
+def test_rouge_cli():
+    """calculate_rouge_path returns a dict by default and a defaultdict with bootstrap_aggregation=False."""
+    # NOTE(review): relative path, resolved against the current working directory - confirm it
+    # still matches where the wmt_en_ro test data lives.
+    data_dir = Path("examples/seq2seq/test_data/wmt_en_ro")
+    source_path = data_dir.joinpath("test.source")
+    target_path = data_dir.joinpath("test.target")
+    aggregated = calculate_rouge_path(source_path, target_path)
+    assert isinstance(aggregated, dict)
+    disaggregated = calculate_rouge_path(source_path, target_path, bootstrap_aggregation=False)
+    assert isinstance(disaggregated, defaultdict)
--- a/examples/legacy/seq2seq/old_test_datasets.py
+++ b/examples/legacy/seq2seq/old_test_datasets.py
@ -0,0 +1,247 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from pathlib import Path
+
+import numpy as np
+import pytest
+from pack_dataset import pack_data_dir
+from parameterized import parameterized
+from save_len_file import save_len_file
+from torch.utils.data import DataLoader
+
+from transformers import AutoTokenizer
+from transformers.models.mbart.modeling_mbart import shift_tokens_right
+from transformers.testing_utils import TestCasePlus, slow
+from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
+
+
+# Checkpoint names handed to AutoTokenizer.from_pretrained / save_len_file in the tests below.
+BERT_BASE_CASED = "google-bert/bert-base-cased"
+PEGASUS_XSUM = "google/pegasus-xsum"
+# Fixture texts written to the *.source (ARTICLES) and *.target (SUMMARIES) files by make_test_data_dir.
+ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
+SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
+T5_TINY = "patrickvonplaten/t5-tiny-random"
+BART_TINY = "sshleifer/bart-tiny-random"
+MBART_TINY = "sshleifer/tiny-mbart"
+MARIAN_TINY = "sshleifer/tiny-marian-en-de"
+
+
+def _dump_articles(path: Path, articles: list):
+    """Write `articles` to `path`, one per line, without a trailing newline.
+
+    Args:
+        path: Destination file; an existing file is overwritten.
+        articles: Strings that are joined with a newline to form the file content.
+    """
+    content = "\n".join(articles)
+    # A context manager guarantees the data is flushed and the handle closed before callers
+    # read the file back. write() replaces writelines(), which iterated over the joined
+    # string one character at a time.
+    with Path(path).open("w") as f:
+        f.write(content)
+
+
+def make_test_data_dir(tmp_dir):
+    """Fill tmp_dir with {train,val,test}.source (ARTICLES) and .target (SUMMARIES) files and return it."""
+    fixtures = {"source": ARTICLES, "target": SUMMARIES}
+    for split in ("train", "val", "test"):
+        for suffix, lines in fixtures.items():
+            _dump_articles(os.path.join(tmp_dir, f"{split}.{suffix}"), lines)
+    return tmp_dir
+
+
+class TestAll(TestCasePlus):
+    @parameterized.expand(
+        [
+            MBART_TINY,
+            MARIAN_TINY,
+            T5_TINY,
+            BART_TINY,
+            PEGASUS_XSUM,
+        ],
+    )
+    @slow
+    def test_seq2seq_dataset_truncation(self, tok_name):
+        """Check that Seq2SeqDataset batches have the requested source and target widths.
+
+        For MBART_TINY the test additionally checks the positions of the language codes and
+        of the EOS token in the first example of the batch.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(tok_name)
+        tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
+        max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
+        max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
+        max_src_len = 4
+        max_tgt_len = 8
+        # NOTE(review): the target length is compared against max_src_len, not max_tgt_len - confirm intent.
+        assert max_len_target > max_src_len  # Will be truncated
+        assert max_len_source > max_src_len  # Will be truncated
+        src_lang, tgt_lang = "ro_RO", "de_DE"  # ignored for all but mbart, but never causes error.
+        train_dataset = Seq2SeqDataset(
+            tokenizer,
+            data_dir=tmp_dir,
+            type_path="train",
+            max_source_length=max_src_len,
+            max_target_length=max_tgt_len,  # ignored
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+        )
+        dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
+        for batch in dataloader:
+            assert isinstance(batch, dict)
+            assert batch["attention_mask"].shape == batch["input_ids"].shape
+            # show that articles were trimmed.
+            assert batch["input_ids"].shape[1] == max_src_len
+            # show that targets are the same len
+            assert batch["labels"].shape[1] == max_tgt_len
+            # The language-code checks below only apply to the mBART tokenizer.
+            if tok_name != MBART_TINY:
+                continue
+            # check language codes in correct place
+            batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tokenizer.pad_token_id)
+            assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
+            assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
+            assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
+            assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
+
+            break  # No need to test every batch
+
+    @parameterized.expand([BART_TINY, BERT_BASE_CASED])
+    def test_legacy_dataset_truncation(self, tok):
+        """Check batch widths produced by LegacySeq2SeqDataset.
+
+        The source width must equal the longest tokenized article (at most 20, the
+        max_source_length passed in) and the label width must equal trunc_target.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(tok)
+        tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
+        max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
+        max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
+        trunc_target = 4
+        train_dataset = LegacySeq2SeqDataset(
+            tokenizer,
+            data_dir=tmp_dir,
+            type_path="train",
+            max_source_length=20,
+            max_target_length=trunc_target,
+        )
+        dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
+        for batch in dataloader:
+            assert batch["attention_mask"].shape == batch["input_ids"].shape
+            # show that articles were trimmed.
+            assert batch["input_ids"].shape[1] == max_len_source
+            assert 20 >= batch["input_ids"].shape[1]  # trimmed significantly
+            # show that targets were truncated
+            assert batch["labels"].shape[1] == trunc_target  # Truncated
+            assert max_len_target > trunc_target  # Truncated
+            break  # No need to test every batch
+
+    def test_pack_dataset(self):
+        """pack_data_dir with max_tokens=128 must merge the two fixture lines of train.source into one.
+
+        Also checks that the packed line is as long as the original lines combined and that
+        the output directory contains the same file names as the input directory.
+        """
+        tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
+
+        tmp_dir = Path(make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()))
+        # Context managers close the handles instead of leaving that to garbage collection.
+        with tmp_dir.joinpath("train.source").open() as f:
+            orig_examples = f.readlines()
+        save_dir = Path(make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()))
+        pack_data_dir(tokenizer, tmp_dir, 128, save_dir)
+        orig_paths = {x.name for x in tmp_dir.iterdir()}
+        new_paths = {x.name for x in save_dir.iterdir()}
+        with save_dir.joinpath("train.source").open() as f:
+            packed_examples = f.readlines()
+        # orig: [' Sam ate lunch today.\n', 'Sams lunch ingredients.']
+        # desired_packed: [' Sam ate lunch today.\n Sams lunch ingredients.']
+        assert len(packed_examples) < len(orig_examples)
+        assert len(packed_examples) == 1
+        assert len(packed_examples[0]) == sum(len(x) for x in orig_examples)
+        assert orig_paths == new_paths
+
+    @pytest.mark.skipif(not FAIRSEQ_AVAILABLE, reason="This test requires fairseq")
+    def test_dynamic_batch_size(self):
+        """Check the batches produced by the dataset's dynamic sampler.
+
+        Batch sizes must vary, every example must appear exactly once, each batch size must
+        be a multiple of required_batch_size_multiple (or smaller than it), the first batch
+        must hold the most source tokens, and no batch may exceed max_tokens by more than 10%.
+        """
+        if not FAIRSEQ_AVAILABLE:
+            # Defensive guard in addition to the skipif marker on this method.
+            return
+        ds, max_tokens, _ = self._get_dataset(max_len=64)
+        required_batch_size_multiple = 64
+        batch_sampler = ds.make_dynamic_sampler(max_tokens, required_batch_size_multiple=required_batch_size_multiple)
+        batch_sizes = [len(x) for x in batch_sampler]
+        assert len(set(batch_sizes)) > 1  # it's not dynamic batch size if every batch is the same length
+        assert sum(batch_sizes) == len(ds)  # no dropped or added examples
+        data_loader = DataLoader(ds, batch_sampler=batch_sampler, collate_fn=ds.collate_fn, num_workers=2)
+        failures = []
+        num_src_per_batch = []
+        for batch in data_loader:
+            src_shape = batch["input_ids"].shape
+            bs = src_shape[0]
+            assert bs % required_batch_size_multiple == 0 or bs < required_batch_size_multiple
+            # np.prod replaces np.product, which was removed in NumPy 2.0.
+            num_src_tokens = np.prod(src_shape)
+            num_src_per_batch.append(num_src_tokens)
+            if num_src_tokens > (max_tokens * 1.1):
+                failures.append(num_src_tokens)
+        assert num_src_per_batch[0] == max(num_src_per_batch)
+        if failures:
+            raise AssertionError(f"too many tokens in {len(failures)} batches")
+
+    def test_sortish_sampler_reduces_padding(self):
+        """Batches drawn with the sortish sampler must contain fewer pad tokens than sequential batches."""
+        ds, _, tokenizer = self._get_dataset(max_len=512)
+        batch_size = 2
+        pad_id = tokenizer.pad_token_id
+
+        def total_pad_tokens(data_loader, k="input_ids"):
+            # Number of pad-token positions in field `k`, summed over all batches.
+            return sum(batch[k].eq(pad_id).sum().item() for batch in data_loader)
+
+        loader_kwargs = {"batch_size": batch_size, "collate_fn": ds.collate_fn, "num_workers": 2}
+        sortish_sampler = ds.make_sortish_sampler(batch_size, shuffle=False)
+        naive_dl = DataLoader(ds, **loader_kwargs)
+        sortish_dl = DataLoader(ds, sampler=sortish_sampler, **loader_kwargs)
+
+        assert total_pad_tokens(sortish_dl, k="labels") < total_pad_tokens(naive_dl, k="labels")
+        assert total_pad_tokens(sortish_dl) < total_pad_tokens(naive_dl)
+        assert len(sortish_dl) == len(naive_dl)
+
+    def _get_dataset(self, n_obs=1000, max_len=128):
+        """Build a Seq2SeqDataset over wmt_en_ro data with the MARIAN_TINY tokenizer.
+
+        Args:
+            n_obs: Forwarded to Seq2SeqDataset as `n_obs`.
+            max_len: Used for both max_source_length and max_target_length.
+
+        Returns:
+            Tuple (dataset, max_tokens, tokenizer); max_tokens is max_len * 2 * 64 when the
+            USE_REAL_DATA environment variable is set and max_len * 4 otherwise.
+        """
+        # NOTE(review): both data_dir values are relative paths, resolved against the current
+        # working directory - confirm they match the directory layout the tests run from.
+        if os.getenv("USE_REAL_DATA", None):
+            data_dir = "examples/seq2seq/wmt_en_ro"
+            max_tokens = max_len * 2 * 64
+            # Only generate the length file for the real data when it is missing.
+            if not Path(data_dir).joinpath("train.len").exists():
+                save_len_file(MARIAN_TINY, data_dir)
+        else:
+            data_dir = "examples/seq2seq/test_data/wmt_en_ro"
+            max_tokens = max_len * 4
+            # For the test data the length file is regenerated on every call.
+            save_len_file(MARIAN_TINY, data_dir)
+
+        tokenizer = AutoTokenizer.from_pretrained(MARIAN_TINY)
+        ds = Seq2SeqDataset(
+            tokenizer,
+            data_dir=data_dir,
+            type_path="train",
+            max_source_length=max_len,
+            max_target_length=max_len,
+            n_obs=n_obs,
+        )
+        return ds, max_tokens, tokenizer
+
+    def test_distributed_sortish_sampler_splits_indices_between_procs(self):
+        """Ranks 0 and 1 of a two-replica DistributedSortishSampler must yield disjoint index sets."""
+        ds, _, _ = self._get_dataset()
+        ids_per_rank = [
+            set(DistributedSortishSampler(ds, 256, num_replicas=2, rank=rank, add_extra_examples=False))
+            for rank in (0, 1)
+        ]
+        assert ids_per_rank[0].intersection(ids_per_rank[1]) == set()
+
+    @parameterized.expand(
+        [
+            MBART_TINY,
+            MARIAN_TINY,
+            T5_TINY,
+            BART_TINY,
+            PEGASUS_XSUM,
+        ],
+    )
+    def test_dataset_kwargs(self, tok_name):
+        """Check which entries Seq2SeqDataset.dataset_kwargs holds for each tokenizer.
+
+        MBART_TINY: contains src_lang and tgt_lang. BART_TINY: exactly one entry,
+        add_prefix_space. Any other tokenizer: empty.
+        """
+        tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
+        is_mbart = tok_name == MBART_TINY
+        # Language codes are only passed for the mBART tokenizer.
+        lang_kwargs = {"src_lang": "EN", "tgt_lang": "FR"} if is_mbart else {}
+        train_dataset = Seq2SeqDataset(
+            tokenizer,
+            data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
+            type_path="train",
+            max_source_length=4,
+            max_target_length=8,
+            **lang_kwargs,
+        )
+        kwargs = train_dataset.dataset_kwargs
+        if is_mbart:
+            assert "src_lang" in kwargs and "tgt_lang" in kwargs
+        elif tok_name == BART_TINY:
+            assert "add_prefix_space" in kwargs
+            assert len(kwargs) == 1
+        else:
+            assert "add_prefix_space" not in kwargs
+            assert len(kwargs) == 0
--- a/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
+++ b/examples/legacy/seq2seq/old_test_fsmt_bleu_score.py
@ -0,0 +1,70 @@
+# Copyright 2020 Huggingface
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import unittest
+
+from parameterized import parameterized
+
+from transformers import FSMTForConditionalGeneration, FSMTTokenizer
+from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
+from utils import calculate_bleu
+
+
+# Load the validation data once at import time; test_bleu_scores indexes it as
+# bleu_data[pair]["src"] / bleu_data[pair]["tgt"].
+filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"
+with open(filename, encoding="utf-8") as f:
+    bleu_data = json.load(f)
+
+
+@require_torch
+class ModelEvalTester(unittest.TestCase):
+    """BLEU regression test for the facebook/wmt19-* FSMT checkpoints."""
+
+    def get_tokenizer(self, mname):
+        """Return the FSMTTokenizer for checkpoint `mname`."""
+        return FSMTTokenizer.from_pretrained(mname)
+
+    def get_model(self, mname):
+        """Load checkpoint `mname` onto torch_device, converted to half precision on CUDA."""
+        model = FSMTForConditionalGeneration.from_pretrained(mname).to(torch_device)
+        if torch_device == "cuda":
+            model.half()
+        return model
+
+    # Each entry is [language pair, minimum BLEU score accepted for that pair].
+    @parameterized.expand(
+        [
+            ["en-ru", 26.0],
+            ["ru-en", 22.0],
+            ["en-de", 22.0],
+            ["de-en", 29.0],
+        ]
+    )
+    @slow
+    def test_bleu_scores(self, pair, min_bleu_score):
+        """Translate the validation sentences for `pair` and require BLEU >= min_bleu_score."""
+        # note: this test is not testing the best performance since it only evals a small batch
+        # but it should be enough to detect a regression in the output quality
+        mname = f"facebook/wmt19-{pair}"
+        tokenizer = self.get_tokenizer(mname)
+        model = self.get_model(mname)
+
+        src_sentences = bleu_data[pair]["src"]
+        tgt_sentences = bleu_data[pair]["tgt"]
+
+        batch = tokenizer(src_sentences, return_tensors="pt", truncation=True, padding="longest").to(torch_device)
+        outputs = model.generate(
+            input_ids=batch.input_ids,
+            num_beams=8,
+        )
+        decoded_sentences = tokenizer.batch_decode(
+            outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        scores = calculate_bleu(decoded_sentences, tgt_sentences)
+        print(scores)
+        self.assertGreaterEqual(scores["bleu"], min_bleu_score)
--- a/examples/legacy/seq2seq/old_test_seq2seq_examples.py
+++ b/examples/legacy/seq2seq/old_test_seq2seq_examples.py
@ -0,0 +1,132 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import os
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+from parameterized import parameterized
+from run_eval import run_generate
+from run_eval_search import run_search
+
+from transformers.testing_utils import CaptureStdout, TestCasePlus, slow
+from utils import ROUGE_KEYS
+
+
+# Configure the root logger at DEBUG level and keep a module-level handle to it.
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger()
+
+
+def _dump_articles(path: Path, articles: list):
+    """Write `articles` to `path`, one per line, without a trailing newline.
+
+    Args:
+        path: Destination file; an existing file is overwritten.
+        articles: Strings that are joined with a newline to form the file content.
+    """
+    content = "\n".join(articles)
+    # A context manager guarantees the data is flushed and the handle closed before callers
+    # read the file back. write() replaces writelines(), which iterated over the joined
+    # string one character at a time.
+    with Path(path).open("w") as f:
+        f.write(content)
+
+
+# Checkpoint names substituted into the command lines built by the tests below.
+T5_TINY = "patrickvonplaten/t5-tiny-random"
+BART_TINY = "sshleifer/bart-tiny-random"
+MBART_TINY = "sshleifer/tiny-mbart"
+
+# Send root-logger records to stdout as well.
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+logging.disable(logging.CRITICAL)  # remove noisy download output from tracebacks
+
+
+class TestTheRest(TestCasePlus):
+    def run_eval_tester(self, model):
+        """Call run_generate() with a patched sys.argv for `model` and check the output file exists.
+
+        T5_TINY is run with the translation_en_to_de task, every other model with summarization.
+        """
+        input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
+        output_file_name = input_file_name.parent / "utest_output.txt"
+        assert not output_file_name.exists()
+        articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
+        _dump_articles(input_file_name, articles)
+
+        score_path = str(Path(self.get_auto_remove_tmp_dir()) / "scores.json")
+        task = "translation_en_to_de" if model == T5_TINY else "summarization"
+        # NOTE(review): the first entry (argv[0]) names run_eval_search.py although run_generate
+        # is what gets called below - presumably only a placeholder program name; confirm.
+        testargs = f"""
+            run_eval_search.py
+            {model}
+            {input_file_name}
+            {output_file_name}
+            --score_path {score_path}
+            --task {task}
+            --num_beams 2
+            --length_penalty 2.0
+            """.split()
+
+        with patch.object(sys, "argv", testargs):
+            run_generate()
+            assert Path(output_file_name).exists()
+            # os.remove(Path(output_file_name))
+
+    # test one model to quickly (no-@slow) catch simple problems and do an
+    # extensive testing of functionality with multiple models as @slow separately
+    def test_run_eval(self):
+        """Run the run_eval_tester check with T5_TINY (not marked @slow)."""
+        self.run_eval_tester(T5_TINY)
+
+    # any extra models should go into the list here - can be slow
+    @parameterized.expand([BART_TINY, MBART_TINY])
+    @slow
+    def test_run_eval_slow(self, model):
+        """Run the run_eval_tester check for each model of the parameterized list."""
+        self.run_eval_tester(model)
+
+    # testing with 2 models to validate: 1. translation (t5) 2. summarization (mbart)
+    @parameterized.expand([T5_TINY, MBART_TINY])
+    @slow
+    def test_run_eval_search(self, model):
+        """Call run_search() over a small num_beams/length_penalty grid and check its stdout.
+
+        The captured output must contain the results-table header, the model name,
+        "Best score args" and the task's metric names ("bleu" for translation, ROUGE_KEYS
+        otherwise), must not contain "Info", and the output file must exist.
+        """
+        input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
+        output_file_name = input_file_name.parent / "utest_output.txt"
+        assert not output_file_name.exists()
+
+        text = {
+            "en": ["Machine learning is great, isn't it?", "I like to eat bananas", "Tomorrow is another great day!"],
+            "de": [
+                "Maschinelles Lernen ist großartig, oder?",
+                "Ich esse gerne Bananen",
+                "Morgen ist wieder ein toller Tag!",
+            ],
+        }
+
+        tmp_dir = Path(self.get_auto_remove_tmp_dir())
+        score_path = str(tmp_dir / "scores.json")
+        reference_path = str(tmp_dir / "val.target")
+        _dump_articles(input_file_name, text["en"])
+        _dump_articles(reference_path, text["de"])
+        task = "translation_en_to_de" if model == T5_TINY else "summarization"
+        testargs = f"""
+            run_eval_search.py
+            {model}
+            {str(input_file_name)}
+            {str(output_file_name)}
+            --score_path {score_path}
+            --reference_path {reference_path}
+            --task {task}
+            """.split()
+        # Appended separately because the search spec contains a space and must stay one argument.
+        testargs.extend(["--search", "num_beams=1:2 length_penalty=0.9:1.0"])
+
+        with patch.object(sys, "argv", testargs):
+            with CaptureStdout() as cs:
+                run_search()
+            expected_strings = [" num_beams | length_penalty", model, "Best score args"]
+            un_expected_strings = ["Info"]
+            if "translation" in task:
+                expected_strings.append("bleu")
+            else:
+                expected_strings.extend(ROUGE_KEYS)
+            for w in expected_strings:
+                assert w in cs.out
+            for w in un_expected_strings:
+                assert w not in cs.out
+            assert Path(output_file_name).exists()
+            os.remove(Path(output_file_name))
--- a/examples/legacy/seq2seq/old_test_seq2seq_examples_multi_gpu.py
+++ b/examples/legacy/seq2seq/old_test_seq2seq_examples_multi_gpu.py
@ -0,0 +1,55 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Multi-GPU tests live in a separate module: their complexity could impact other tests, and isolating them makes debugging easier.
+
+import os
+import sys
+
+from transformers.testing_utils import TestCasePlus, execute_subprocess_async, get_gpu_count, require_torch_gpu, slow
+
+from .utils import load_json
+
+
+class TestSummarizationDistillerMultiGPU(TestCasePlus):
+    """Runs run_distributed_eval.py in a subprocess on every available GPU and checks the BLEU score."""
+
+    @classmethod
+    def setUpClass(cls):
+        # NOTE(review): performs no setup; it only returns the class - confirm the override is still needed.
+        return cls
+
+    @slow
+    @require_torch_gpu
+    def test_distributed_eval(self):
+        """Launch distributed evaluation of Helsinki-NLP/opus-mt-en-ro and require BLEU >= 25."""
+        output_dir = self.get_auto_remove_tmp_dir()
+        args = f"""
+            --model_name Helsinki-NLP/opus-mt-en-ro
+            --save_dir {output_dir}
+            --data_dir {self.test_file_dir_str}/test_data/wmt_en_ro
+            --num_beams 2
+            --task translation
+        """.split()
+
+        # we want this test to run even if there is only one GPU, but if there are more we use them all
+        n_gpu = get_gpu_count()
+        distributed_args = f"""
+            -m torch.distributed.launch
+            --nproc_per_node={n_gpu}
+            {self.test_file_dir}/run_distributed_eval.py
+        """.split()
+        cmd = [sys.executable] + distributed_args + args
+        execute_subprocess_async(cmd, env=self.get_env())
+
+        # The metrics are read back from test_bleu.json inside the save directory.
+        metrics_save_path = os.path.join(output_dir, "test_bleu.json")
+        metrics = load_json(metrics_save_path)
+        # print(metrics)
+        self.assertGreaterEqual(metrics["bleu"], 25)
--- a/examples/legacy/seq2seq/old_test_tatoeba_conversion.py
+++ b/examples/legacy/seq2seq/old_test_tatoeba_conversion.py
@ -0,0 +1,38 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import tempfile
+import unittest
+
+from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
+from transformers.testing_utils import slow
+from transformers.utils import cached_property
+
+
+@unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
+class TatoebaConversionTester(unittest.TestCase):
+    """Tests for TatoebaConverter; skipped unless the DEFAULT_REPO path exists."""
+
+    @cached_property
+    def resolver(self):
+        """TatoebaConverter saving into a temporary directory that is removed after the test."""
+        # TemporaryDirectory plus addCleanup replaces mkdtemp(), whose directory was never deleted.
+        tmp_dir = tempfile.TemporaryDirectory()
+        self.addCleanup(tmp_dir.cleanup)
+        return TatoebaConverter(save_dir=tmp_dir.name)
+
+    @slow
+    def test_resolver(self):
+        """Converting the heb-eng model must complete without raising."""
+        self.resolver.convert_models(["heb-eng"])
+
+    @slow
+    def test_model_card(self):
+        """The metadata returned for opus-mt-he-en (dry run) must report the long pair heb-eng."""
+        content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
+        assert mmeta["long_pair"] == "heb-eng"
--- a/examples/legacy/seq2seq/pack_dataset.py
+++ b/examples/legacy/seq2seq/pack_dataset.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fill examples with bitext up to max_tokens without breaking up examples.
+[['I went', 'yo fui'],
+['to the store', 'a la tienda']
+]
+=> ['I went to the store', 'yo fui a la tienda']
+"""
+
+import argparse
+import shutil
+from pathlib import Path
+
+from tqdm import tqdm
+
+from transformers import AutoTokenizer
+
+
+def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
+    """Greedily merge consecutive (source, target) pairs into longer examples.
+
+    Pairs are visited in their given order. Each pair is appended (separated by a single space) to the
+    example being built, unless the merged source or the merged target would tokenize to more than
+    `max_tokens` tokens; in that case the current example is finalized and the pair starts a new one.
+
+    Args:
+        tok: tokenizer callable; `tok(text, return_tensors="pt").input_ids.shape[1]` is used as the token count.
+        src_examples: iterable of source strings.
+        tgt_examples: iterable of target strings, paired positionally with `src_examples` via `zip`.
+        max_tokens: maximum number of tokens allowed for a merged source or target.
+
+    Returns:
+        `(finished_src, finished_tgt)`: two lists of packed strings of equal length. Both are empty when
+        there are no input pairs.
+    """
+    finished_src, finished_tgt = [], []
+
+    examples = list(zip(src_examples, tgt_examples))
+    if not examples:
+        # Nothing to pack; indexing `examples[0]` below would raise IndexError.
+        return finished_src, finished_tgt
+    new_src, new_tgt = examples[0]
+
+    def is_too_big(strang):
+        return tok(strang, return_tensors="pt").input_ids.shape[1] > max_tokens
+
+    for src, tgt in tqdm(examples[1:]):
+        cand_src = new_src + " " + src
+        cand_tgt = new_tgt + " " + tgt
+        if is_too_big(cand_src) or is_too_big(cand_tgt):  # can't fit, finalize example
+            finished_src.append(new_src)
+            finished_tgt.append(new_tgt)
+            new_src, new_tgt = src, tgt
+        else:  # can fit, keep adding
+            new_src, new_tgt = cand_src, cand_tgt
+
+    # cleanup: flush the example that was still being built when the loop ended
+    if new_src:
+        assert new_tgt
+        finished_src.append(new_src)
+        finished_tgt.append(new_tgt)
+    return finished_src, finished_tgt
+
+
+def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
+    """Pack the train split of `data_dir` with `pack_examples` and copy the val/test splits unchanged.
+
+    Args:
+        tok: tokenizer forwarded to `pack_examples`.
+        data_dir: directory holding `{train,val,test}.source` and `{train,val,test}.target`.
+        max_tokens: token budget forwarded to `pack_examples`.
+        save_path: output directory; created if missing (its parent must already exist).
+    """
+    save_path = Path(save_path)
+    save_path.mkdir(exist_ok=True)
+    for split in ["train"]:
+        src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
+        # Context managers close the inputs deterministically instead of leaking the handles.
+        with Path(src_path).open() as src_file:
+            src_docs = [x.rstrip() for x in src_file]
+        with Path(tgt_path).open() as tgt_file:
+            tgt_docs = [x.rstrip() for x in tgt_file]
+        packed_src, packed_tgt = pack_examples(tok, src_docs, tgt_docs, max_tokens)
+        print(f"packed {split} split from {len(src_docs)} examples -> {len(packed_src)}.")
+        # write_text opens, writes and closes, so the output is flushed before the function returns.
+        Path(save_path / f"{split}.source").write_text("\n".join(packed_src))
+        Path(save_path / f"{split}.target").write_text("\n".join(packed_tgt))
+    for split in ["val", "test"]:
+        src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
+        shutil.copyfile(src_path, save_path / f"{split}.source")
+        shutil.copyfile(tgt_path, save_path / f"{split}.target")
+
+
+def packer_cli():
+    """Command-line entry point: parse the options, load the tokenizer and run `pack_data_dir`."""
+    cli = argparse.ArgumentParser()
+    option_specs = (
+        ("--tok_name", {"type": str, "help": "like facebook/bart-large-cnn,google-t5/t5-base, etc."}),
+        ("--max_seq_len", {"type": int, "default": 128}),
+        ("--data_dir", {"type": str}),
+        ("--save_path", {"type": str}),
+    )
+    for flag, spec in option_specs:
+        cli.add_argument(flag, **spec)
+    opts = cli.parse_args()
+    tok = AutoTokenizer.from_pretrained(opts.tok_name)
+    # --max_seq_len is what pack_data_dir receives as its max_tokens budget.
+    return pack_data_dir(tok, Path(opts.data_dir), opts.max_seq_len, opts.save_path)
+
+
+if __name__ == "__main__":
+    packer_cli()
--- a/examples/legacy/seq2seq/requirements.txt
+++ b/examples/legacy/seq2seq/requirements.txt
@ -0,0 +1,20 @@
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.1.3
+fire
+pytest<8.0.1
+conllu
+sentencepiece != 0.1.92
+protobuf
--- a/examples/legacy/seq2seq/romanian_postprocessing.md
+++ b/examples/legacy/seq2seq/romanian_postprocessing.md
@ -0,0 +1,65 @@
+### Motivation
+Without post-processing, the English -> Romanian model mbart-large-en-ro gets a BLEU score of 26.8 on the WMT data.
+With post-processing, it can score 37.
+Here is the post-processing code, stolen from @mjpost in this [issue](https://github.com/pytorch/fairseq/issues/1758)
+
+
+
+### Instructions
+Note: You need to have your test_generations.txt before you start this process.
+(1) Setup `mosesdecoder` and `wmt16-scripts`
+```bash
+cd $HOME
+git clone git@github.com:moses-smt/mosesdecoder.git
+cd mosesdecoder  
+git clone git@github.com:rsennrich/wmt16-scripts.git
+```
+
+(2) Define a function for post-processing.
+It replaces Unicode punctuation, normalizes punctuation, removes non-printing characters, normalizes Romanian text, removes diacritics, and tokenizes.
+```bash
+ro_post_process () {
+  sys=$1
+  ref=$2
+  export MOSES_PATH=$HOME/mosesdecoder
+  REPLACE_UNICODE_PUNCT=$MOSES_PATH/scripts/tokenizer/replace-unicode-punctuation.perl
+  NORM_PUNC=$MOSES_PATH/scripts/tokenizer/normalize-punctuation.perl
+  REM_NON_PRINT_CHAR=$MOSES_PATH/scripts/tokenizer/remove-non-printing-char.perl
+  REMOVE_DIACRITICS=$MOSES_PATH/wmt16-scripts/preprocess/remove-diacritics.py
+  NORMALIZE_ROMANIAN=$MOSES_PATH/wmt16-scripts/preprocess/normalise-romanian.py
+  TOKENIZER=$MOSES_PATH/scripts/tokenizer/tokenizer.perl
+
+
+
+  lang=ro
+  for file in $sys $ref; do
+    cat $file \
+    | $REPLACE_UNICODE_PUNCT \
+    | $NORM_PUNC -l $lang \
+    | $REM_NON_PRINT_CHAR \
+    | $NORMALIZE_ROMANIAN \
+    | $REMOVE_DIACRITICS \
+    | $TOKENIZER -no-escape -l $lang \
+    > $(basename $file).tok
+  done
+  # compute BLEU
+  cat $(basename $sys).tok | sacrebleu -tok none -s none -b $(basename $ref).tok
+}
+```
+
+(3) Call the function on test_generations.txt and test.target
+For example,
+```bash
+ro_post_process enro_finetune/test_generations.txt wmt_en_ro/test.target
+```
+This will print a new BLEU score and write the post-processed outputs to `test_generations.txt.tok` (and the post-processed references to `test.target.tok`) in the current directory.
+
+
+
+
+
+
+
+
+
+
--- a/examples/legacy/seq2seq/rouge_cli.py
+++ b/examples/legacy/seq2seq/rouge_cli.py
@ -0,0 +1,31 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fire
+
+from utils import calculate_rouge, save_json
+
+
+def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
+    """Score the predictions in `pred_path` against the references in `tgt_path` with `calculate_rouge`.
+
+    Both files are read one example per line, with surrounding whitespace stripped. The references are
+    truncated to the number of predictions.
+
+    Args:
+        pred_path: path of the predictions file.
+        tgt_path: path of the references file.
+        save_path: if not None, the metrics are also written there with `save_json`.
+        **kwargs: passed through to `calculate_rouge`.
+
+    Returns:
+        Whatever `calculate_rouge` returns.
+    """
+    # `with` closes both files promptly; bare `open(...)` inside a comprehension leaks the handle.
+    with open(pred_path) as pred_file:
+        pred_lns = [x.strip() for x in pred_file]
+    with open(tgt_path) as tgt_file:
+        tgt_lns = [x.strip() for x in tgt_file][: len(pred_lns)]
+    metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
+    if save_path is not None:
+        save_json(metrics, save_path, indent=None)
+    return metrics  # these print nicely
+
+
+if __name__ == "__main__":
+    fire.Fire(calculate_rouge_path)
--- a/examples/legacy/seq2seq/run_distributed_eval.py
+++ b/examples/legacy/seq2seq/run_distributed_eval.py
@ -0,0 +1,261 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import shutil
+import time
+from json import JSONDecodeError
+from logging import getLogger
+from pathlib import Path
+from typing import Optional
+
+import torch
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from utils import (
+    Seq2SeqDataset,
+    calculate_bleu,
+    calculate_rouge,
+    chunks,
+    lmap,
+    load_json,
+    parse_numeric_n_bool_cl_kwargs,
+    save_json,
+    use_task_specific_params,
+    write_txt_file,
+)
+
+
+logger = getLogger(__name__)
+
+
+def eval_data_dir(
+    data_dir,
+    save_dir: str,
+    model_name: str,
+    bs: int = 8,
+    max_source_length: int = 1024,
+    type_path="val",
+    n_obs=None,
+    fp16=False,
+    task="summarization",
+    local_rank=None,
+    num_return_sequences=1,
+    dataset_kwargs: Optional[dict] = None,
+    prefix="",
+    **generate_kwargs,
+) -> tuple[list[dict], int]:
+    """Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json
+
+    Args:
+        data_dir: dataset directory handed to `Seq2SeqDataset`.
+        save_dir: directory that receives this rank's `rank_{local_rank}_output.json`.
+        model_name: name or path given to `AutoModelForSeq2SeqLM` / `AutoTokenizer.from_pretrained`.
+        bs: batch size.
+        max_source_length: source length passed to the dataset; `None` falls back to
+            `tokenizer.model_max_length`.
+        type_path: dataset split name passed to `Seq2SeqDataset`.
+        n_obs: passed to `Seq2SeqDataset`.
+        fp16: if true, the model is converted with `.half()`.
+        task: passed to `use_task_specific_params`.
+        local_rank: rank of this process; must not be None.
+        num_return_sequences: sequences generated per input; `num_beams` is raised to at least this value.
+        dataset_kwargs: extra keyword arguments for `Seq2SeqDataset`; `None` means none.
+        prefix: passed to the dataset; when `None`, `model.config.prefix` is used if set, else `""`.
+        **generate_kwargs: forwarded to `model.generate` (`num_beams` is extracted and handled here).
+
+    Returns:
+        `(results, num_replicas)`, where `results` is a list of `{"pred": ..., "id": ...}` records for the
+        examples handled by this rank and `num_replicas` comes from the distributed sampler.
+    """
+    model_name = str(model_name)
+    assert local_rank is not None
+    torch.distributed.init_process_group(backend="nccl", rank=local_rank)
+
+    save_dir = Path(save_dir)
+    save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
+    torch.cuda.set_device(local_rank)
+    model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
+    if fp16:
+        model = model.half()
+    # determine if we need to increase num_beams
+    use_task_specific_params(model, task)  # update config with task specific params
+    num_beams = generate_kwargs.pop("num_beams", model.config.num_beams)  # AttributeError risk?
+    if num_return_sequences > num_beams:
+        num_beams = num_return_sequences
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name)
+    logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.
+
+    if max_source_length is None:
+        max_source_length = tokenizer.model_max_length
+    if prefix is None:
+        # Only an explicit None (not the "" default) defers to the model config.
+        prefix = getattr(model.config, "prefix", "") or ""
+    if dataset_kwargs is None:
+        # The declared default is None, which cannot be unpacked with ** below.
+        dataset_kwargs = {}
+    ds = Seq2SeqDataset(
+        tokenizer,
+        data_dir,
+        max_source_length,
+        max_target_length=1024,
+        type_path=type_path,
+        n_obs=n_obs,
+        prefix=prefix,
+        **dataset_kwargs,
+    )
+    # I set shuffle=True for a more accurate progress bar.
+    # If all the longest samples are first, the prog bar estimate is too high at the beginning.
+    sampler = ds.make_sortish_sampler(bs, distributed=True, add_extra_examples=False, shuffle=True)
+    data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
+    results = []
+    for batch in tqdm(data_loader):
+        summaries = model.generate(
+            input_ids=batch["input_ids"].to(model.device),
+            attention_mask=batch["attention_mask"].to(model.device),
+            num_return_sequences=num_return_sequences,
+            num_beams=num_beams,
+            **generate_kwargs,
+        )
+        preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        ids = batch["ids"]
+        if num_return_sequences > 1:
+            preds = chunks(preds, num_return_sequences)  # batch size chunks, each of size num_return_seq
+        # Each record keeps the example id so that rank 0 can restore the original order later.
+        for i, pred in enumerate(preds):
+            results.append({"pred": pred, "id": ids[i].item()})
+    save_json(results, save_path)
+    return results, sampler.num_replicas
+
+
+def run_generate():
+    """Command-line entry point for distributed evaluation.
+
+    Every process parses the arguments, runs `eval_data_dir` on its share of the data and writes its
+    records to `<save_dir>_tmp/rank_<local_rank>_output.json`. Arguments not declared below are parsed by
+    `parse_numeric_n_bool_cl_kwargs` and forwarded to `eval_data_dir` as generate kwargs.
+
+    Processes with `local_rank <= 0` then wait for all rank files, merge them, and either
+
+    - save the merged predictions to `<save_dir>/pseudolabel_results.json` when
+      `--num_return_sequences > 1`, or
+    - score them against `<data_dir>/<type_path>.target` (BLEU if "translation" is in `--task`, ROUGE
+      otherwise) and write `<type_path>_<metric>.json` and `<type_path>_generations.txt` to `<save_dir>`.
+
+    Raises:
+        ValueError: if `<save_dir>_tmp` already contains `rank_*.json` files.
+    """
+    parser = argparse.ArgumentParser(
+        epilog="Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate"
+    )
+    parser.add_argument("--data_dir", type=str, help="like cnn_dm/test.source")
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        help="like facebook/bart-large-cnn,google-t5/t5-base, etc.",
+        default="sshleifer/distilbart-xsum-12-3",
+    )
+    parser.add_argument("--save_dir", type=str, help="where to save", default="tmp_gen")
+    parser.add_argument("--max_source_length", type=int, default=None)
+    parser.add_argument(
+        "--type_path", type=str, default="test", help="which subset to evaluate typically train/val/test"
+    )
+    parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
+    parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
+    parser.add_argument(
+        "--local_rank", type=int, default=-1, required=False, help="should be passed by distributed.launch"
+    )
+
+    parser.add_argument(
+        "--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all."
+    )
+    parser.add_argument(
+        "--num_return_sequences", type=int, default=1, required=False, help="How many sequences to return"
+    )
+    parser.add_argument(
+        "--sync_timeout",
+        type=int,
+        default=600,
+        required=False,
+        help="How long should master process wait for other processes to finish.",
+    )
+    parser.add_argument("--src_lang", type=str, default=None, required=False)
+    parser.add_argument("--tgt_lang", type=str, default=None, required=False)
+    parser.add_argument(
+        "--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
+    )
+    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--debug", action="store_true")
+    # The clock starts before generation, so the reported seconds_per_sample covers the whole run.
+    start_time = time.time()
+    args, rest = parser.parse_known_args()
+    generate_kwargs = parse_numeric_n_bool_cl_kwargs(rest)
+    if generate_kwargs and args.local_rank <= 0:
+        print(f"parsed the following generate kwargs: {generate_kwargs}")
+    # Per-rank outputs go to a sibling "<save_dir>_tmp" directory.
+    json_save_dir = Path(args.save_dir + "_tmp")
+    Path(json_save_dir).mkdir(exist_ok=True)  # this handles locking.
+    # Leftover rank files from an earlier run would be mistaken for results of this one.
+    intermediate_files = list(json_save_dir.glob("rank_*.json"))
+    if intermediate_files:
+        raise ValueError(f"Found files at {json_save_dir} please move or remove them.")
+        # In theory, a node could finish and save before another node hits this. If this happens, we can address later.
+    # Only language options that were actually given are forwarded to the dataset.
+    dataset_kwargs = {}
+    if args.src_lang is not None:
+        dataset_kwargs["src_lang"] = args.src_lang
+    if args.tgt_lang is not None:
+        dataset_kwargs["tgt_lang"] = args.tgt_lang
+
+    Path(args.save_dir).mkdir(exist_ok=True)
+    results, num_replicas = eval_data_dir(
+        args.data_dir,
+        json_save_dir,
+        args.model_name,
+        type_path=args.type_path,
+        bs=args.bs,
+        fp16=args.fp16,
+        task=args.task,
+        local_rank=args.local_rank,
+        n_obs=args.n_obs,
+        max_source_length=args.max_source_length,
+        num_return_sequences=args.num_return_sequences,
+        prefix=args.prefix,
+        dataset_kwargs=dataset_kwargs,
+        **generate_kwargs,
+    )
+
+    # Only the process with local_rank <= 0 aggregates; the others are done once their file is saved.
+    if args.local_rank <= 0:
+        save_dir = Path(args.save_dir)
+        save_dir.mkdir(exist_ok=True)
+        partial_results = gather_results_from_each_node(num_replicas, json_save_dir, args.sync_timeout)
+        preds = combine_partial_results(partial_results)
+        if args.num_return_sequences > 1:
+            # Several candidates per input: save them as-is and skip scoring.
+            save_path = save_dir.joinpath("pseudolabel_results.json")
+            print(f"Saving aggregated results at {save_path}, intermediate in {json_save_dir}/")
+            save_json(preds, save_path)
+            return
+        tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
+        with open(tgt_file) as f:
+            # References are truncated to the number of predictions.
+            labels = [x.rstrip() for x in f][: len(preds)]
+
+        # Calculate metrics, save metrics,  and save _generations.txt
+        calc_bleu = "translation" in args.task
+        score_fn = calculate_bleu if calc_bleu else calculate_rouge
+        metric_name = "bleu" if calc_bleu else "rouge"
+        metrics: dict = score_fn(preds, labels)
+        metrics["n_obs"] = len(preds)
+        runtime = time.time() - start_time
+        metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
+        metrics["n_gpus"] = num_replicas
+        # TODO(@stas00): add whatever metadata to metrics
+        metrics_save_path = save_dir.joinpath(f"{args.type_path}_{metric_name}.json")
+        save_json(metrics, metrics_save_path, indent=None)
+        print(metrics)
+        write_txt_file(preds, save_dir.joinpath(f"{args.type_path}_generations.txt"))
+        if args.debug:
+            # Debug mode keeps the per-rank files and also writes the references next to the generations.
+            write_txt_file(labels, save_dir.joinpath(f"{args.type_path}.target"))
+        else:
+            shutil.rmtree(json_save_dir)
+
+
+def combine_partial_results(partial_results) -> list:
+    """Flatten the per-rank record lists and return their predictions ordered by record id."""
+    merged = [record for chunk in partial_results for record in chunk]
+    merged.sort(key=lambda record: record["id"])
+    return [record["pred"] for record in merged]
+
+
+def gather_results_from_each_node(num_replicas, save_dir, timeout) -> list[list[dict]]:
+    """Wait until every rank has written its `rank_*.json` file to `save_dir`, then load them all.
+
+    Args:
+        num_replicas: number of rank files to wait for.
+        save_dir: `Path` of the directory the ranks write to.
+        timeout: maximum number of seconds to wait.
+
+    Returns:
+        The loaded content of every rank file (via `lmap(load_json, ...)`), one entry per file.
+
+    Raises:
+        TimeoutError: if the files are not all present and parseable within `timeout` seconds.
+    """
+    start_wait = time.time()
+    logger.info("waiting for all nodes to finish")
+    while (time.time() - start_wait) < timeout:
+        json_files = list(save_dir.glob("rank_*.json"))
+        if len(json_files) >= num_replicas:
+            try:
+                # A JSONDecodeError means some file is not fully saved yet; poll again.
+                return lmap(load_json, json_files)
+            except JSONDecodeError:
+                pass
+        # Pause between polls instead of spinning on the filesystem at full CPU.
+        time.sleep(0.1)
+    raise TimeoutError("Rank 0 gave up on waiting for other processes")
+
+
+if __name__ == "__main__":
+    # Usage for MT:
+    run_generate()
--- a/examples/legacy/seq2seq/run_eval.py
+++ b/examples/legacy/seq2seq/run_eval.py
@ -0,0 +1,184 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import datetime
+import json
+import time
+import warnings
+from logging import getLogger
+from pathlib import Path
+
+import torch
+from tqdm import tqdm
+
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from utils import calculate_bleu, calculate_rouge, chunks, parse_numeric_n_bool_cl_kwargs, use_task_specific_params
+
+
+logger = getLogger(__name__)
+
+
+DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+def generate_summaries_or_translations(
+    examples: list[str],
+    out_file: str,
+    model_name: str,
+    batch_size: int = 8,
+    device: str = DEFAULT_DEVICE,
+    fp16=False,
+    task="summarization",
+    prefix=None,
+    **generate_kwargs,
+) -> dict:
+    """Save model.generate results to <out_file>, and return how long it took.
+
+    Args:
+        examples: input texts, processed in chunks of `batch_size`.
+        out_file: path of the output file; one decoded hypothesis is written per line.
+        model_name: name or path given to `AutoModelForSeq2SeqLM` / `AutoTokenizer.from_pretrained`.
+        batch_size: number of examples per `generate` call.
+        device: device the model and the tokenized batches are moved to.
+        fp16: if true, the model is converted with `.half()`.
+        task: passed to `use_task_specific_params`.
+        prefix: string prepended to every example; `None` uses `model.config.prefix` if set, else `""`.
+        **generate_kwargs: forwarded to `model.generate`.
+
+    Returns:
+        A dict with `n_obs`, `runtime` (whole seconds, measured after model and tokenizer loading) and
+        `seconds_per_sample` (`0.0` when `examples` is empty).
+    """
+    # As before, the output file is opened (and truncated) before the model is loaded; the context
+    # manager additionally guarantees it is closed if loading or generation raises.
+    with Path(out_file).open("w", encoding="utf-8") as fout:
+        model_name = str(model_name)
+        model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
+        if fp16:
+            model = model.half()
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        logger.info(f"Inferred tokenizer type: {tokenizer.__class__}")  # if this is wrong, check config.model_type.
+
+        start_time = time.time()
+        # update config with task specific params
+        use_task_specific_params(model, task)
+        if prefix is None:
+            prefix = getattr(model.config, "prefix", "") or ""
+        for examples_chunk in tqdm(list(chunks(examples, batch_size))):
+            examples_chunk = [prefix + text for text in examples_chunk]
+            batch = tokenizer(examples_chunk, return_tensors="pt", truncation=True, padding="longest").to(device)
+            summaries = model.generate(
+                input_ids=batch.input_ids,
+                attention_mask=batch.attention_mask,
+                **generate_kwargs,
+            )
+            dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            for hypothesis in dec:
+                fout.write(hypothesis + "\n")
+                # Flushed per line so that partial output is on disk while generation is still running.
+                fout.flush()
+    runtime = int(time.time() - start_time)  # seconds
+    n_obs = len(examples)
+    # Guard the division: an empty `examples` list would otherwise raise ZeroDivisionError.
+    seconds_per_sample = round(runtime / n_obs, 4) if n_obs else 0.0
+    return {"n_obs": n_obs, "runtime": runtime, "seconds_per_sample": seconds_per_sample}
+
+
+def datetime_now():
+    """Return the current local date and time formatted as ``YYYY-MM-DD HH:MM:SS``."""
+    now = datetime.datetime.now()
+    return f"{now:%Y-%m-%d %H:%M:%S}"
+
+
+def run_generate(verbose=True):
+    """
+    Generate an output for every line of ``input_path`` and, if a reference file is given, score the outputs.
+
+    Arguments are read from ``sys.argv``. Options not declared below (e.g. ``--num_beams=2``) are parsed by
+    ``parse_numeric_n_bool_cl_kwargs`` and passed on to ``model.generate``.
+
+    The scores are BLEU when ``--task`` contains ``translation`` and ROUGE otherwise. They are saved as JSON
+    to ``--score_path``, returned to the caller, and printed out unless ``verbose=False`` is passed.
+
+    Args:
+        verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout
+
+    Returns:
+        An empty dict when ``--reference_path`` is not given. Otherwise a dict of scores and runtime metrics,
+        e.g. ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``, extended with
+        the custom generate params (e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``) if ``--dump-args`` is
+        set and with an ``info`` entry if ``--info`` is set.
+
+    Raises:
+        ValueError: if ``--fp16`` is combined with ``--device cpu``.
+    """
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,google-t5/t5-base, etc.")
+    parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
+    parser.add_argument("save_path", type=str, help="where to save summaries")
+    parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test.target")
+    parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
+    parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
+    parser.add_argument(
+        "--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
+    )
+    parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
+    parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
+    parser.add_argument(
+        "--n_obs", type=int, default=-1, required=False, help="How many observations. Defaults to all."
+    )
+    parser.add_argument("--fp16", action="store_true")
+    parser.add_argument("--dump-args", action="store_true", help="print the custom hparams with the results")
+    parser.add_argument(
+        "--info",
+        nargs="?",
+        type=str,
+        const=datetime_now(),
+        help=(
+            "use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g."
+            " lang=en-ru. If no value is passed, the current datetime string will be used."
+        ),
+    )
+    # Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
+    args, rest = parser.parse_known_args()
+    parsed_args = parse_numeric_n_bool_cl_kwargs(rest)
+    if parsed_args and verbose:
+        print(f"parsed the following generate kwargs: {parsed_args}")
+    # Inputs get a leading space when the model name contains "t5".
+    with open(args.input_path) as input_file:
+        examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in input_file]
+    if args.n_obs > 0:
+        examples = examples[: args.n_obs]
+    # parents=True also creates missing intermediate directories of the output location.
+    Path(args.save_path).parent.mkdir(parents=True, exist_ok=True)
+
+    # Scores are only written when a reference is given (see the early return below), so that is the
+    # case in which an existing score file is about to be overwritten.
+    if args.reference_path is not None and Path(args.score_path).exists():
+        warnings.warn(f"score_path {args.score_path} will be overwritten unless you type ctrl-c.")
+
+    if args.device == "cpu" and args.fp16:
+        # this mix leads to RuntimeError: "threshold_cpu" not implemented for 'Half'
+        raise ValueError("Can't mix --fp16 and --device cpu")
+
+    runtime_metrics = generate_summaries_or_translations(
+        examples,
+        args.save_path,
+        args.model_name,
+        batch_size=args.bs,
+        device=args.device,
+        fp16=args.fp16,
+        task=args.task,
+        prefix=args.prefix,
+        **parsed_args,
+    )
+
+    if args.reference_path is None:
+        return {}
+
+    # Compute scores
+    score_fn = calculate_bleu if "translation" in args.task else calculate_rouge
+    with open(args.save_path) as output_file:
+        output_lns = [x.rstrip() for x in output_file]
+    with open(args.reference_path) as reference_file:
+        # References are truncated to the number of generated lines.
+        reference_lns = [x.rstrip() for x in reference_file][: len(output_lns)]
+    scores: dict = score_fn(output_lns, reference_lns)
+    scores.update(runtime_metrics)
+
+    if args.dump_args:
+        scores.update(parsed_args)
+    if args.info:
+        scores["info"] = args.info
+
+    if verbose:
+        print(scores)
+
+    if args.score_path is not None:
+        # The context manager flushes and closes the score file before returning.
+        with open(args.score_path, "w") as score_file:
+            json.dump(scores, score_file)
+
+    return scores
+
+
+if __name__ == "__main__":
+    # Usage for MT:
+    # python run_eval.py MODEL_NAME $DATA_DIR/test.source $save_dir/test_translations.txt --reference_path $DATA_DIR/test.target --score_path $save_dir/test_bleu.json  --task translation $@
+    run_generate(verbose=True)
--- a/examples/legacy/seq2seq/run_eval_search.py
+++ b/examples/legacy/seq2seq/run_eval_search.py
@ -0,0 +1,158 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import itertools
+import operator
+import sys
+from collections import OrderedDict
+
+from run_eval import datetime_now, run_generate
+
+from utils import ROUGE_KEYS
+
+
+# A table of supported tasks and the list of scores in the order of importance to be sorted by.
+# To add a new task, simply list the score names that `run_eval.run_generate()` returns
+task_score_names = {
+    "translation": ["bleu"],
+    "summarization": ROUGE_KEYS,
+}
+
+
+def parse_search_arg(search):
+    """Expand a ``--search`` string into every combination of the listed hparam values.
+
+    Args:
+        search: whitespace-separated ``name=v1:v2:...`` groups, e.g.
+            ``"num_beams=5:10 length_penalty=0.8:1.0"``.
+
+    Returns:
+        ``(matrix, entry_names)``: ``matrix`` is a list with one entry per combination, each a list of
+        ``"--name value"`` strings in the order the names were given; ``entry_names`` is the list of names.
+    """
+    groups = search.split()
+    # Split on the first "=" only, so that a value may itself contain "=".
+    entries = dict(g.split("=", 1) for g in groups)
+    entry_names = list(entries.keys())
+    sets = [[f"--{k} {v}" for v in vs.split(":")] for k, vs in entries.items()]
+    matrix = [list(x) for x in itertools.product(*sets)]
+    return matrix, entry_names
+
+
+def run_search():
+    """
+     Run parametric search over the desired hparam space with help of ``run_eval.py``.
+
+     All the arguments except ``--search`` are passed to ``run_eval.py`` as is. The values inside of "--search" are parsed, reformatted and fed to ``run_eval.py`` as additional args.
+
+    The format for the ``--search`` value is a simple string with hparams and colon separated values to try, e.g.:
+    ```
+     --search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
+    ```
+    which will generate ``12`` ``(2*3*2)`` searches for a product of each hparam. For example the example that was just used will invoke ``run_eval.py`` repeatedly with:
+
+    ```
+     --num_beams 5 --length_penalty 0.8 --early_stopping true
+     --num_beams 5 --length_penalty 0.8 --early_stopping false
+     [...]
+     --num_beams 10 --length_penalty 1.2 --early_stopping false
+    ```
+
+    On completion, this function prints a markdown table of the results sorted by the task's scores (BLEU for
+    translation tasks, the ROUGE keys otherwise), best first, followed by the winning arguments.
+
+    Returns:
+        The list of result rows (score columns first, then the hparams of that run), best first.
+    """
+    prog = sys.argv[0]
+
+    parser = argparse.ArgumentParser(
+        usage=(
+            "\n\nImportant: this script accepts all arguments `run_eval.py` accepts and then a few extra, therefore"
+            " refer to `run_eval.py -h` for the complete list."
+        )
+    )
+    parser.add_argument(
+        "--search",
+        type=str,
+        required=False,
+        help='param space to search, e.g. "num_beams=5:10 length_penalty=0.8:1.0:1.2"',
+    )
+    parser.add_argument(
+        "--bs", type=int, default=8, required=False, help="initial batch size (may get reduced if it's too big)"
+    )
+    parser.add_argument("--task", type=str, help="used for task_specific_params + metrics")
+    parser.add_argument(
+        "--info",
+        nargs="?",
+        type=str,
+        const=datetime_now(),
+        help=(
+            "add custom notes to be printed before the results table. If no value is passed, the current datetime"
+            " string will be used."
+        ),
+    )
+    args, args_main = parser.parse_known_args()
+    # Both values are dereferenced below; report a usage error instead of a TypeError/AttributeError.
+    if args.task is None:
+        parser.error("--task is required")
+    if args.search is None:
+        parser.error("--search is required")
+    # we share some of the args
+    args_main.extend(["--task", args.task])
+    args_normal = [prog] + args_main
+
+    # to support variations like translation_en_to_de"
+    task = "translation" if "translation" in args.task else "summarization"
+    score_names = task_score_names[task]
+
+    matrix, col_names = parse_search_arg(args.search)
+    col_names[0:0] = score_names  # score cols first
+    col_widths = {col: len(str(col)) for col in col_names}
+    results = []
+    for r in matrix:
+        hparams = dict(x.replace("--", "").split() for x in r)
+        args_exp = " ".join(r).split()
+        args_exp.extend(["--bs", str(args.bs)])  # in case we need to reduce its size due to CUDA OOM
+        # run_generate() reads its arguments from sys.argv, so the command line is rebuilt for every run.
+        sys.argv = args_normal + args_exp
+
+        # XXX: need to trap CUDA OOM and lower args.bs if that happens and retry
+
+        scores = run_generate(verbose=False)
+        # make sure scores are first in the table
+        result = OrderedDict()
+        for score in score_names:
+            result[score] = scores[score]
+        result.update(hparams)
+        results.append(result)
+
+        # find widest entries
+        for k, v in result.items():
+            l = len(str(v))
+            if l > col_widths[k]:
+                col_widths[k] = l
+
+    results_sorted = sorted(results, key=operator.itemgetter(*score_names), reverse=True)
+    print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
+    print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
+    for row in results_sorted:
+        print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))
+
+    # Skip the score columns without deleting them: the best row is part of the returned list and
+    # must keep its scores.
+    best = results_sorted[0]
+    best_args = [f"--{k} {v}" for k, v in best.items() if k not in score_names]
+    dyn_args = ["--bs", str(args.bs)]
+    if args.info:
+        print(f"\nInfo: {args.info}")
+    print("\nBest score args:")
+    print(" ".join(args_main + best_args + dyn_args))
+
+    return results_sorted
+
+
+if __name__ == "__main__":
+    # Usage:
+    # [normal-run_eval_search.py cmd plus] \
+    # --search="num_beams=1:5:10 length_penalty=0.8:1:1.2 early_stopping=true:false"
+    #
+    # Example:
+    # PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_NAME \
+    # $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target \
+    # --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation \
+    # --search="num_beams=1:5:10 length_penalty=0.8:1:1.2 early_stopping=true:false"
+    run_search()
--- a/examples/legacy/seq2seq/save_len_file.py
+++ b/examples/legacy/seq2seq/save_len_file.py
@ -0,0 +1,56 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fire
+from torch.utils.data import DataLoader
+from tqdm import tqdm
+
+from transformers import AutoTokenizer
+from utils import Seq2SeqDataset, pickle_save
+
+
+def save_len_file(
+    tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
+):
+    """Compute one length per example for the "train" and "val" splits and pickle them for dynamic batching.
+
+    The stored length is the number of non-pad source tokens, or max(src_len, tgt_len) when
+    `consider_target` is true. Each list is written to the `len_file` of its dataset.
+
+    Args:
+        tokenizer_name: name or path handed to `AutoTokenizer.from_pretrained`.
+        data_dir: data directory handed to `Seq2SeqDataset`.
+        max_source_length: forwarded to `Seq2SeqDataset`.
+        max_target_length: forwarded to `Seq2SeqDataset`.
+        consider_target: also take the target length into account (see above).
+        **kwargs: forwarded to `Seq2SeqDataset`.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+    train_ds = Seq2SeqDataset(
+        tokenizer, data_dir, max_source_length, max_target_length, type_path="train", **kwargs
+    )
+    pad_id = tokenizer.pad_token_id
+
+    def get_lens(dataset):
+        # Batch through the dataset and count the non-pad tokens of every row.
+        loader = DataLoader(
+            dataset, batch_size=512, num_workers=8, shuffle=False, collate_fn=dataset.collate_fn
+        )
+        lengths = []
+        for batch in tqdm(loader, desc=str(dataset.len_file)):
+            source_lengths = batch["input_ids"].ne(pad_id).sum(1).tolist()
+            target_lengths = batch["labels"].ne(pad_id).sum(1).tolist()
+            if not consider_target:
+                lengths.extend(source_lengths)
+                continue
+            lengths.extend(max(pair) for pair in zip(source_lengths, target_lengths))
+        return lengths
+
+    train_lens = get_lens(train_ds)
+    val_ds = Seq2SeqDataset(
+        tokenizer, data_dir, max_source_length, max_target_length, type_path="val", **kwargs
+    )
+    val_lens = get_lens(val_ds)
+
+    # Both splits are measured before anything is written.
+    for lens, dataset in ((train_lens, train_ds), (val_lens, val_ds)):
+        pickle_save(lens, dataset.len_file)
+
+
+if __name__ == "__main__":
+    fire.Fire(save_len_file)
--- a/examples/legacy/seq2seq/save_randomly_initialized_model.py
+++ b/examples/legacy/seq2seq/save_randomly_initialized_model.py
@ -0,0 +1,39 @@
+#!/usr/bin/env python
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import fire
+
+from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+
+def save_randomly_initialized_version(config_name: str, save_dir: str, **config_kwargs):
+    """Build a model from a pretrained config (no pretrained weights are loaded) and save it with its tokenizer.
+
+    Args:
+        config_name: which config to use
+        save_dir: where to save the resulting model and tokenizer
+        config_kwargs: Passed to AutoConfig
+
+    Returns:
+        The model created from the config.
+
+    Usage::
+        save_randomly_initialized_version("facebook/bart-large-cnn", "distilbart_random_cnn_6_3", encoder_layers=6, decoder_layers=3, num_beams=3)
+    """
+    config = AutoConfig.from_pretrained(config_name, **config_kwargs)
+    random_model = AutoModelForSeq2SeqLM.from_config(config)
+    random_model.save_pretrained(save_dir)
+
+    # The tokenizer is loaded from the same name and stored next to the model.
+    tokenizer = AutoTokenizer.from_pretrained(config_name)
+    tokenizer.save_pretrained(save_dir)
+    return random_model
+
+
+if __name__ == "__main__":
+    fire.Fire(save_randomly_initialized_version)
--- a/examples/legacy/seq2seq/sentence_splitter.py
+++ b/examples/legacy/seq2seq/sentence_splitter.py
@ -0,0 +1,35 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+
+from filelock import FileLock
+
+
+# nltk is treated as optional: record whether it can be imported instead of failing when this module is imported.
+try:
+    import nltk
+
+    NLTK_AVAILABLE = True
+except (ImportError, ModuleNotFoundError):
+    NLTK_AVAILABLE = False
+
+if NLTK_AVAILABLE:
+    # Download the "punkt" data while holding a file lock on ".lock" (a path relative to the current directory).
+    # NOTE(review): presumably the lock serializes the download across concurrent processes -- confirm.
+    with FileLock(".lock") as lock:
+        nltk.download("punkt", quiet=True)
+
+
+def add_newline_to_end_of_each_sentence(x: str) -> str:
+    """Strip the Pegasus "<n>" marker from `x` and return its sentences separated by newlines.
+
+    This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS.
+
+    Args:
+        x: the text to split into sentences.
+
+    Returns:
+        The sentences produced by `nltk.sent_tokenize`, joined with a newline character.
+
+    Raises:
+        AssertionError: if nltk could not be imported.
+    """
+    # `re.sub` returns a new string and leaves its input untouched, so the result has to be kept;
+    # without the assignment the "<n>" markers were never actually removed.
+    x = re.sub("<n>", "", x)  # remove pegasus newline char
+    assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
+    return "\n".join(nltk.sent_tokenize(x))
--- a/examples/legacy/seq2seq/seq2seq_trainer.py
+++ b/examples/legacy/seq2seq/seq2seq_trainer.py
@ -0,0 +1,248 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+from torch.utils.data import DistributedSampler, RandomSampler
+
+from transformers import PreTrainedModel, Trainer, logging
+from transformers.models.fsmt.configuration_fsmt import FSMTConfig
+from transformers.optimization import (
+    Adafactor,
+    get_constant_schedule,
+    get_constant_schedule_with_warmup,
+    get_cosine_schedule_with_warmup,
+    get_cosine_with_hard_restarts_schedule_with_warmup,
+    get_linear_schedule_with_warmup,
+    get_polynomial_decay_schedule_with_warmup,
+)
+from transformers.trainer_pt_utils import get_tpu_sampler
+from transformers.training_args import ParallelMode
+from transformers.utils import is_torch_xla_available
+
+
+logger = logging.get_logger(__name__)
+
+# Maps each accepted `lr_scheduler` name to the `transformers.optimization` factory that builds the scheduler.
+# `Seq2SeqTrainer._get_lr_scheduler` looks the factory up in this table.
+arg_to_scheduler = {
+    "linear": get_linear_schedule_with_warmup,
+    "cosine": get_cosine_schedule_with_warmup,
+    "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
+    "polynomial": get_polynomial_decay_schedule_with_warmup,
+    "constant": get_constant_schedule,
+    "constant_w_warmup": get_constant_schedule_with_warmup,
+}
+
+
+class Seq2SeqTrainer(Trainer):
+    """
+    `Trainer` subclass used by the legacy seq2seq examples.
+
+    It provides its own optimizer/scheduler creation, train sampler selection, loss computation (plain, pad-ignoring
+    or label-smoothed) and a prediction step that can call `generate()`.
+    """
+
+    def __init__(self, config=None, data_args=None, *args, **kwargs):
+        """
+        Args:
+            config: model configuration. When `None`, `self.model.config` is used and the model has to be a
+                `PreTrainedModel`.
+            data_args: optional object holding the data options this class reads (`ignore_pad_token_for_loss`,
+                `val_max_target_length`, `eval_beams`).
+            *args, **kwargs: forwarded unchanged to `Trainer.__init__`.
+
+        Note: `config` and `data_args` are the first two positional parameters, so any positional argument meant for
+        `Trainer` would bind to them first.
+        """
+        super().__init__(*args, **kwargs)
+
+        if config is None:
+            assert isinstance(self.model, PreTrainedModel), (
+                "If no `config` is passed the model to be trained has to be of type `PreTrainedModel`, but is"
+                f" {self.model.__class__}"
+            )
+            self.config = self.model.config
+        else:
+            self.config = config
+
+        self.data_args = data_args
+        # For FSMT configs the target vocabulary size (`tgt_vocab_size`) is used.
+        self.vocab_size = self.config.tgt_vocab_size if isinstance(self.config, FSMTConfig) else self.config.vocab_size
+
+        # Label smoothing and pad-ignoring loss both pass the pad token id to the loss function.
+        if self.args.label_smoothing != 0 or (self.data_args is not None and self.data_args.ignore_pad_token_for_loss):
+            assert self.config.pad_token_id is not None, (
+                "Make sure that `config.pad_token_id` is correctly defined when ignoring `pad_token` for loss"
+                " calculation or doing label smoothing."
+            )
+
+        # This only logs; the EOS fallback itself is applied in `_pad_tensors_to_max_len`.
+        if self.config.pad_token_id is None and self.config.eos_token_id is not None:
+            logger.warning(
+                f"The `config.pad_token_id` is `None`. Using `config.eos_token_id` = {self.config.eos_token_id} for"
+                " padding.."
+            )
+
+        if self.args.label_smoothing == 0:
+            # NOTE(review): when `pad_token_id` is `None` this passes `ignore_index=None` -- confirm that is intended.
+            self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
+        else:
+            # dynamically import label_smoothed_nll_loss
+            from utils import label_smoothed_nll_loss
+
+            self.loss_fn = label_smoothed_nll_loss
+
+    def create_optimizer_and_scheduler(self, num_training_steps: int):
+        """
+        Setup the optimizer and the learning rate scheduler.
+
+        We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
+        Trainer's init through :obj:`optimizers`, or subclass and override this method in a subclass.
+        """
+        if self.optimizer is None:
+            # Parameters whose name contains one of these substrings are put in the group without weight decay.
+            no_decay = ["bias", "LayerNorm.weight"]
+            optimizer_grouped_parameters = [
+                {
+                    "params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
+                    "weight_decay": self.args.weight_decay,
+                },
+                {
+                    "params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
+                    "weight_decay": 0.0,
+                },
+            ]
+            if self.args.adafactor:
+                optimizer_cls = Adafactor
+                optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
+            else:
+                optimizer_cls = torch.optim.AdamW
+                optimizer_kwargs = {
+                    "betas": (self.args.adam_beta1, self.args.adam_beta2),
+                    "eps": self.args.adam_epsilon,
+                }
+            # The learning rate is set explicitly for both optimizer choices.
+            optimizer_kwargs["lr"] = self.args.learning_rate
+            self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
+
+        if self.lr_scheduler is None:
+            self.lr_scheduler = self._get_lr_scheduler(num_training_steps)
+        else:  # ignoring --lr_scheduler
+            logger.warning("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.")
+
+    def _get_lr_scheduler(self, num_training_steps):
+        """
+        Build the scheduler named by `self.args.lr_scheduler` for `self.optimizer`.
+
+        The name is looked up in `arg_to_scheduler`, so an unknown name raises `KeyError`.
+        """
+        schedule_func = arg_to_scheduler[self.args.lr_scheduler]
+        if self.args.lr_scheduler == "constant":
+            # called with the optimizer only
+            scheduler = schedule_func(self.optimizer)
+        elif self.args.lr_scheduler == "constant_w_warmup":
+            # called with the warmup steps but not the total number of steps
+            scheduler = schedule_func(self.optimizer, num_warmup_steps=self.args.warmup_steps)
+        else:
+            scheduler = schedule_func(
+                self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
+            )
+        return scheduler
+
+    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
+        """
+        Return the sampler for the training set: `None` for an `IterableDataset`, the TPU sampler when torch XLA is
+        available, otherwise a `RandomSampler` or a `DistributedSampler`.
+        """
+        if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
+            return None
+        elif is_torch_xla_available():
+            return get_tpu_sampler(self.train_dataset)
+        else:
+            if self.args.sortish_sampler:
+                # NOTE(review): the result of `make_sortish_sampler(...)` is not used; the method falls through to
+                # the Random/Distributed sampler below either way. Presumably the sortish sampler was meant to be
+                # returned here -- confirm against `make_sortish_sampler`.
+                self.train_dataset.make_sortish_sampler(
+                    self.args.per_device_train_batch_size,
+                    distributed=(self.args.parallel_mode == ParallelMode.DISTRIBUTED),
+                )
+
+            return (
+                RandomSampler(self.train_dataset)
+                if self.args.local_process_index == -1
+                else DistributedSampler(self.train_dataset)
+            )
+
+    def _compute_loss(self, model, inputs, labels):
+        """
+        Run `model` on `inputs` and return `(loss, logits)`.
+
+        Without label smoothing the loss is either `self.loss_fn` applied to the flattened logits and labels (when
+        `data_args.ignore_pad_token_for_loss` is set) or the loss the model returns when it is given `labels`. With
+        label smoothing, `self.loss_fn` is applied to the log-softmax of the logits.
+        """
+        if self.args.label_smoothing == 0:
+            if self.data_args is not None and self.data_args.ignore_pad_token_for_loss:
+                # force training to ignore pad token
+                logits = model(**inputs, use_cache=False)[0]
+                loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
+            else:
+                # compute usual loss via models
+                loss, logits = model(**inputs, labels=labels, use_cache=False)[:2]
+        else:
+            # compute label smoothed loss
+            logits = model(**inputs, use_cache=False)[0]
+            lprobs = torch.nn.functional.log_softmax(logits, dim=-1)
+            loss, _ = self.loss_fn(lprobs, labels, self.args.label_smoothing, ignore_index=self.config.pad_token_id)
+        return loss, logits
+
+    def compute_loss(self, model, inputs):
+        """Remove `labels` from `inputs` (the dict is modified in place) and return only the loss."""
+        labels = inputs.pop("labels")
+        loss, _ = self._compute_loss(model, inputs, labels)
+        return loss
+
+    def prediction_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, Union[torch.Tensor, Any]],
+        prediction_loss_only: bool,
+        ignore_keys: Optional[list[str]] = None,
+    ) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+        """
+        Perform an evaluation step on :obj:`model` using :obj:`inputs`.
+
+        Subclass and override to inject custom behavior.
+
+        Args:
+            model (:obj:`nn.Module`):
+                The model to evaluate.
+            inputs (:obj:`dict[str, Union[torch.Tensor, Any]]`):
+                The inputs and targets of the model.
+
+                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
+                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
+            prediction_loss_only (:obj:`bool`):
+                Whether or not to return the loss only. Note that the body reads
+                :obj:`self.args.prediction_loss_only`, not this argument.
+            ignore_keys (:obj:`list[str]`, `optional`):
+                Accepted but not used by this implementation.
+
+        Return:
+            tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
+            A tuple with the loss, logits and labels (each being optional). When
+            :obj:`self.args.predict_with_generate` is set, the second element holds the generated tokens instead of
+            the logits.
+        """
+        inputs = self._prepare_inputs(inputs)
+
+        # Generation settings come from `data_args` when available, otherwise from the model config.
+        gen_kwargs = {
+            "max_length": self.data_args.val_max_target_length
+            if self.data_args is not None
+            else self.config.max_length,
+            "num_beams": self.data_args.eval_beams if self.data_args is not None else self.config.num_beams,
+        }
+
+        if self.args.predict_with_generate and not self.args.prediction_loss_only:
+            # NOTE(review): generation uses `self.model` while the loss below uses the `model` argument -- confirm.
+            generated_tokens = self.model.generate(
+                inputs["input_ids"],
+                attention_mask=inputs["attention_mask"],
+                **gen_kwargs,
+            )
+            # in case the batch is shorter than max length, the output should be padded
+            if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
+                generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
+
+        labels = inputs.pop("labels")
+        with torch.no_grad():
+            # compute loss on predict data
+            loss, logits = self._compute_loss(model, inputs, labels)
+
+        loss = loss.mean().detach()
+        if self.args.prediction_loss_only:
+            return (loss, None, None)
+
+        # `generated_tokens` is only defined when the generation branch above ran, which the early return guarantees.
+        logits = generated_tokens if self.args.predict_with_generate else logits
+
+        if labels.shape[-1] < gen_kwargs["max_length"]:
+            labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
+
+        return (loss, logits, labels)
+
+    def _pad_tensors_to_max_len(self, tensor, max_length):
+        """
+        Return a `(tensor.shape[0], max_length)` tensor filled with the pad id, with `tensor` copied into its leading
+        columns.
+
+        Raises:
+            ValueError: if neither `config.pad_token_id` nor `config.eos_token_id` is defined.
+        """
+        # If PAD token is not defined at least EOS token has to be defined
+        pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else self.config.eos_token_id
+
+        if pad_token_id is None:
+            raise ValueError(
+                "Make sure that either `config.pad_token_id` or `config.eos_token_id` is defined if tensor has to be"
+                f" padded to `max_length`={max_length}"
+            )
+
+        padded_tensor = pad_token_id * torch.ones(
+            (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
+        )
+        # Both callers only pad tensors that are shorter than `max_length`, so the slice below always fits.
+        padded_tensor[:, : tensor.shape[-1]] = tensor
+        return padded_tensor
--- a/examples/legacy/seq2seq/seq2seq_training_args.py
+++ b/examples/legacy/seq2seq/seq2seq_training_args.py
@ -0,0 +1,60 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from dataclasses import dataclass, field
+from typing import Optional
+
+from seq2seq_trainer import arg_to_scheduler
+
+from transformers import TrainingArguments
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class Seq2SeqTrainingArguments(TrainingArguments):
+    """
+    `TrainingArguments` extended with the options used by the legacy seq2seq examples.
+
+    Parameters:
+        label_smoothing (:obj:`float`, `optional`, defaults to 0):
+            The label smoothing epsilon to apply (if not zero).
+        sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to use SortishSampler or not. It sorts the inputs according to lengths in order to minimize the
+            padding size.
+        predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to use generate to calculate generative metrics (ROUGE, BLEU).
+        adafactor (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether to use Adafactor.
+        encoder_layerdrop (:obj:`float`, `optional`, defaults to :obj:`None`):
+            Encoder layer dropout probability. Goes into model.config.
+        decoder_layerdrop (:obj:`float`, `optional`, defaults to :obj:`None`):
+            Decoder layer dropout probability. Goes into model.config.
+        dropout (:obj:`float`, `optional`, defaults to :obj:`None`):
+            Dropout probability. Goes into model.config.
+        attention_dropout (:obj:`float`, `optional`, defaults to :obj:`None`):
+            Attention dropout probability. Goes into model.config.
+        lr_scheduler (:obj:`str`, `optional`, defaults to :obj:`"linear"`):
+            Which lr scheduler to use. The help text lists the keys of :obj:`arg_to_scheduler`.
+    """
+
+    label_smoothing: Optional[float] = field(
+        default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
+    )
+    sortish_sampler: bool = field(default=False, metadata={"help": "Whether to SortishSampler or not."})
+    predict_with_generate: bool = field(
+        default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
+    )
+    adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
+    encoder_layerdrop: Optional[float] = field(
+        default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
+    )
+    decoder_layerdrop: Optional[float] = field(
+        default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
+    )
+    dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
+    attention_dropout: Optional[float] = field(
+        default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
+    )
+    # The help text is built when the class is defined, from the scheduler table imported from seq2seq_trainer.
+    lr_scheduler: Optional[str] = field(
+        default="linear",
+        metadata={"help": f"Which lr scheduler to use. Selected in {sorted(arg_to_scheduler.keys())}"},
+    )
--- a/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py
+++ b/examples/legacy/seq2seq/test_data/fsmt/build-eval-data.py
@ -0,0 +1,32 @@
+#!/usr/bin/env python
+
+import json
+import subprocess
+
+
+# [source language, target language] pairs; each becomes the `-l src-tgt` argument of a sacrebleu call.
+pairs = [
+    ["en", "ru"],
+    ["ru", "en"],
+    ["en", "de"],
+    ["de", "en"],
+]
+
+# Number of leading lines kept per language pair and side.
+n_objs = 8
+
+
+def _sacrebleu_echo(pair, side):
+    """Return the lines sacrebleu echoes for the wmt19 test set of `pair`; `side` is "src" or "ref"."""
+    cmd = ["sacrebleu", "-t", "wmt19", "-l", pair, "--echo", side]
+    # check=True makes a failing sacrebleu run raise instead of silently yielding an empty line list.
+    completed = subprocess.run(cmd, stdout=subprocess.PIPE, check=True)
+    return completed.stdout.decode("utf-8").splitlines()
+
+
+def get_all_data(pairs, n_objs):
+    """Collect the first `n_objs` source and reference lines of wmt19 for every language pair.
+
+    Args:
+        pairs: iterable of (source language, target language) pairs.
+        n_objs: number of leading lines to keep per side.
+
+    Returns:
+        A dict mapping "src-tgt" to {"src": [...], "tgt": [...]}.
+
+    Raises:
+        subprocess.CalledProcessError: if a sacrebleu invocation exits with a non-zero status.
+    """
+    text = {}
+    for src, tgt in pairs:
+        pair = f"{src}-{tgt}"
+        text[pair] = {
+            "src": _sacrebleu_echo(pair, "src")[:n_objs],
+            "tgt": _sacrebleu_echo(pair, "ref")[:n_objs],
+        }
+    return text
+
+
+# Fetch the evaluation data and write it as UTF-8 JSON into the current directory.
+text = get_all_data(pairs, n_objs)
+filename = "./fsmt_val_data.json"
+with open(filename, "w", encoding="utf-8") as f:
+    # json.dump writes to `f` and returns None, so there is no result worth binding to a name.
+    json.dump(text, f, indent=2, ensure_ascii=False)
--- a/Show More
+++ b/Show More