Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-06 13:34:37 +08:00)

Compare commits (55 commits): check_run_ ... check_clea
| SHA1 |
|---|
| 193acecc87 |
| bb65d2d953 |
| 57bdb4a680 |
| 1a0ae4bb81 |
| 5689dd6b8e |
| 571352d378 |
| 2418196ef4 |
| 561233cabf |
| 36b640562b |
| 0c4a202408 |
| 20396951af |
| 3c4cdd549d |
| 020e713ac8 |
| 371ef0f4a2 |
| 6efc1799c1 |
| 325810e7fc |
| 9a19171fad |
| 26fca86312 |
| 900cf9d33b |
| 154d5101a4 |
| e3d4fa692e |
| dd4e048e75 |
| 6ff4fabd9d |
| 6d4450e341 |
| aee5c2384a |
| 5b6c209bc5 |
| 258c76e4dc |
| 64397a8301 |
| cd309610c0 |
| dd8f231495 |
| 1619a3475f |
| ff0f7d6498 |
| 80305364e2 |
| a623cda427 |
| 7d5160bd7a |
| 22e39dfb31 |
| 63fbd50fb4 |
| b433ec8b50 |
| 3c16c1ae43 |
| b9f90dc388 |
| 37a6296283 |
| 0ed6d51ae8 |
| 8fb854cac8 |
| a0bf5a82ee |
| 6fb6d3c0fb |
| 5f8d02f2f1 |
| 4418728dfa |
| 0a8ab33f7a |
| 90d1b67db1 |
| 02c324f43f |
| b47b35637f |
| e7e7eca06b |
| cad7eeeb5e |
| 76fc50a152 |
| a43b36cf80 |
.github/workflows/benchmark.yml (vendored) — 2 changed lines

@@ -52,7 +52,7 @@ jobs:
commit_id=$GITHUB_SHA
fi
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
.github/workflows/build-docker-images.yml (vendored) — 35 changed lines

@@ -78,7 +78,7 @@ jobs:
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=update_dockerfile
REF=main
PYTORCH=2.8.0
TORCHCODEC=0.7.0
FLASH_ATTN=yes

@@ -97,7 +97,7 @@ jobs:
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx

@@ -200,6 +200,37 @@ jobs:
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

cache-latest-pytorch-amd:
name: "Cache Latest Pytorch (AMD) Image"
needs: latest-pytorch-amd
runs-on:
group: amd-mi325-1gpu
steps:
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}

-
name: Pull and save docker image to cache
run: |
image="huggingface/transformers-pytorch-amd-gpu"
final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar"
tmp_path="${final_path}.tmp"

echo "Pulling image: ${image}"
docker pull "${image}"

echo "Saving to temp file: ${tmp_path}"
docker save "${image}" -o "${tmp_path}"

echo "Moving to final path: ${final_path}"
mv -f "${tmp_path}" "${final_path}"

echo "Cache populated successfully at ${final_path}"

latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
.github/workflows/check_failed_tests.yml (vendored) — 144 changed lines

@@ -6,9 +6,6 @@ on:
docker:
required: true
type: string
start_sha:
required: true
type: string
job:
required: true
type: string

@@ -24,7 +21,13 @@ on:
commit_sha:
required: false
type: string

pr_number:
required: false
type: string
outputs:
report:
description: "Content of the report of new failures"
value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }}

env:
HF_HOME: /mnt/cache

@@ -61,13 +64,15 @@ jobs:
- name: Check file
id: check_file
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
if [ -f "ci_results_${job}/new_failures.json" ]; then
echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
echo "process=true" >> $GITHUB_ENV
echo "process=true" >> $GITHUB_OUTPUT
else
echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
echo "process=false" >> $GITHUB_ENV
echo "process=false" >> $GITHUB_OUTPUT
fi

@@ -88,27 +93,62 @@ jobs:
echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
fi

if [ -f setup_values/other_workflow_run_id.txt ]; then
echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
else
echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
fi

- name: Update clone
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
git fetch origin "$commit_sha" && git checkout "$commit_sha"

- name: Get target commit
- name: Get `START_SHA`
working-directory: /transformers/utils
if: ${{ env.process == 'true' }}
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
echo "START_SHA=$commit_sha" >> $GITHUB_ENV

- name: Checkout to `start_sha`
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: git fetch && git checkout ${{ inputs.start_sha }}
# This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
- name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
id: pr_info
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
uses: actions/github-script@v6
with:
script: |
const { data: pr } = await github.rest.pulls.get({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: ${{ inputs.pr_number }}
});

const { data: merge_commit } = await github.rest.repos.getCommit({
owner: pr.base.repo.owner.login,
repo: pr.base.repo.name,
ref: pr.merge_commit_sha,
});

core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);

# Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
# (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
- name: Get `END_SHA` from previous CI runs of the same workflow
working-directory: /transformers/utils
if: ${{ env.process == 'true' && inputs.pr_number == '' }}
env:
ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
run: |
echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV

# However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
# parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
# see if a reported failing test is actually ONLY failing on the `merge_commit`.
- name: Set `END_SHA`
if: ${{ env.process == 'true' && inputs.pr_number != '' }}
env:
merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
run: |
echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers

@@ -138,14 +178,20 @@ jobs:
- name: Check failed tests
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
env:
job: ${{ inputs.job }}
run_idx: ${{ matrix.run_idx }}
run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"

- name: Show results
working-directory: /transformers
if: ${{ env.process == 'true' }}
env:
job: ${{ inputs.job }}
run_idx: ${{ matrix.run_idx }}
run: |
ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
cat "new_failures_with_bad_commit_${job}_${run_idx}.json"

- name: Upload artifacts
uses: actions/upload-artifact@v4

@@ -159,6 +205,8 @@ jobs:
if: needs.check_new_failures.outputs.process == 'true'
runs-on:
group: aws-g5-4xlarge-cache
outputs:
report: ${{ steps.set_output.outputs.report }}
container:
image: ${{ inputs.docker }}
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

@@ -176,32 +224,28 @@ jobs:

- name: Check files
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
ls -la /transformers
ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
ls -la "/transformers/new_failures_with_bad_commit_${job}"

# Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
# to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
- name: Merge files
shell: bash
working-directory: /transformers
env:
job: ${{ inputs.job }}
run: |
cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json

- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}

- name: Process report
shell: bash
working-directory: /transformers
env:
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
JOB_NAME: ${{ inputs.job }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
python3 utils/process_bad_commit_report.py
git fetch origin "$commit_sha" && git checkout "$commit_sha"

- name: Process report
shell: bash

@@ -218,11 +262,37 @@ jobs:
echo EOF
} >> "$GITHUB_ENV"

- name: Prepare Slack report title
# The output is useful if a caller needs more processing, for example, we have a chain
# self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml),
# and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page.
- name: Show results & Set outputs
id: set_output
working-directory: /transformers
run: |
ls -l new_failures_with_bad_commit.json
cat new_failures_with_bad_commit.json

{
echo 'report<<EOF'
cat new_failures_with_bad_commit.json
echo '' # Force a newline
echo EOF
} >> "$GITHUB_OUTPUT"

- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: new_failures_with_bad_commit_${{ inputs.job }}
path: /transformers/new_failures_with_bad_commit.json

- name: Prepare Slack report title
working-directory: /transformers
env:
ci_event: ${{ inputs.ci_event }}
job: ${{ inputs.job }}
run: |
pip install slack_sdk
echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

- name: Send processed report
if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
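The hunks above repeat one pattern: values that used to be expanded with `${{ ... }}` directly inside `run:` scripts are now passed through an `env:` block and read as quoted shell variables. A minimal, self-contained sketch of that pattern (the workflow and input names here are illustrative, not taken from the repository):

```yaml
name: env-indirection-sketch
on:
  workflow_dispatch:
    inputs:
      job:
        required: true
        type: string
jobs:
  demo:
    runs-on: ubuntu-22.04
    steps:
      - name: Check file
        env:
          job: ${{ inputs.job }}  # the expression is expanded here, not inside the script body
        run: |
          # The value is now an ordinary environment variable, so quoting it keeps any
          # unexpected characters from being interpreted as shell syntax.
          if [ -f "ci_results_${job}/new_failures.json" ]; then
            echo "process=true" >> "$GITHUB_OUTPUT"
          else
            echo "process=false" >> "$GITHUB_OUTPUT"
          fi
```

Because `${{ }}` templating is substituted into the script text before the shell runs, routing it through `env:` is the usual hardening against script injection in Actions workflows, which appears to be the motivation behind most of the changes in this compare.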
.github/workflows/codeql.yml (vendored, new file) — 22 lines

@@ -0,0 +1,22 @@
---
name: CodeQL Security Analysis

on:
push:
branches: ["main"]
# pull_request:
# branches: ["main"]
workflow_dispatch:

jobs:
codeql:
name: CodeQL Analysis
uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
permissions:
security-events: write
packages: read
actions: read
contents: read
with:
languages: '["actions"]'
queries: 'security-extended,security-and-quality'
.github/workflows/get-pr-info.yml (vendored) — 24 changed lines

@@ -39,6 +39,9 @@ on:
PR_MERGE_COMMIT_SHA:
description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
PR_MERGE_COMMIT_BASE_SHA:
description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
PR_HEAD_COMMIT_DATE:
description: "The date of the head sha of the pull request branch in the head repository"
value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}

@@ -74,6 +77,7 @@ jobs:
PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}

@@ -122,6 +126,7 @@ jobs:
core.setOutput('base_ref', pr.base.ref);
core.setOutput('head_sha', pr.head.sha);
core.setOutput('base_sha', pr.base.sha);
core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
core.setOutput('merge_commit_sha', pr.merge_commit_sha);
core.setOutput('pr', pr);

@@ -142,16 +147,21 @@ jobs:
date: merge_commit.commit.committer.date
});

console.log('PR Info:', {
pr_info: pr
});

- name: Convert dates to timestamps
id: get_timestamps
env:
head_commit_date: ${{ steps.pr_info.outputs.head_commit_date }}
merge_commit_date: ${{ steps.pr_info.outputs.merge_commit_date }}
run: |
head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
echo $head_commit_date
echo $merge_commit_date
echo "$head_commit_date"
echo "$merge_commit_date"
head_commit_timestamp=$(date -d "$head_commit_date" +%s)
merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
echo $head_commit_timestamp
echo $merge_commit_timestamp
echo "$head_commit_timestamp"
echo "$merge_commit_timestamp"
echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
.github/workflows/get-pr-number.yml (vendored) — 22 changed lines

@@ -15,13 +15,19 @@ jobs:
steps:
- name: Get PR number
shell: bash
env:
issue_number: ${{ github.event.issue.number }}
is_pull_request_issue: ${{ github.event.issue.pull_request != null }}
pr_number: ${{ github.event.pull_request.number }}
is_pull_request: ${{ github.event.pull_request != null }}
event_number: ${{ github.event.number }}
run: |
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
elif [[ "${{ github.event.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
if [[ "$issue_number" != "" && "$is_pull_request_issue" == "true" ]]; then
echo "PR_NUMBER=$issue_number" >> $GITHUB_ENV
elif [[ "$pr_number" != "" ]]; then
echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
elif [[ "$is_pull_request" == "true" ]]; then
echo "PR_NUMBER=$event_number" >> $GITHUB_ENV
else
echo "PR_NUMBER=" >> $GITHUB_ENV
fi

@@ -29,8 +35,8 @@ jobs:
- name: Check PR number
shell: bash
run: |
echo "${{ env.PR_NUMBER }}"
echo "$PR_NUMBER"

- name: Set PR number
id: set_pr_number
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
run: echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_OUTPUT"
.github/workflows/model_jobs.yml (vendored) — 60 changed lines

@@ -62,25 +62,33 @@ jobs:
steps:
- name: Echo input and matrix info
shell: bash
env:
folder_slices: ${{ inputs.folder_slices }}
matrix_folders: ${{ matrix.folders }}
slice_data: ${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}
run: |
echo "${{ inputs.folder_slices }}"
echo "${{ matrix.folders }}"
echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
echo "$folder_slices"
echo "$matrix_folders"
echo "$slice_data"

- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
env:
matrix_folders_raw: ${{ matrix.folders }}
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders_raw"
matrix_folders="${matrix_folders_raw/'models/'/'models_'}"
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
env:
commit_sha: ${{ inputs.commit_sha || github.sha }}
run: |
git fetch origin "$commit_sha" && git checkout "$commit_sha"

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers

@@ -115,15 +123,17 @@ jobs:
id: set_machine_type
working-directory: /transformers
shell: bash
env:
input_machine_type: ${{ inputs.machine_type }}
run: |
echo "${{ inputs.machine_type }}"
echo "$input_machine_type"

if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
if [ "$input_machine_type" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
elif [ "$input_machine_type" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ inputs.machine_type }}
machine_type="$input_machine_type"
fi

echo "$machine_type"

@@ -132,15 +142,21 @@ jobs:

- name: Create report directory if it doesn't exist
shell: bash
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
mkdir -p "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
echo "dummy" > "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/dummy.txt"
ls -la "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"

- name: Run all tests on GPU
working-directory: /transformers
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
pytest_marker: ${{ inputs.pytest_marker }}
model: ${{ matrix.folders }}
run: |
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports python3 -m pytest -rsfE -v -m '${pytest_marker}' --make-reports=${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports tests/${model}" test_outputs.txt
ls -la
# Extract the exit code from the output file
EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)

@@ -151,19 +167,25 @@ jobs:
# This step is only to show information on Github Actions log.
# Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/failures_short.txt"

- name: Captured information
if: ${{ failure() }}
continue-on-error: true
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"

- name: Copy test_outputs.txt
if: ${{ always() }}
continue-on-error: true
env:
report_name_prefix: ${{ inputs.report_name_prefix }}
run: |
cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
cp /transformers/test_outputs.txt "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"

- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}

@@ -174,7 +196,7 @@ jobs:

collated_reports:
name: Collated Reports
if: ${{ always() }}
if: ${{ always() && inputs.runner_type != '' }}
needs: run_models_gpu
uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
with:
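The `Echo folder` step above leans on bash pattern substitution to turn a matrix folder such as `models/bert` into an artifact-safe name. A small sketch of just that transformation, with an illustrative example value:

```yaml
name: folder-name-sketch
on: workflow_dispatch
jobs:
  demo:
    runs-on: ubuntu-22.04
    steps:
      - name: Normalize folder name
        shell: bash
        env:
          matrix_folders_raw: models/bert  # example of what a matrix entry might carry
        run: |
          # ${var/pattern/replacement} replaces the first occurrence of "models/" with "models_",
          # so "models/bert" becomes "models_bert" (artifact names cannot contain "/").
          matrix_folders="${matrix_folders_raw/'models/'/'models_'}"
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> "$GITHUB_ENV"
```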
@@ -1,4 +1,4 @@
name: PR slow CI
name: PR slow CI - Suggestion
on:
pull_request_target:
types: [opened, synchronize, reopened]

@@ -23,11 +23,26 @@ jobs:
outputs:
jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
steps:
# This checkout to the main branch
- uses: actions/checkout@v4
with:
fetch-depth: "0"

- name: Write pr_files file
env:
PR_FILES: ${{ needs.get-pr-info.outputs.PR_FILES }}
run: |
cat > pr_files.txt << EOF
$PR_FILES
EOF

- name: Get repository content
id: repo_content
uses: actions/github-script@v6
with:
script: |
const fs = require('node:fs');

const { data: tests_dir } = await github.rest.repos.getContent({
owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',

@@ -49,38 +64,10 @@ jobs:
ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
});

core.setOutput('tests_dir', tests_dir);
core.setOutput('tests_models_dir', tests_models_dir);
core.setOutput('tests_quantization_dir', tests_quantization_dir);

# This checkout to the main branch
- uses: actions/checkout@v4
with:
fetch-depth: "0"

- name: Write pr_files file
run: |
cat > pr_files.txt << 'EOF'
${{ needs.get-pr-info.outputs.PR_FILES }}
EOF

- name: Write tests_dir file
run: |
cat > tests_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_dir }}
EOF

- name: Write tests_models_dir file
run: |
cat > tests_models_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_models_dir }}
EOF

- name: Write tests_quantization_dir file
run: |
cat > tests_quantization_dir.txt << 'EOF'
${{ steps.repo_content.outputs.tests_quantization_dir }}
EOF
// Write to files instead of outputs
fs.writeFileSync('tests_dir.txt', JSON.stringify(tests_dir, null, 2));
fs.writeFileSync('tests_models_dir.txt', JSON.stringify(tests_models_dir, null, 2));
fs.writeFileSync('tests_quantization_dir.txt', JSON.stringify(tests_quantization_dir, null, 2));

- name: Run script to get jobs to run
id: get_jobs
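The hunk above drops `core.setOutput(...)` plus the per-output heredoc steps in favour of having the github-script step write JSON straight to files, which later steps read from disk, presumably to avoid pushing large JSON payloads through step outputs. A minimal sketch of that file-based hand-off (the payload contents below are made up for illustration):

```yaml
name: files-instead-of-outputs-sketch
on: workflow_dispatch
jobs:
  demo:
    runs-on: ubuntu-22.04
    steps:
      - name: Produce a JSON payload as a file
        uses: actions/github-script@v6
        with:
          script: |
            const fs = require('node:fs');
            // Illustrative payload; the real step stores repository directory listings here.
            const tests_dir = [{ name: 'tests/models', type: 'dir' }];
            fs.writeFileSync('tests_dir.txt', JSON.stringify(tests_dir, null, 2));
      - name: Consume the file in a later step
        run: cat tests_dir.txt
```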
.github/workflows/push-important-models.yml (vendored) — 2 changed lines

@@ -153,5 +153,5 @@ jobs:
ci_event: push
report_repo_id: hf-internal-testing/transformers_ci_push
commit_sha: ${{ github.sha }}
models: ${{ needs.get_modified_models.outputs.matrix }}
subdirs: ${{ needs.get_modified_models.outputs.matrix }}
secrets: inherit
.github/workflows/self-comment-ci.yml (vendored) — 509 changed lines

@@ -23,62 +23,34 @@ env:
TF_FORCE_GPU_ALLOW_GROWTH: true
CUDA_VISIBLE_DEVICES: 0,1

jobs:
get-pr-number:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:
- name: Get PR number
shell: bash
run: |
if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
else
echo "PR_NUMBER=" >> $GITHUB_ENV
fi
uses: ./.github/workflows/get-pr-number.yml

- name: Check PR number
shell: bash
run: |
echo "${{ env.PR_NUMBER }}"

- name: Set PR number
id: set_pr_number
run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"

get-sha:
runs-on: ubuntu-22.04
get-pr-info:
name: Get PR commit SHA
needs: get-pr-number
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
outputs:
PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: "0"
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
uses: ./.github/workflows/get-pr-info.yml
with:
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}

- name: Get SHA (and verify timestamps against the issue comment date)
id: get_sha
check-timestamps:
name: Check timestamps (security check)
runs-on: ubuntu-22.04
needs: get-pr-info
outputs:
PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
steps:
- name: Verify `merge_commit` timestamp is older than the issue comment timestamp
env:
PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
COMMENT_DATE: ${{ github.event.comment.created_at }}
PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
run: |
git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
git checkout refs/remotes/pull/$PR_NUMBER/head
echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
git checkout refs/remotes/pull/$PR_NUMBER/merge
echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
echo "COMMENT_DATE: $COMMENT_DATE"
echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"

@@ -87,13 +59,10 @@ jobs:
exit -1;
fi

# use a python script to handle this complex logic
# case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
# case 2: `run-slow model_1, model_2`
# use a python script to handle this complex logic.
get-tests:
runs-on: ubuntu-22.04
needs: [get-pr-number, get-sha]
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
needs: [get-pr-number, check-timestamps]
outputs:
models: ${{ steps.models_to_run.outputs.models }}
quantizations: ${{ steps.models_to_run.outputs.quantizations }}
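A condensed sketch of what the timestamp check above appears to enforce: the PR's merge commit must be older than the `run-slow` comment that triggered the run, so code pushed after the comment is never tested with elevated trust. Everything below is illustrative, reconstructed from the step shown in the hunk rather than copied from the repository:

```yaml
name: timestamp-check-sketch
on: issue_comment
jobs:
  check:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: "0"
      - name: Verify the merge commit predates the comment
        env:
          COMMENT_DATE: ${{ github.event.comment.created_at }}
        run: |
          PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)  # commit time, epoch seconds
          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")              # comment time, epoch seconds
          if [ "$PR_MERGE_COMMIT_TIMESTAMP" -gt "$COMMENT_TIMESTAMP" ]; then
            echo "The merge commit is newer than the triggering comment; aborting."
            exit 1
          fi
```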
@@ -101,11 +70,11 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: "0"
ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge"

- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then

@@ -126,11 +95,33 @@ jobs:
- name: Show models to test
id: models_to_run
run: |
echo "${{ env.models }}"
echo "models=${{ env.models }}" >> $GITHUB_ENV
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
echo "${{ env.quantizations }}"
echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
echo "$models"
echo "models=$models" >> $GITHUB_OUTPUT
echo "$quantizations"
echo "quantizations=$quantizations" >> $GITHUB_OUTPUT

# Report back if we are not able to get the tests (for example, security check is failing)
report_error_earlier:
name: Report error earlier
if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }}
needs: [get-pr-number, get-pr-info, get-tests]
permissions:
pull-requests: write
runs-on: ubuntu-22.04
steps:
- name: Reply to the comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/issues/${pr_number}/comments" \
-f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!"

reply_to_comment:
name: Reply to the comment

@@ -143,20 +134,20 @@ jobs:
- name: Reply to the comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}'
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
"repos/${github_repository}/issues/${pr_number}/comments" \
-f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e "$BODY")"

create_run:
name: Create run
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-sha, get-tests, reply_to_comment]
needs: [check-timestamps, reply_to_comment]
permissions:
statuses: write
runs-on: ubuntu-22.04

@@ -168,248 +159,196 @@ jobs:
# Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
# See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
"repos/${github_repository}/statuses/${pr_head_sha}" \
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"

run_models_gpu:
name: Run all tests for the model
model-ci:
name: Model CI
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |
echo "${{ matrix.folders }}"
uses: ./.github/workflows/self-scheduled.yml
needs: [get-pr-number, check-timestamps, get-tests, create_run]
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-pr"
docker: huggingface/transformers-all-latest-gpu
ci_event: PR Comment CI
report_repo_id: hf-internal-testing/transformers_pr_ci
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
subdirs: ${{ needs.get-tests.outputs.models }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
secrets: inherit

- name: Echo folder ${{ matrix.folders }}
shell: bash
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
# set the artifact folder names (because the character `/` is not allowed).
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'models/'/'models_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H

- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV

- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py

- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run all tests on GPU
working-directory: /transformers
run: |
export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
echo $CUDA_VISIBLE_DEVICES
python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"

- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports

run_quantization_torch_gpu:
name: Run all tests for a quantization
quantization-ci:
name: Quantization CI
if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
uses: ./.github/workflows/self-scheduled.yml
needs: [get-pr-number, check-timestamps, get-tests, create_run]
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-pr"
docker: huggingface/transformers-quantization-latest-gpu
ci_event: PR Comment CI
report_repo_id: hf-internal-testing/transformers_pr_ci
commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
subdirs: ${{ needs.get-tests.outputs.quantizations }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
secrets: inherit

- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H

- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV

- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py

- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"

- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports

update_run_status:
name: Update Check Run Status
needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
report:
name: Check & Report
needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci]
permissions:
pull-requests: write
statuses: write
if: ${{ always() && needs.create_run.result == 'success' }}
runs-on: ubuntu-22.04
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
steps:
- name: Get `run_models_gpu` job status
- name: Show reports from jobs
env:
MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
run: |
echo "${{ needs.run_models_gpu.result }}"
echo "${{ needs.run_quantization_torch_gpu.result }}"
echo $STATUS_OK
if [ "$STATUS_OK" = "true" ]; then
echo "STATUS=success" >> $GITHUB_ENV
else
echo "STATUS=failure" >> $GITHUB_ENV
fi
echo "$MODEL_REPORT"
echo "$QUANT_REPORT"

- name: Update PR commit statuses
- name: Process and filter reports
env:
MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
run: |
echo "${{ needs.run_models_gpu.result }}"
echo "${{ env.STATUS }}"
# Preprocess with Python
python3 << 'PYTHON_SCRIPT'
import json
import os

def filter_and_format_report(data):
"""
Filter out entries where commit is `None` (failing tests who status is not certain) and format as text
"""
lines = []

for model, model_result in data.items():
model_lines = []
for device, failures in model_result.items():

# Filter out None commits and extract just the test names
test_names = [
failure['test']
for failure in failures
if isinstance(failure, dict) and failure.get('commit') is not None
]

# Add tests to model lines
for idx, test_name in enumerate(test_names):
if idx == 0:
job_link = failures[idx]['job_link']
model_lines.append(f"- [{model}]({job_link}):")

model_lines.append(f" {test_name}")

# Only add model section if it has tests
if len(model_lines) > 0:
lines.extend(model_lines)
lines.append("") # Empty line between models

return "\n".join(lines).strip()

# Load and filter reports
model_report_str = os.environ.get('MODEL_REPORT', '{}')
quant_report_str = os.environ.get('QUANT_REPORT', '{}')

model_report = json.loads(model_report_str) if model_report_str else {}
quant_report = json.loads(quant_report_str) if quant_report_str else {}

formatted_model = filter_and_format_report(model_report)
formatted_quant = filter_and_format_report(quant_report)

# Write to files
with open('model_ci.txt', 'w') as f:
f.write(formatted_model)
if formatted_model:
f.write('\n')

with open('quantization_ci.txt', 'w') as f:
f.write(formatted_quant)
if formatted_quant:
f.write('\n')
PYTHON_SCRIPT

- name: Post results as PR comment
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
model_ci_result: ${{ needs.model-ci.result }}
quantization_ci_result: ${{ needs.quantization-ci.result }}
run: |
{
echo '## CI Results'
echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)"
echo ''

# Check if both jobs were skipped or cancelled
if [[ "$model_ci_result" == "skipped" || "$model_ci_result" == "cancelled" ]] && \
[[ "$quantization_ci_result" == "skipped" || "$quantization_ci_result" == "cancelled" ]]; then
echo '⚠️ No test being reported (jobs are skipped or cancelled)!'
echo "STATUS=error" >> $GITHUB_ENV

# Check if either file has content
elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then
echo "STATUS=failure" >> $GITHUB_ENV

# Check if model_ci.txt has content
if [ -s model_ci.txt ]; then
echo '### Model CI Report'
echo ''
echo '#### ❌ Failed tests'
echo ''
cat model_ci.txt
echo ''
fi

# Check if quantization_ci.txt has content
if [ -s quantization_ci.txt ]; then
echo '### Quantization CI Report'
echo ''
echo '#### ❌ Failed tests'
echo ''
cat quantization_ci.txt
echo ''
fi
else
echo "STATUS=success" >> $GITHUB_ENV
echo '✅ No failing test specific to this PR 🎉 !'
fi
} > comment_body.txt

gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
"repos/${github_repository}/issues/${pr_number}/comments" \
-F body=@comment_body.txt

- name: Update PR commit statuses
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
github_repository: ${{ github.repository }}
pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
# The env. variable `STATUS` used here is set in the previous step
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
"repos/${github_repository}/statuses/${pr_head_sha}" \
-f "target_url=$GITHUB_RUN_URL" -f "state=$STATUS" -f "description=Slow CI job" -f "context=pytest/custom-tests"
.github/workflows/self-nightly-caller.yml (vendored) — 1 changed line

@@ -51,6 +51,7 @@ jobs:
slack_report_channel: "#transformers-ci-past-future"
docker: huggingface/transformers-all-latest-torch-nightly-gpu
ci_event: Nightly CI
runner_type: "a10"
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
secrets: inherit
@@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)

on:
schedule:
- cron: "17 2 * * *"
- cron: "17 5 * * *"

jobs:
run_scheduled_amd_ci:

@@ -21,7 +21,7 @@ jobs:
job: run_models_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit

@@ -33,7 +33,7 @@ jobs:
job: run_pipelines_torch_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit

@@ -45,7 +45,7 @@ jobs:
job: run_examples_gpu
slack_report_channel: "#amd-hf-ci"
runner_group: hfc-amd-mi355
docker: huggingface/testing-rocm7.0-preview
docker: huggingface/transformers-pytorch-amd-gpu
ci_event: Scheduled CI (AMD) - mi355
report_repo_id: hf-transformers-bot/transformers-ci-dummy
secrets: inherit
75
.github/workflows/self-scheduled-caller.yml
vendored
@ -6,7 +6,7 @@ on:
|
||||
- cron: "17 2 * * *"
|
||||
push:
|
||||
branches:
|
||||
- run_nvidia_ci*
|
||||
- check_cleanup_workflow
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
prev_workflow_run_id:
|
||||
@ -23,7 +23,7 @@ on:
|
||||
|
||||
# Used for `push` to easily modify the target workflow runs to compare against
|
||||
env:
|
||||
prev_workflow_run_id: ""
|
||||
prev_workflow_run_id: "19056134459"
|
||||
other_workflow_run_id: ""
|
||||
|
||||
|
||||
@ -33,10 +33,13 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: Setup
|
||||
env:
|
||||
prev_workflow_run_id: ${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}
|
||||
other_workflow_run_id: ${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}
|
||||
run: |
|
||||
mkdir "setup_values"
|
||||
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
|
||||
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
|
||||
echo "$prev_workflow_run_id" > "setup_values/prev_workflow_run_id.txt"
|
||||
echo "$other_workflow_run_id" > "setup_values/other_workflow_run_id.txt"
|
||||
|
||||
- name: Upload artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
@ -49,72 +52,10 @@ jobs:
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-models"
|
||||
slack_report_channel: "#transformers-ci-dummy"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
runner_type: "a10"
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
torch-pipeline:
|
||||
name: Torch pipeline CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
example-ci:
|
||||
name: Example CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-examples"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
trainer-fsdp-ci:
|
||||
name: Trainer/FSDP CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_trainer_and_fsdp_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
docker: huggingface/transformers-all-latest-gpu
|
||||
runner_type: "a10"
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
deepspeed-ci:
|
||||
name: DeepSpeed CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-training"
|
||||
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
|
||||
ci_event: Daily CI
|
||||
working-directory-prefix: /workspace
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
quantization-ci:
|
||||
name: Quantization CI
|
||||
uses: ./.github/workflows/self-scheduled.yml
|
||||
with:
|
||||
job: run_quantization_torch_gpu
|
||||
slack_report_channel: "#transformers-ci-daily-quantization"
|
||||
docker: huggingface/transformers-quantization-latest-gpu
|
||||
ci_event: Daily CI
|
||||
report_repo_id: hf-internal-testing/transformers_daily_ci
|
||||
commit_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
222
.github/workflows/self-scheduled.yml
vendored
@ -34,14 +34,20 @@ on:
|
||||
runner_type:
|
||||
required: false
|
||||
type: string
|
||||
models:
|
||||
subdirs:
|
||||
default: ""
|
||||
required: false
|
||||
type: string
|
||||
pytest_marker:
|
||||
required: false
|
||||
type: string
|
||||
|
||||
pr_number:
|
||||
required: false
|
||||
type: string
|
||||
outputs:
|
||||
report:
|
||||
description: "Content of the report of new failures"
|
||||
value: ${{ jobs.check_new_failures.outputs.report }}
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
@ -54,7 +60,6 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
NUM_SLICES: 2
|
||||
|
||||
jobs:
|
||||
setup:
|
||||
@ -62,7 +67,7 @@ jobs:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
|
||||
strategy:
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -75,8 +80,11 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: |
|
||||
git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
git fetch origin $commit_sha
|
||||
git fetch && git checkout $commit_sha
|
||||
|
||||
- name: Cleanup
|
||||
working-directory: /transformers
|
||||
@ -93,11 +101,17 @@ jobs:
|
||||
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
|
||||
name: Identify models to test
|
||||
working-directory: /transformers/tests
|
||||
env:
|
||||
job: ${{ inputs.job }}
|
||||
subdirs: ${{ inputs.subdirs }}
|
||||
NUM_SLICES: 2
|
||||
run: |
|
||||
if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
|
||||
echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
|
||||
elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
if [ "$job" = "run_models_gpu" ]; then
|
||||
python3 ../utils/split_model_tests.py --subdirs "$subdirs" --num_splits "$NUM_SLICES" > folder_slices.txt
|
||||
echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
|
||||
python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
|
||||
echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
|
||||
elif [ "$job" = "run_trainer_and_fsdp_gpu" ]; then
|
||||
echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
|
||||
echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
@ -106,8 +120,10 @@ jobs:
|
||||
if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
|
||||
name: Identify quantization method to test
|
||||
working-directory: /transformers/tests
|
||||
env:
|
||||
subdirs: ${{ inputs.subdirs || 'None' }}
|
||||
run: |
|
||||
echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT
|
||||
echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(os.environ["subdirs"]); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))); print(d)')" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
@ -120,7 +136,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
@ -141,7 +157,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
slice_id: [0, 1]
|
||||
uses: ./.github/workflows/model_jobs.yml
|
||||
with:
|
||||
@ -161,7 +177,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -170,7 +186,9 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@ -192,15 +210,17 @@ jobs:
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
env:
|
||||
matrix_machine_type: ${{ matrix.machine_type }}
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
echo "$matrix_machine_type"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
machine_type="$matrix_machine_type"
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
@ -209,12 +229,12 @@ jobs:
|
||||
- name: Run all pipeline tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
|
||||
python3 -m pytest -n 1 -v --dist=loadfile --make-reports="${machine_type}_run_pipelines_torch_gpu_test_reports" tests/pipelines
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
|
||||
run: cat "/transformers/reports/${machine_type}_run_pipelines_torch_gpu_test_reports/failures_short.txt"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
@ -238,7 +258,9 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@ -260,15 +282,17 @@ jobs:
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
env:
|
||||
matrix_machine_type: ${{ matrix.machine_type }}
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
echo "$matrix_machine_type"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
machine_type="$matrix_machine_type"
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
@ -278,12 +302,12 @@ jobs:
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
pip install -r examples/pytorch/_tests_requirements.txt
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch
|
||||
python3 -m pytest -v --make-reports="${machine_type}_run_examples_gpu_test_reports" examples/pytorch
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
|
||||
run: cat "/transformers/reports/${machine_type}_run_examples_gpu_test_reports/failures_short.txt"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
@ -298,7 +322,7 @@ jobs:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -307,7 +331,9 @@ jobs:
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
@ -329,7 +355,7 @@ jobs:
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/
|
||||
run: |
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
# To avoid unknown test failures
|
||||
- name: Pre build DeepSpeed *again* (for nightly & Past CI)
|
||||
@ -339,7 +365,7 @@ jobs:
|
||||
python3 -m pip uninstall -y deepspeed
|
||||
rm -rf DeepSpeed
|
||||
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
|
||||
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
@ -357,15 +383,17 @@ jobs:
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
shell: bash
|
||||
env:
|
||||
matrix_machine_type: ${{ matrix.machine_type }}
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
echo "$matrix_machine_type"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
machine_type="$matrix_machine_type"
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
@ -374,12 +402,14 @@ jobs:
|
||||
- name: Run all tests on GPU
|
||||
working-directory: ${{ inputs.working-directory-prefix }}/transformers
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
|
||||
python3 -m pytest -v --make-reports="${machine_type}_run_torch_cuda_extensions_gpu_test_reports" tests/deepspeed tests/extended
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
|
||||
env:
|
||||
working_directory_prefix: ${{ inputs.working-directory-prefix }}
|
||||
run: cat "${working_directory_prefix}/transformers/reports/${machine_type}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
@ -397,7 +427,7 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
|
||||
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
@ -406,16 +436,19 @@ jobs:
|
||||
steps:
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
env:
|
||||
matrix_folders_raw: ${{ matrix.folders }}
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
|
||||
echo "$matrix_folders_raw"
|
||||
matrix_folders="${matrix_folders_raw/'quantization/'/'quantization_'}"
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
@ -437,15 +470,17 @@ jobs:
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
env:
|
||||
matrix_machine_type: ${{ matrix.machine_type }}
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
echo "$matrix_machine_type"
|
||||
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
|
||||
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
|
||||
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
machine_type="$matrix_machine_type"
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
@ -453,20 +488,96 @@ jobs:
|
||||
|
||||
- name: Run quantization tests on GPU
|
||||
working-directory: /transformers
|
||||
env:
|
||||
folders: ${{ matrix.folders }}
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
python3 -m pytest -v --make-reports="${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports" tests/${folders}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
run: cat "/transformers/reports/${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports/failures_short.txt"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
|
||||
|
||||
run_kernels_gpu:
|
||||
if: ${{ inputs.job == 'run_kernels_gpu' }}
|
||||
name: Kernel tests
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
machine_type: [aws-g5-4xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: ${{ inputs.docker }}
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
env:
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
run: git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Reinstall transformers in edit mode
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing]
|
||||
|
||||
- name: Install kernels
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip install -U kernels
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: nvidia-smi
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
env:
|
||||
matrix_machine_type: ${{ matrix.machine_type }}
|
||||
run: |
|
||||
echo "$matrix_machine_type"
|
||||
|
||||
if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type="$matrix_machine_type"
|
||||
fi
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Run kernel tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 -m pytest -v --make-reports="${machine_type}_run_kernels_gpu_test_reports" tests/kernels/test_kernels.py
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat "/transformers/reports/${machine_type}_run_kernels_gpu_test_reports/failures_short.txt"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_kernels_gpu_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports
|
||||
|
||||
run_extract_warnings:
|
||||
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
|
||||
@ -499,9 +610,12 @@ jobs:
|
||||
working-directory: warnings_in_ci
|
||||
|
||||
- name: Extract warnings in CI artifacts
|
||||
env:
|
||||
github_run_id: ${{ github.run_id }}
|
||||
access_token: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
|
||||
run: |
|
||||
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
|
||||
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
|
||||
python3 utils/extract_warnings.py --workflow_run_id "$github_run_id" --output_dir warnings_in_ci --token "$access_token" --from_gh
|
||||
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d); print(d)')"
|
||||
|
||||
- name: Upload artifact
|
||||
if: ${{ always() }}
|
||||
@ -520,6 +634,7 @@ jobs:
|
||||
run_examples_gpu,
|
||||
run_torch_cuda_extensions_gpu,
|
||||
run_quantization_torch_gpu,
|
||||
run_kernels_gpu,
|
||||
run_extract_warnings
|
||||
]
|
||||
if: always() && !cancelled()
|
||||
@ -539,16 +654,17 @@ jobs:
|
||||
secrets: inherit
|
||||
|
||||
check_new_failures:
|
||||
if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
|
||||
if: ${{ always() && needs.send_results.result == 'success' }}
|
||||
name: Check new failures
|
||||
needs: send_results
|
||||
uses: ./.github/workflows/check_failed_tests.yml
|
||||
with:
|
||||
docker: ${{ inputs.docker }}
|
||||
start_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
commit_sha: ${{ inputs.commit_sha || github.sha }}
|
||||
job: ${{ inputs.job }}
|
||||
slack_report_channel: ${{ inputs.slack_report_channel }}
|
||||
ci_event: ${{ inputs.ci_event }}
|
||||
report_repo_id: ${{ inputs.report_repo_id }}
|
||||
pr_number: ${{ inputs.pr_number }}
|
||||
|
||||
secrets: inherit
|
||||
|
||||
12
.github/workflows/slack-report.yml
vendored
@ -41,8 +41,10 @@ jobs:
|
||||
- name: Preliminary job status
|
||||
shell: bash
|
||||
# For the meaning of these environment variables, see the job `Setup`
|
||||
env:
|
||||
setup_status: ${{ inputs.setup_status }}
|
||||
run: |
|
||||
echo "Setup status: ${{ inputs.setup_status }}"
|
||||
echo "Setup status: $setup_status"
|
||||
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
@ -81,6 +83,8 @@ jobs:
|
||||
CI_TEST_JOB: ${{ inputs.job }}
|
||||
SETUP_STATUS: ${{ inputs.setup_status }}
|
||||
REPORT_REPO_ID: ${{ inputs.report_repo_id }}
|
||||
quantization_matrix: ${{ inputs.quantization_matrix }}
|
||||
folder_slices: ${{ inputs.folder_slices }}
|
||||
# We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
# For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
|
||||
@ -89,10 +93,10 @@ jobs:
|
||||
pip install huggingface_hub
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
if [ "${{ inputs.quantization_matrix }}" != "" ]; then
|
||||
python utils/notification_service.py "${{ inputs.quantization_matrix }}"
|
||||
if [ "$quantization_matrix" != "" ]; then
|
||||
python utils/notification_service.py "$quantization_matrix"
|
||||
else
|
||||
python utils/notification_service.py "${{ inputs.folder_slices }}"
|
||||
python utils/notification_service.py "$folder_slices"
|
||||
fi
|
||||
|
||||
# Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
|
||||
|
||||
35
.github/workflows/ssh-runner.yml
vendored
@ -4,7 +4,7 @@ on:
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
runner_type:
|
||||
description: 'Type of runner to test (a10 or t4)'
|
||||
description: 'Type of runner to test (a10)'
|
||||
required: true
|
||||
docker_image:
|
||||
description: 'Name of the Docker image'
|
||||
@ -36,14 +36,10 @@ jobs:
|
||||
NUM_GPUS: ${{ github.event.inputs.num_gpus }}
|
||||
RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
|
||||
run: |
|
||||
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
|
||||
echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
|
||||
echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
|
||||
if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
|
||||
elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
|
||||
echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
|
||||
echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
|
||||
else
|
||||
echo "RUNNER=" >> $GITHUB_ENV
|
||||
fi
|
||||
@ -51,8 +47,8 @@ jobs:
|
||||
- name: Set runner to use
|
||||
id: set_runner
|
||||
run: |
|
||||
echo ${{ env.RUNNER }}
|
||||
echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT
|
||||
echo "$RUNNER"
|
||||
echo "RUNNER=$RUNNER" >> $GITHUB_OUTPUT
|
||||
|
||||
ssh_runner:
|
||||
name: "SSH"
|
||||
@ -61,13 +57,13 @@ jobs:
|
||||
group: ${{ needs.get_runner.outputs.RUNNER }}
|
||||
container:
|
||||
image: ${{ github.event.inputs.docker_image }}
|
||||
options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
|
||||
steps:
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
env:
|
||||
commit_sha: ${{ github.sha }}
|
||||
run: |
|
||||
git fetch && git checkout ${{ github.sha }}
|
||||
git fetch && git checkout "$commit_sha"
|
||||
|
||||
- name: Cleanup
|
||||
working-directory: /transformers
|
||||
@ -99,14 +95,17 @@ jobs:
|
||||
- name: Store Slack infos
|
||||
#because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
|
||||
shell: bash
|
||||
env:
|
||||
user_slack_id: ${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}
|
||||
default_slack_channel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
run: |
|
||||
echo "${{ env.github_actor }}"
|
||||
if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
|
||||
echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
|
||||
echo "$github_actor"
|
||||
if [ "$user_slack_id" != "" ]; then
|
||||
echo "SLACKCHANNEL=$user_slack_id" >> $GITHUB_ENV
|
||||
else
|
||||
echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
|
||||
echo "SLACKCHANNEL=$default_slack_channel" >> $GITHUB_ENV
|
||||
fi
|
||||
|
||||
|
||||
- name: Tailscale # In order to be able to SSH when a test fails
|
||||
uses: huggingface/tailscale-action@main
|
||||
with:
|
||||
|
||||
@ -1,8 +1,11 @@
|
||||
import hashlib
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
from typing import Any
|
||||
|
||||
from transformers.utils.import_utils import is_flash_attn_2_available
|
||||
|
||||
|
||||
KERNELIZATION_AVAILABLE = False
|
||||
try:
|
||||
@ -18,6 +21,16 @@ logger = logging.getLogger(__name__)
|
||||
class BenchmarkConfig:
|
||||
"""Configuration for a single benchmark scenario."""
|
||||
|
||||
all_attn_implementations = [
|
||||
("flash_attention_2", None),
|
||||
("eager", None),
|
||||
("sdpa", "math"),
|
||||
("sdpa", "flash_attention"),
|
||||
("flex_attention", None),
|
||||
]
|
||||
|
||||
all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
warmup_iterations: int = 5,
|
||||
@ -59,6 +72,13 @@ class BenchmarkConfig:
|
||||
def check_validity(self, skip_validity_check: bool = False) -> None:
|
||||
if skip_validity_check:
|
||||
return
|
||||
# Check FA is installed
|
||||
if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
|
||||
logger.warning(
|
||||
"Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
|
||||
)
|
||||
self.attn_implementation = "sdpa"
|
||||
self.sdpa_backend = "flash_attention"
|
||||
# Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
|
||||
is_fa = self.attn_implementation == "flash_attention_2"
|
||||
is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
|
||||
@ -127,88 +147,68 @@ class BenchmarkConfig:
|
||||
)
|
||||
|
||||
|
||||
def cross_generate_configs(
|
||||
attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
|
||||
compiled_mode: list[str | None],
|
||||
kernelized: list[bool],
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
gpu_monitoring: bool = True,
|
||||
def adapt_configs(
|
||||
configs: list[BenchmarkConfig],
|
||||
warmup_iterations: int | list[int] = 5,
|
||||
measurement_iterations: int | list[int] = 20,
|
||||
batch_size: int | list[int] = 1,
|
||||
sequence_length: int | list[int] = 128,
|
||||
num_tokens_to_generate: int | list[int] = 128,
|
||||
gpu_monitoring: bool | list[bool] = True,
|
||||
) -> list[BenchmarkConfig]:
|
||||
# Create kwargs common to all configs
|
||||
kwargs = {
|
||||
"warmup_iterations": warmup_iterations,
|
||||
"measurement_iterations": measurement_iterations,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"num_tokens_to_generate": num_tokens_to_generate,
|
||||
"gpu_monitoring": gpu_monitoring,
|
||||
}
|
||||
# Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
|
||||
configs = []
|
||||
for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
|
||||
for cm in list(dict.fromkeys(compiled_mode)):
|
||||
for kernelize_on in list(dict.fromkeys(kernelized)):
|
||||
config = BenchmarkConfig(
|
||||
attn_implementation=attn_implementation,
|
||||
sdpa_backend=sdpa_backend,
|
||||
compile_mode=cm,
|
||||
kernelize=kernelize_on,
|
||||
**kwargs,
|
||||
)
|
||||
configs.append(config)
|
||||
return configs
|
||||
|
||||
|
||||
def generate_all_configs(
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
gpu_monitoring: bool = True,
|
||||
) -> list[BenchmarkConfig]:
|
||||
all_attn_implementations = [
|
||||
("flash_attention_2", None),
|
||||
("eager", None),
|
||||
("sdpa", "math"),
|
||||
("sdpa", "flash_attention"),
|
||||
("flex_attention", None),
|
||||
]
|
||||
return cross_generate_configs(
|
||||
attn_impl_and_sdpa_backend=all_attn_implementations,
|
||||
compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
|
||||
kernelized=[False, KERNELIZATION_AVAILABLE],
|
||||
warmup_iterations=warmup_iterations,
|
||||
measurement_iterations=measurement_iterations,
|
||||
batch_size=batch_size,
|
||||
sequence_length=sequence_length,
|
||||
num_tokens_to_generate=num_tokens_to_generate,
|
||||
gpu_monitoring=gpu_monitoring,
|
||||
parameters = (
|
||||
x if isinstance(x, list) else [x]
|
||||
for x in [
|
||||
warmup_iterations,
|
||||
measurement_iterations,
|
||||
batch_size,
|
||||
sequence_length,
|
||||
num_tokens_to_generate,
|
||||
gpu_monitoring,
|
||||
]
|
||||
)
|
||||
iterator = itertools.product(*parameters)
|
||||
|
||||
adapted_configs = []
|
||||
for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
|
||||
for config in configs:
|
||||
config = config.to_dict()
|
||||
config["warmup_iterations"] = warmup_iters
|
||||
config["measurement_iterations"] = measurement_iters
|
||||
config["batch_size"] = bs
|
||||
config["sequence_length"] = seqlen
|
||||
config["num_tokens_to_generate"] = ntok
|
||||
config["gpu_monitoring"] = monitor
|
||||
adapted_configs.append(BenchmarkConfig.from_dict(config))
|
||||
return adapted_configs
|
||||
|
||||
|
||||
def generate_main_configs(
|
||||
warmup_iterations: int = 5,
|
||||
measurement_iterations: int = 20,
|
||||
batch_size: int = 1,
|
||||
sequence_length: int = 128,
|
||||
num_tokens_to_generate: int = 128,
|
||||
) -> list[BenchmarkConfig]:
|
||||
# Create kwargs common to all configs
|
||||
kwargs = {
|
||||
"warmup_iterations": warmup_iterations,
|
||||
"measurement_iterations": measurement_iterations,
|
||||
"batch_size": batch_size,
|
||||
"sequence_length": sequence_length,
|
||||
"num_tokens_to_generate": num_tokens_to_generate,
|
||||
}
|
||||
return [ # TODO: test max-autotune instead of default
|
||||
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
|
||||
BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
|
||||
BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
|
||||
BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
|
||||
]
|
||||
def get_config_by_level(level: int) -> list[BenchmarkConfig]:
|
||||
configs = []
|
||||
# Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
|
||||
if level >= 3:
|
||||
for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
|
||||
# Usually there is not much to gain by compiling with other modes, but we allow it for level 4
|
||||
compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
|
||||
for cm in compile_modes:
|
||||
for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
|
||||
configs.append(
|
||||
BenchmarkConfig(
|
||||
attn_implementation=attn_implementation,
|
||||
sdpa_backend=sdpa_backend,
|
||||
compile_mode=cm,
|
||||
kernelize=kernelize_on,
|
||||
)
|
||||
)
|
||||
return configs
|
||||
# Otherwise, we add the configs for the given level
|
||||
if level >= 0:
|
||||
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
|
||||
if level >= 1:
|
||||
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
|
||||
configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
|
||||
if level >= 2:
|
||||
configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
|
||||
configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
|
||||
configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
|
||||
return configs
|
||||
|
||||
@ -23,7 +23,7 @@ import logging
|
||||
import sys
|
||||
import uuid
|
||||
|
||||
from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
|
||||
from framework.benchmark_config import adapt_configs, get_config_by_level
|
||||
from framework.benchmark_runner import BenchmarkRunner
|
||||
|
||||
|
||||
@ -40,7 +40,14 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
|
||||
parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")
|
||||
|
||||
parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
|
||||
parser.add_argument(
|
||||
"--level",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
|
||||
" each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
|
||||
" combinations of configs w/ all compile modes",
|
||||
)
|
||||
parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")
|
||||
|
||||
parser.add_argument("--branch-name", type=str, help="Git branch name")
|
||||
@ -79,64 +86,24 @@ if __name__ == "__main__":
|
||||
"At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
|
||||
)
|
||||
|
||||
# If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
|
||||
elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
|
||||
if args.cross_generate:
|
||||
benchmark_configs = generate_all_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
gpu_monitoring=not args.no_gpu_monitoring,
|
||||
)
|
||||
else:
|
||||
benchmark_configs = generate_main_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
)
|
||||
|
||||
# Otherwise, we benchmark across all combinations of dimensions
|
||||
else:
|
||||
main_config = generate_main_configs(
|
||||
warmup_iterations=args.warmup,
|
||||
measurement_iterations=args.iterations,
|
||||
batch_size=args.batch_size[0],
|
||||
sequence_length=args.sequence_length[0],
|
||||
num_tokens_to_generate=args.num_tokens_to_generate[0],
|
||||
)[0]
|
||||
benchmark_configs = []
|
||||
for num_tokens_to_generate in args.num_tokens_to_generate:
|
||||
for sequence_length in args.sequence_length:
|
||||
for batch_size in args.batch_size:
|
||||
cfg_dict = main_config.to_dict()
|
||||
cfg_dict["batch_size"] = batch_size
|
||||
cfg_dict["sequence_length"] = sequence_length
|
||||
cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
|
||||
cfg_dict.pop("name")
|
||||
benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
|
||||
|
||||
runner = BenchmarkRunner(
|
||||
logger,
|
||||
args.output_dir,
|
||||
args.branch_name,
|
||||
args.commit_id,
|
||||
args.commit_message,
|
||||
# Get the configs for the given coverage level
|
||||
configs = get_config_by_level(args.level)
|
||||
# Adapt the configs to the given arguments
|
||||
configs = adapt_configs(
|
||||
configs,
|
||||
args.warmup,
|
||||
args.iterations,
|
||||
args.batch_size,
|
||||
args.sequence_length,
|
||||
args.num_tokens_to_generate,
|
||||
not args.no_gpu_monitoring,
|
||||
)
|
||||
|
||||
runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
|
||||
timestamp, results = runner.run_benchmarks(
|
||||
args.model_id,
|
||||
benchmark_configs,
|
||||
args.num_tokens_to_profile,
|
||||
pretty_print_summary=True,
|
||||
args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
|
||||
)
|
||||
|
||||
dataset_id = args.push_result_to_dataset
|
||||
if dataset_id is not None and len(results) > 0:
|
||||
runner.push_results_to_hub(
|
||||
dataset_id,
|
||||
results,
|
||||
timestamp,
|
||||
)
|
||||
runner.push_results_to_hub(dataset_id, results, timestamp)
|
||||
|
||||
@ -67,7 +67,7 @@ RUN set -e; \
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir -U timm
|
||||
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
|
||||
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir pytesseract
|
||||
|
||||
|
||||
@ -10,7 +10,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
|
||||
# Torch needs to be installed before deepspeed
|
||||
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install --no-cache-dir --no-build-isolation torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
|
||||
RUN python3 -m pip install -U "itsdangerous<2.1.0"
|
||||
|
||||
# Test if the image could successfully build the doc. before publishing the image
|
||||
|
||||
@ -39,7 +39,7 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
|
||||
# Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
|
||||
RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
|
||||
cd flash-attention && \
|
||||
GPU_ARCHS="gfx942;gfx950" python setup.py install
|
||||
# GPU_ARCHS builds for MI300, MI325 and MI355
|
||||
GPU_ARCHS="gfx942" python setup.py install
|
||||
# GPU_ARCHS builds for MI300, MI325 but not MI355: we would need to add `;gfx950` but it takes too long to build.
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir einops
|
||||
|
||||
@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
|
||||
RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
|
||||
|
||||
# Pre-build DeepSpeed, so it's ready for testing (to avoid timeout)
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
|
||||
|
||||
ARG REF=main
|
||||
WORKDIR /
|
||||
|
||||
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
|
||||
# Install latest release PyTorch
|
||||
# (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
|
||||
# (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
|
||||
# This has to be run (again) inside the GPU VMs running the tests.
|
||||
# The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
|
||||
# TODO: Find out why test fail.
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1
|
||||
|
||||
# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
|
||||
RUN python3 -m pip uninstall -y kernels
|
||||
|
||||
@ -24,7 +24,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch';
|
||||
RUN echo torch=$VERSION
|
||||
# `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
|
||||
# Currently, let's just use their latest releases (when `torch` is installed with a release version)
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
|
||||
|
||||
@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir hqq
|
||||
RUN python3 -m pip install --no-cache-dir gguf
|
||||
|
||||
# Add autoawq for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir autoawq[kernels]
|
||||
RUN python3 -m pip install --no-cache-dir --no-build-isolation autoawq[kernels]
|
||||
|
||||
# Add quanto for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir optimum-quanto
|
||||
@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
||||
# Add fp-quant for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
|
||||
RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"
|
||||
|
||||
# Low usage or incompatible lib, will enable later on
|
||||
|
||||
|
||||
@ -119,6 +119,8 @@
|
||||
title: Tools
|
||||
- local: transformers_as_backend
|
||||
title: Inference server backends
|
||||
- local: continuous_batching
|
||||
title: Continuous Batching
|
||||
title: Inference
|
||||
- isExpanded: false
|
||||
sections:
|
||||
|
||||
194
docs/source/en/continuous_batching.md
Normal file
@ -0,0 +1,194 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Continuous Batching
Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, specifically for workloads with many variable-length inputs.

We are particularly interested in having Continuous Batching in transformers for the following use cases:
- Evaluation of models on large datasets with variable-length inputs
- Generating outputs for multiple sequences for GRPO policies

CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.

If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_

## API Reference

## Usage Examples

The main way to use CB in transformers is via the `generate_batch` method.
Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in Python: `list[int]`. Since the inputs are already tokenized, a batch of requests is simply passed as a `list[list[int]]`.

For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)

### `generate_batch` example

We have created a `ContinuousMixin` that is inherited by the `GenerationMixin` so that all autoregressive text models support CB.

This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.

You can use it as follows:

```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you need cuda
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(32))  # keep a small sample for the example
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

generation_config = GenerationConfig(
    max_new_tokens=32,
    use_cuda_graph=False,  # Not supported for simple version
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    do_sample=False,
    max_batch_tokens=512,  # max number of tokens in a batch, this is just a default value you should tune based on your hardware
)

batch_outputs = model.generate_batch(
    inputs=simple_batch_inputs,
    generation_config=generation_config,
)

for request_id, output in batch_outputs.items():
    generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
    print(f"Request {request_id} output: {generated_text}")
```
### `ContinuousBatchingManager` example

If you want more control over how requests are scheduled with CB, you can use the `ContinuousBatchingManager` class directly.

This is what we use in `transformers serve`, because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process for better efficiency.

Under the hood, the `ContinuousBatchingManager` spawns a background thread that pulls incoming requests from a Python `queue.Queue` and batches them into each forward pass.

Note that the manager is thread-safe!

```py
import datasets
import torch

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers.generation.continuous_batching import RequestStatus

MODEL_ID = "Qwen/Qwen3-4B-Instruct-2507"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    attn_implementation="sdpa_paged",
    device_map="cuda",  # if you have a CUDA device
    dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")

# prepare a batch of inputs
dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
dataset = dataset.select(range(100))  # keep a small sample for the demo
tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]

# generation settings used by the manager
generation_config = GenerationConfig(max_new_tokens=128, do_sample=False)

# initialize the manager; this method is available thanks to the `ContinuousMixin`
manager = model.init_continuous_batching(generation_config=generation_config)

# start the background thread
manager.start()

# for demonstration purposes only; in practice requests are usually added concurrently
for i, input_ids in enumerate(simple_batch_inputs):
    request_id = manager.add_request(input_ids=input_ids, request_id=f"request_{i}")  # if you do not specify a request_id, one will be generated for you

# this can be done in another thread
for id, request in manager.get_result():
    generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
    print(f"Request {id} output: {generated_text}")

# you can also get the result for a specific request id
result = manager.get_result(request_id="request_5")  # this is blocking and will wait for the result to be ready

# or stream the results of a request
manager.add_request(
    input_ids=input_ids,
    request_id="streaming_request",
    stream=True,
)
for chunk in manager.request_id_iter(request_id="streaming_request"):
    generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
    print(generated_text)
    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
    if chunk.status == RequestStatus.FINISHED:
        break

# stop the background thread before exiting the process
manager.stop()
```

## Supported & Unsupported Features

### Supported Features

- Dynamic scheduling of variable-length requests
- Chunked prefill
- Paged Attention Cache
- Sliding window attention
- Chat templates

### Unsupported Features

At the moment, the following features are not supported with CB. We plan to add support for them:

- Prefix caching
- Beam search
- Tool calling

The following are unplanned, but depending on community requests we might consider adding them:

- MTP (multi-token prediction)
- Medusa

## Performance Considerations

## Integration with Serving

You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
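
For example, a minimal sketch (the model name, port, and the OpenAI-compatible `/v1/chat/completions` route are assumptions based on the server's defaults, not taken from this page):

```sh
# start the server with continuous batching enabled
transformers serve --continuous-batching

# from another shell, send a request (endpoint and model name are assumptions)
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "Qwen/Qwen3-4B-Instruct-2507", "messages": [{"role": "user", "content": "Hello!"}]}'
```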

## Monitoring

We have added `opentelemetry` support to Continuous Batching to help you monitor its performance in production. To enable it, install the `open-telemetry` extra when installing `transformers`:

```sh
# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
pip install transformers[open-telemetry]
```

This enables trace and metric collection in CB. You then need to set up a backend to collect and visualize the traces and metrics, for example an OTLP-compatible collector.
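
As a hedged sketch of wiring this up (the collector address is an assumption; the environment variables are the standard OpenTelemetry SDK ones, not specific to `transformers`):

```sh
# point the OTLP exporter at your collector before starting the server
export OTEL_SERVICE_NAME=transformers-serve
export OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317

transformers serve --continuous-batching
```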
@ -393,3 +393,9 @@ model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", quantization_config=quant_config, device_map="auto"
)
```

## Continuous Batching

When serving LLMs for inference, multiple requests often arrive at different times. Continuous Batching (CB) is a technique that groups incoming requests into batches to maximize GPU utilization and throughput.

See the [Continuous Batching](./continuous_batching) guide for more details on how to use CB in transformers.
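
As a quick sketch of the simplest entry point (mirroring the `generate_batch` API from the guide; the model choice and generation settings here are illustrative):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig

model_id = "Qwen/Qwen3-4B-Instruct-2507"  # example model
model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="sdpa_paged", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(model_id)

prompts = ["What is continuous batching?", "Write a haiku about GPUs."]
inputs = [tokenizer(p)["input_ids"] for p in prompts]

generation_config = GenerationConfig(max_new_tokens=64, do_sample=False)
batch_outputs = model.generate_batch(inputs=inputs, generation_config=generation_config)
for request_id, output in batch_outputs.items():
    print(request_id, tokenizer.decode(output.generated_tokens, skip_special_tokens=True))
```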
@ -158,6 +158,24 @@ print("Retrieval scores (query x image):")
print(scores)
```

You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.

```python
import torch
from transformers import ColQwen2ForRetrieval, ColQwen2Processor
from transformers.utils.import_utils import is_flash_attn_2_available

model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf"  # an existing compatible checkpoint

model = ColQwen2ForRetrieval.from_pretrained(
    model_name,
    dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
)
processor = ColQwen2Processor.from_pretrained(model_name)
```

## Notes

- [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
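
A minimal end-to-end retrieval sketch building on the checkpoint loaded above. `score_retrieval` is the documented scoring entry point; the `process_queries`/`process_images` helpers, the `embeddings` output attribute, and the placeholder image are assumptions based on the ColPali-style retrieval API:

```python
from PIL import Image

queries = ["Which page shows the quarterly revenue chart?"]
images = [Image.new("RGB", (448, 448), "white")]  # placeholder image for illustration

# assumed ColPali-style preprocessing helpers
query_inputs = processor.process_queries(queries).to(model.device)
image_inputs = processor.process_images(images).to(model.device)

with torch.no_grad():
    query_embeddings = model(**query_inputs).embeddings  # assumed output attribute
    image_embeddings = model(**image_inputs).embeddings

scores = processor.score_retrieval(query_embeddings, image_embeddings)  # shape: (n_queries, n_images)
print(scores)
```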
@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
-from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
+from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast


tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
-image_processor = FuyuImageProcessor()
+image_processor = FuyuImageProcessorFast()


processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
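
A short usage sketch continuing the snippet above (the prompt and placeholder image are illustrative; the resulting batch is what `FuyuForCausalLM` expects as input):

```python
image = Image.new("RGB", (512, 512), "white")  # placeholder image
prompt = "Generate a coco-style caption.\n"

inputs = processor(text=prompt, images=image, return_tensors="pt")
# `inputs` contains the tokenized prompt plus the image patches and positions used by the model
```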
@ -118,6 +118,11 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

[[autodoc]] FuyuImageProcessor
    - __call__

## FuyuImageProcessorFast

[[autodoc]] FuyuImageProcessorFast
    - __call__

## FuyuProcessor

[[autodoc]] FuyuProcessor
@ -61,6 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

[[autodoc]] GLPNImageProcessor
    - preprocess

## GLPNImageProcessorFast

[[autodoc]] GLPNImageProcessorFast
    - preprocess

## GLPNModel

[[autodoc]] GLPNModel
@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"

A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface library, [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with `pip install fp_quant`.

-Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
+Users **without a Blackwell-generation GPU** can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.

> [!TIP]
> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
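
A hedged sketch of pseudo-quantization (the model below is an example, not one from the FP-Quant docs; `pseudoquantization=True` is the only FP-Quant-specific setting shown):

```python
import torch
from transformers import AutoModelForCausalLM, FPQuantConfig

# emulates FP-Quant quantization on non-Blackwell GPUs; no speedup, no QuTLASS needed
quant_config = FPQuantConfig(pseudoquantization=True)

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-4B-Instruct-2507",  # example model
    quantization_config=quant_config,
    dtype=torch.bfloat16,
    device_map="auto",
)
```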
@ -187,7 +187,7 @@ from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
-    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
+    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
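
The hunk stops mid-method. As a hedged sketch of how an override with this signature typically finishes (the weighted cross-entropy and the three-label setup are illustrative, not this page's actual continuation):

```python
        # continuation sketch, inside CustomTrainer.compute_loss (assumes `import torch` at the top of the file)
        logits = outputs.get("logits")
        # illustrative class weights for a 3-label classification task
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```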
@ -1,21 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# Legacy examples

This folder contains examples which are not actively maintained (mostly contributed by the community).

Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
@ -1,26 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->

# 🤗 Benchmark results

Here, you can find a list of the different benchmark results created by the community.

If you would like to list benchmark results on your favorite models of the [model hub](https://huggingface.co/models) here, please open a Pull Request and add it below.

| Benchmark description | Results | Environment info | Author |
|:----------|:-------------|:-------------|------:|
| PyTorch Benchmark on inference for `google-bert/bert-base-cased` | [memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) |
| PyTorch Benchmark on inference for `google-bert/bert-base-cased` | [time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) |
@ -1,178 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import csv
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from matplotlib.ticker import ScalarFormatter
|
||||
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
def list_field(default=None, metadata=None):
|
||||
return field(default_factory=lambda: default, metadata=metadata)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PlotArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
|
||||
"""
|
||||
|
||||
csv_file: str = field(
|
||||
metadata={"help": "The csv file to plot."},
|
||||
)
|
||||
plot_along_batch: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
|
||||
)
|
||||
is_time: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
|
||||
)
|
||||
no_log_scale: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Disable logarithmic scale when plotting"},
|
||||
)
|
||||
is_train: bool = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"help": "Whether the csv file has training results or inference results. Defaults to inference results."
|
||||
},
|
||||
)
|
||||
figure_png_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
|
||||
)
|
||||
short_model_names: Optional[list[str]] = list_field(
|
||||
default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
|
||||
)
|
||||
|
||||
|
||||
def can_convert_to_int(string):
|
||||
try:
|
||||
int(string)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
def can_convert_to_float(string):
|
||||
try:
|
||||
float(string)
|
||||
return True
|
||||
except ValueError:
|
||||
return False
|
||||
|
||||
|
||||
class Plot:
|
||||
def __init__(self, args):
|
||||
self.args = args
|
||||
self.result_dict = defaultdict(lambda: {"bsz": [], "seq_len": [], "result": {}})
|
||||
|
||||
with open(self.args.csv_file, newline="") as csv_file:
|
||||
reader = csv.DictReader(csv_file)
|
||||
for row in reader:
|
||||
model_name = row["model"]
|
||||
self.result_dict[model_name]["bsz"].append(int(row["batch_size"]))
|
||||
self.result_dict[model_name]["seq_len"].append(int(row["sequence_length"]))
|
||||
if can_convert_to_int(row["result"]):
|
||||
# value is not None
|
||||
self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
|
||||
int(row["result"])
|
||||
)
|
||||
elif can_convert_to_float(row["result"]):
|
||||
# value is not None
|
||||
self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
|
||||
float(row["result"])
|
||||
)
|
||||
|
||||
def plot(self):
|
||||
fig, ax = plt.subplots()
|
||||
title_str = "Time usage" if self.args.is_time else "Memory usage"
|
||||
title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
|
||||
|
||||
if not self.args.no_log_scale:
|
||||
# set logarithm scales
|
||||
ax.set_xscale("log")
|
||||
ax.set_yscale("log")
|
||||
|
||||
for axis in [ax.xaxis, ax.yaxis]:
|
||||
axis.set_major_formatter(ScalarFormatter())
|
||||
|
||||
for model_name_idx, model_name in enumerate(self.result_dict.keys()):
|
||||
batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
|
||||
sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
|
||||
results = self.result_dict[model_name]["result"]
|
||||
|
||||
(x_axis_array, inner_loop_array) = (
|
||||
(batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
|
||||
)
|
||||
|
||||
label_model_name = (
|
||||
model_name if self.args.short_model_names is None else self.args.short_model_names[model_name_idx]
|
||||
)
|
||||
|
||||
for inner_loop_value in inner_loop_array:
|
||||
if self.args.plot_along_batch:
|
||||
y_axis_array = np.asarray(
|
||||
[results[(x, inner_loop_value)] for x in x_axis_array if (x, inner_loop_value) in results],
|
||||
dtype=int,
|
||||
)
|
||||
else:
|
||||
y_axis_array = np.asarray(
|
||||
[results[(inner_loop_value, x)] for x in x_axis_array if (inner_loop_value, x) in results],
|
||||
dtype=np.float32,
|
||||
)
|
||||
|
||||
(x_axis_label, inner_loop_label) = (
|
||||
("batch_size", "len") if self.args.plot_along_batch else ("in #tokens", "bsz")
|
||||
)
|
||||
|
||||
x_axis_array = np.asarray(x_axis_array, int)[: len(y_axis_array)]
|
||||
plt.scatter(
|
||||
x_axis_array, y_axis_array, label=f"{label_model_name} - {inner_loop_label}: {inner_loop_value}"
|
||||
)
|
||||
plt.plot(x_axis_array, y_axis_array, "--")
|
||||
|
||||
title_str += f" {label_model_name} vs."
|
||||
|
||||
title_str = title_str[:-4]
|
||||
y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
|
||||
|
||||
# plot
|
||||
plt.title(title_str)
|
||||
plt.xlabel(x_axis_label)
|
||||
plt.ylabel(y_axis_label)
|
||||
plt.legend()
|
||||
|
||||
if self.args.figure_png_file is not None:
|
||||
plt.savefig(self.args.figure_png_file)
|
||||
else:
|
||||
plt.show()
|
||||
|
||||
|
||||
def main():
|
||||
parser = HfArgumentParser(PlotArguments)
|
||||
plot_args = parser.parse_args_into_dataclasses()[0]
|
||||
plot = Plot(args=plot_args)
|
||||
plot.plot()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1 +0,0 @@
|
||||
torch >= 1.3
|
||||
@ -1,47 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Benchmarking the library on inference and training"""
|
||||
|
||||
from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
|
||||
|
||||
|
||||
def main():
|
||||
parser = HfArgumentParser(PyTorchBenchmarkArguments)
|
||||
try:
|
||||
benchmark_args = parser.parse_args_into_dataclasses()[0]
|
||||
except ValueError as e:
|
||||
arg_error_msg = "Arg --no_{0} is no longer used, please use --no-{0} instead."
|
||||
begin_error_msg = " ".join(str(e).split(" ")[:-1])
|
||||
full_error_msg = ""
|
||||
depreciated_args = eval(str(e).split(" ")[-1])
|
||||
wrong_args = []
|
||||
for arg in depreciated_args:
|
||||
# arg[2:] removes '--'
|
||||
if arg[2:] in PyTorchBenchmarkArguments.deprecated_args:
|
||||
# arg[5:] removes '--no_'
|
||||
full_error_msg += arg_error_msg.format(arg[5:])
|
||||
else:
|
||||
wrong_args.append(arg)
|
||||
if len(wrong_args) > 0:
|
||||
full_error_msg = full_error_msg + begin_error_msg + str(wrong_args)
|
||||
raise ValueError(full_error_msg)
|
||||
|
||||
benchmark = PyTorchBenchmark(args=benchmark_args)
|
||||
benchmark.run()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,232 +0,0 @@
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from utils_multiple_choice import MultipleChoiceDataset, Split, processors
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
EvalPrediction,
|
||||
HfArgumentParser,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def simple_accuracy(preds, labels):
|
||||
return (preds == labels).mean()
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
|
||||
data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
|
||||
max_seq_length: int = field(
|
||||
default=128,
|
||||
metadata={
|
||||
"help": (
|
||||
"The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
)
|
||||
},
|
||||
)
|
||||
overwrite_cache: bool = field(
|
||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_process_index,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.parallel_mode.value == "distributed"),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_process_index):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed
|
||||
set_seed(training_args.seed)
|
||||
|
||||
try:
|
||||
processor = processors[data_args.task_name]()
|
||||
label_list = processor.get_labels()
|
||||
num_labels = len(label_list)
|
||||
except KeyError:
|
||||
raise ValueError("Task not found: %s" % (data_args.task_name))
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
num_labels=num_labels,
|
||||
finetuning_task=data_args.task_name,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
model = AutoModelForMultipleChoice.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
# Get datasets
|
||||
train_dataset = (
|
||||
MultipleChoiceDataset(
|
||||
data_dir=data_args.data_dir,
|
||||
tokenizer=tokenizer,
|
||||
task=data_args.task_name,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
overwrite_cache=data_args.overwrite_cache,
|
||||
mode=Split.train,
|
||||
)
|
||||
if training_args.do_train
|
||||
else None
|
||||
)
|
||||
eval_dataset = (
|
||||
MultipleChoiceDataset(
|
||||
data_dir=data_args.data_dir,
|
||||
tokenizer=tokenizer,
|
||||
task=data_args.task_name,
|
||||
max_seq_length=data_args.max_seq_length,
|
||||
overwrite_cache=data_args.overwrite_cache,
|
||||
mode=Split.dev,
|
||||
)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
|
||||
def compute_metrics(p: EvalPrediction) -> dict:
|
||||
preds = np.argmax(p.predictions, axis=1)
|
||||
return {"acc": simple_accuracy(preds, p.label_ids)}
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
compute_metrics=compute_metrics,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
trainer.train(
|
||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||
)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_master():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
result = trainer.evaluate()
|
||||
|
||||
output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
|
||||
if trainer.is_world_master():
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key, value in result.items():
|
||||
logger.info(" %s = %s", key, value)
|
||||
writer.write("{} = {}\n".format(key, value))
|
||||
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,483 +0,0 @@
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension"""
|
||||
|
||||
import csv
|
||||
import glob
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
import tqdm
|
||||
from filelock import FileLock
|
||||
|
||||
from transformers import PreTrainedTokenizer, is_torch_available
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputExample:
|
||||
"""
|
||||
A single training/test example for multiple choice
|
||||
|
||||
Args:
|
||||
example_id: Unique id for the example.
|
||||
question: string. The untokenized text of the second sequence (question).
|
||||
contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
|
||||
endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
|
||||
label: (Optional) string. The label of the example. This should be
|
||||
specified for train and dev examples, but not for test examples.
|
||||
"""
|
||||
|
||||
example_id: str
|
||||
question: str
|
||||
contexts: list[str]
|
||||
endings: list[str]
|
||||
label: Optional[str]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class InputFeatures:
|
||||
"""
|
||||
A single set of features of data.
|
||||
Property names are the same names as the corresponding inputs to a model.
|
||||
"""
|
||||
|
||||
example_id: str
|
||||
input_ids: list[list[int]]
|
||||
attention_mask: Optional[list[list[int]]]
|
||||
token_type_ids: Optional[list[list[int]]]
|
||||
label: Optional[int]
|
||||
|
||||
|
||||
class Split(Enum):
|
||||
train = "train"
|
||||
dev = "dev"
|
||||
test = "test"
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
class MultipleChoiceDataset(Dataset):
|
||||
features: list[InputFeatures]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
data_dir: str,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
task: str,
|
||||
max_seq_length: Optional[int] = None,
|
||||
overwrite_cache=False,
|
||||
mode: Split = Split.train,
|
||||
):
|
||||
processor = processors[task]()
|
||||
|
||||
cached_features_file = os.path.join(
|
||||
data_dir,
|
||||
"cached_{}_{}_{}_{}".format(
|
||||
mode.value,
|
||||
tokenizer.__class__.__name__,
|
||||
str(max_seq_length),
|
||||
task,
|
||||
),
|
||||
)
|
||||
|
||||
# Make sure only the first process in distributed training processes the dataset,
|
||||
# and the others will use the cache.
|
||||
lock_path = cached_features_file + ".lock"
|
||||
with FileLock(lock_path):
|
||||
if os.path.exists(cached_features_file) and not overwrite_cache:
|
||||
logger.info(f"Loading features from cached file {cached_features_file}")
|
||||
self.features = torch.load(cached_features_file, weights_only=True)
|
||||
else:
|
||||
logger.info(f"Creating features from dataset file at {data_dir}")
|
||||
label_list = processor.get_labels()
|
||||
if mode == Split.dev:
|
||||
examples = processor.get_dev_examples(data_dir)
|
||||
elif mode == Split.test:
|
||||
examples = processor.get_test_examples(data_dir)
|
||||
else:
|
||||
examples = processor.get_train_examples(data_dir)
|
||||
logger.info("Training examples: %s", len(examples))
|
||||
self.features = convert_examples_to_features(
|
||||
examples,
|
||||
label_list,
|
||||
max_seq_length,
|
||||
tokenizer,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(self.features, cached_features_file)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.features)
|
||||
|
||||
def __getitem__(self, i) -> InputFeatures:
|
||||
return self.features[i]
|
||||
|
||||
|
||||
class DataProcessor:
|
||||
"""Base class for data converters for multiple choice data sets."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the train set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the dev set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""Gets a collection of `InputExample`s for the test set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
def get_labels(self):
|
||||
"""Gets the list of labels for this data set."""
|
||||
raise NotImplementedError()
|
||||
|
||||
|
||||
class RaceProcessor(DataProcessor):
|
||||
"""Processor for the RACE data set."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
high = os.path.join(data_dir, "train/high")
|
||||
middle = os.path.join(data_dir, "train/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
high = os.path.join(data_dir, "dev/high")
|
||||
middle = os.path.join(data_dir, "dev/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} test")
|
||||
high = os.path.join(data_dir, "test/high")
|
||||
middle = os.path.join(data_dir, "test/middle")
|
||||
high = self._read_txt(high)
|
||||
middle = self._read_txt(middle)
|
||||
return self._create_examples(high + middle, "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_txt(self, input_dir):
|
||||
lines = []
|
||||
files = glob.glob(input_dir + "/*txt")
|
||||
for file in tqdm.tqdm(files, desc="read files"):
|
||||
with open(file, encoding="utf-8") as fin:
|
||||
data_raw = json.load(fin)
|
||||
data_raw["race_id"] = file
|
||||
lines.append(data_raw)
|
||||
return lines
|
||||
|
||||
def _create_examples(self, lines, set_type):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
examples = []
|
||||
for _, data_raw in enumerate(lines):
|
||||
race_id = "{}-{}".format(set_type, data_raw["race_id"])
|
||||
article = data_raw["article"]
|
||||
for i in range(len(data_raw["answers"])):
|
||||
truth = str(ord(data_raw["answers"][i]) - ord("A"))
|
||||
question = data_raw["questions"][i]
|
||||
options = data_raw["options"][i]
|
||||
|
||||
examples.append(
|
||||
InputExample(
|
||||
example_id=race_id,
|
||||
question=question,
|
||||
contexts=[article, article, article, article], # this is not efficient but convenient
|
||||
endings=[options[0], options[1], options[2], options[3]],
|
||||
label=truth,
|
||||
)
|
||||
)
|
||||
return examples
|
||||
|
||||
|
||||
class SynonymProcessor(DataProcessor):
|
||||
"""Processor for the Synonym data set."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1", "2", "3", "4"]
|
||||
|
||||
def _read_csv(self, input_file):
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
return list(csv.reader(f))
|
||||
|
||||
def _create_examples(self, lines: list[list[str]], type: str):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
|
||||
examples = [
|
||||
InputExample(
|
||||
example_id=line[0],
|
||||
question="", # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
contexts=[line[1], line[1], line[1], line[1], line[1]],
|
||||
endings=[line[2], line[3], line[4], line[5], line[6]],
|
||||
label=line[7],
|
||||
)
|
||||
for line in lines # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
|
||||
class SwagProcessor(DataProcessor):
|
||||
"""Processor for the SWAG data set."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
raise ValueError(
|
||||
"For swag testing, the input file does not contain a label column. It can not be tested in current code "
|
||||
"setting!"
|
||||
)
|
||||
return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_csv(self, input_file):
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
return list(csv.reader(f))
|
||||
|
||||
def _create_examples(self, lines: list[list[str]], type: str):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
if type == "train" and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
|
||||
examples = [
|
||||
InputExample(
|
||||
example_id=line[2],
|
||||
question=line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
contexts=[line[4], line[4], line[4], line[4]],
|
||||
endings=[line[7], line[8], line[9], line[10]],
|
||||
label=line[11],
|
||||
)
|
||||
for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
|
||||
class ArcProcessor(DataProcessor):
|
||||
"""Processor for the ARC data set (request from allennlp)."""
|
||||
|
||||
def get_train_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} train")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
|
||||
|
||||
def get_dev_examples(self, data_dir):
|
||||
"""See base class."""
|
||||
logger.info(f"LOOKING AT {data_dir} dev")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
|
||||
|
||||
def get_test_examples(self, data_dir):
|
||||
logger.info(f"LOOKING AT {data_dir} test")
|
||||
return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
|
||||
|
||||
def get_labels(self):
|
||||
"""See base class."""
|
||||
return ["0", "1", "2", "3"]
|
||||
|
||||
def _read_json(self, input_file):
|
||||
with open(input_file, encoding="utf-8") as fin:
|
||||
lines = fin.readlines()
|
||||
return lines
|
||||
|
||||
def _create_examples(self, lines, type):
|
||||
"""Creates examples for the training and dev sets."""
|
||||
|
||||
# There are two types of labels. They should be normalized
|
||||
def normalize(truth):
|
||||
if truth in "ABCD":
|
||||
return ord(truth) - ord("A")
|
||||
elif truth in "1234":
|
||||
return int(truth) - 1
|
||||
else:
|
||||
logger.info("truth ERROR! %s", str(truth))
|
||||
return None
|
||||
|
||||
examples = []
|
||||
three_choice = 0
|
||||
four_choice = 0
|
||||
five_choice = 0
|
||||
other_choices = 0
|
||||
# we deleted example which has more than or less than four choices
|
||||
for line in tqdm.tqdm(lines, desc="read arc data"):
|
||||
data_raw = json.loads(line.strip("\n"))
|
||||
if len(data_raw["question"]["choices"]) == 3:
|
||||
three_choice += 1
|
||||
continue
|
||||
elif len(data_raw["question"]["choices"]) == 5:
|
||||
five_choice += 1
|
||||
continue
|
||||
elif len(data_raw["question"]["choices"]) != 4:
|
||||
other_choices += 1
|
||||
continue
|
||||
four_choice += 1
|
||||
truth = str(normalize(data_raw["answerKey"]))
|
||||
assert truth != "None"
|
||||
question_choices = data_raw["question"]
|
||||
question = question_choices["stem"]
|
||||
id = data_raw["id"]
|
||||
options = question_choices["choices"]
|
||||
if len(options) == 4:
|
||||
examples.append(
|
||||
InputExample(
|
||||
example_id=id,
|
||||
question=question,
|
||||
contexts=[
|
||||
options[0]["para"].replace("_", ""),
|
||||
options[1]["para"].replace("_", ""),
|
||||
options[2]["para"].replace("_", ""),
|
||||
options[3]["para"].replace("_", ""),
|
||||
],
|
||||
endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
|
||||
label=truth,
|
||||
)
|
||||
)
|
||||
|
||||
if type == "train":
|
||||
assert len(examples) > 1
|
||||
assert examples[0].label is not None
|
||||
logger.info("len examples: %s}", str(len(examples)))
|
||||
logger.info("Three choices: %s", str(three_choice))
|
||||
logger.info("Five choices: %s", str(five_choice))
|
||||
logger.info("Other choices: %s", str(other_choices))
|
||||
logger.info("four choices: %s", str(four_choice))
|
||||
|
||||
return examples
|
||||
|
||||
|
||||
def convert_examples_to_features(
|
||||
examples: list[InputExample],
|
||||
label_list: list[str],
|
||||
max_length: int,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
) -> list[InputFeatures]:
|
||||
"""
|
||||
Loads a data file into a list of `InputFeatures`
|
||||
"""
|
||||
|
||||
label_map = {label: i for i, label in enumerate(label_list)}
|
||||
|
||||
features = []
|
||||
for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
|
||||
if ex_index % 10000 == 0:
|
||||
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
|
||||
choices_inputs = []
|
||||
for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
|
||||
text_a = context
|
||||
if example.question.find("_") != -1:
|
||||
# this is for cloze question
|
||||
text_b = example.question.replace("_", ending)
|
||||
else:
|
||||
text_b = example.question + " " + ending
|
||||
|
||||
inputs = tokenizer(
|
||||
text_a,
|
||||
text_b,
|
||||
add_special_tokens=True,
|
||||
max_length=max_length,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
return_overflowing_tokens=True,
|
||||
)
|
||||
if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
|
||||
logger.info(
|
||||
"Attention! you are cropping tokens (swag task is ok). "
|
||||
"If you are training ARC and RACE and you are popping question + options, "
|
||||
"you need to try to use a bigger max seq length!"
|
||||
)
|
||||
|
||||
choices_inputs.append(inputs)
|
||||
|
||||
label = label_map[example.label]
|
||||
|
||||
input_ids = [x["input_ids"] for x in choices_inputs]
|
||||
attention_mask = (
|
||||
[x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
|
||||
)
|
||||
token_type_ids = (
|
||||
[x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
|
||||
)
|
||||
|
||||
features.append(
|
||||
InputFeatures(
|
||||
example_id=example.example_id,
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
token_type_ids=token_type_ids,
|
||||
label=label,
|
||||
)
|
||||
)
|
||||
|
||||
for f in features[:2]:
|
||||
logger.info("*** Example ***")
|
||||
logger.info("feature: %s" % f)
|
||||
|
||||
return features
|
||||
|
||||
|
||||
processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
|
||||
MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}
|
||||
@ -1,397 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import pytorch_lightning as pl
|
||||
from pytorch_lightning.utilities import rank_zero_info
|
||||
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModel,
|
||||
AutoModelForPreTraining,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoModelForSequenceClassification,
|
||||
AutoModelForTokenClassification,
|
||||
AutoModelWithLMHead,
|
||||
AutoTokenizer,
|
||||
PreTrainedConfig,
|
||||
PreTrainedTokenizer,
|
||||
is_torch_available,
|
||||
)
|
||||
from transformers.optimization import (
|
||||
Adafactor,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
)
|
||||
from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
require_version("pytorch_lightning>=1.0.4")
|
||||
|
||||
MODEL_MODES = {
|
||||
"base": AutoModel,
|
||||
"sequence-classification": AutoModelForSequenceClassification,
|
||||
"question-answering": AutoModelForQuestionAnswering,
|
||||
"pretraining": AutoModelForPreTraining,
|
||||
"token-classification": AutoModelForTokenClassification,
|
||||
"language-modeling": AutoModelWithLMHead,
|
||||
"summarization": AutoModelForSeq2SeqLM,
|
||||
"translation": AutoModelForSeq2SeqLM,
|
||||
}
|
||||
|
||||
|
||||
# update this and the import above to support new schedulers from transformers.optimization
|
||||
arg_to_scheduler = {
|
||||
"linear": get_linear_schedule_with_warmup,
|
||||
"cosine": get_cosine_schedule_with_warmup,
|
||||
"cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
"polynomial": get_polynomial_decay_schedule_with_warmup,
|
||||
# '': get_constant_schedule, # not supported for now
|
||||
# '': get_constant_schedule_with_warmup, # not supported for now
|
||||
}
|
||||
arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
|
||||
arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"
|
||||
|
||||
|
||||
class BaseTransformer(pl.LightningModule):
|
||||
def __init__(
|
||||
self,
|
||||
hparams: argparse.Namespace,
|
||||
num_labels=None,
|
||||
mode="base",
|
||||
config=None,
|
||||
tokenizer=None,
|
||||
model=None,
|
||||
**config_kwargs,
|
||||
):
|
||||
"""Initialize a model, tokenizer and config."""
|
||||
super().__init__()
|
||||
# TODO: move to self.save_hyperparameters()
|
||||
# self.save_hyperparameters()
|
||||
# can also expand arguments into trainer signature for easier reading
|
||||
|
||||
self.save_hyperparameters(hparams)
|
||||
self.step_count = 0
|
||||
self.output_dir = Path(self.hparams.output_dir)
|
||||
cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
|
||||
if config is None:
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
|
||||
**({"num_labels": num_labels} if num_labels is not None else {}),
|
||||
cache_dir=cache_dir,
|
||||
**config_kwargs,
|
||||
)
|
||||
else:
|
||||
self.config: PreTrainedConfig = config
|
||||
|
||||
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
|
||||
for p in extra_model_params:
|
||||
if getattr(self.hparams, p, None):
|
||||
assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
|
||||
setattr(self.config, p, getattr(self.hparams, p))
|
||||
|
||||
if tokenizer is None:
|
||||
self.tokenizer = AutoTokenizer.from_pretrained(
|
||||
self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
else:
|
||||
self.tokenizer: PreTrainedTokenizer = tokenizer
|
||||
self.model_type = MODEL_MODES[mode]
|
||||
if model is None:
|
||||
self.model = self.model_type.from_pretrained(
|
||||
self.hparams.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
|
||||
config=self.config,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
else:
|
||||
self.model = model
|
||||
|
||||
def load_hf_checkpoint(self, *args, **kwargs):
|
||||
self.model = self.model_type.from_pretrained(*args, **kwargs)
|
||||
|
||||
def get_lr_scheduler(self):
|
||||
get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
|
||||
scheduler = get_schedule_func(
|
||||
self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
|
||||
)
|
||||
scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
|
||||
return scheduler
|
||||
|
||||
def configure_optimizers(self):
|
||||
"""Prepare optimizer and schedule (linear warmup and decay)"""
|
||||
model = self.model
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": self.hparams.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
if self.hparams.adafactor:
|
||||
optimizer = Adafactor(
|
||||
optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
|
||||
)
|
||||
|
||||
else:
|
||||
optimizer = torch.optim.AdamW(
|
||||
optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
|
||||
)
|
||||
self.opt = optimizer
|
||||
|
||||
scheduler = self.get_lr_scheduler()
|
||||
|
||||
return [optimizer], [scheduler]
|
||||
|
||||
def test_step(self, batch, batch_nb):
|
||||
return self.validation_step(batch, batch_nb)
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
return self.validation_end(outputs)
|
||||
|
||||
def total_steps(self) -> int:
|
||||
"""The number of total training steps that will be run. Used for lr scheduler purposes."""
|
||||
num_devices = max(1, self.hparams.gpus) # TODO: consider num_tpu_cores
|
||||
effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
|
||||
return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
|
||||
|
||||
def setup(self, mode):
|
||||
if mode == "test":
|
||||
self.dataset_size = len(self.test_dataloader().dataset)
|
||||
else:
|
||||
self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
|
||||
self.dataset_size = len(self.train_dataloader().dataset)
|
||||
|
||||
def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
|
||||
raise NotImplementedError("You must implement this for your task")
|
||||
|
||||
def train_dataloader(self):
|
||||
return self.train_loader
|
||||
|
||||
def val_dataloader(self):
|
||||
return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)
|
||||
|
||||
def test_dataloader(self):
|
||||
return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)
|
||||
|
||||
def _feature_file(self, mode):
|
||||
return os.path.join(
|
||||
self.hparams.data_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
mode,
|
||||
list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
|
||||
str(self.hparams.max_seq_length),
|
||||
),
|
||||
)
|
||||
|
||||
@pl.utilities.rank_zero_only
|
||||
def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
|
||||
save_path = self.output_dir.joinpath("best_tfmr")
|
||||
self.model.config.save_step = self.step_count
|
||||
self.model.save_pretrained(save_path)
|
||||
self.tokenizer.save_pretrained(save_path)
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default=None,
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from huggingface.co",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--encoder_layerdrop",
|
||||
type=float,
|
||||
help="Encoder layer dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--decoder_layerdrop",
|
||||
type=float,
|
||||
help="Decoder layer dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dropout",
|
||||
type=float,
|
||||
help="Dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--attention_dropout",
|
||||
type=float,
|
||||
help="Attention dropout probability (Optional). Goes into model.config",
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--lr_scheduler",
|
||||
default="linear",
|
||||
choices=arg_to_scheduler_choices,
|
||||
metavar=arg_to_scheduler_metavar,
|
||||
type=str,
|
||||
help="Learning rate scheduler",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
|
||||
parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
|
||||
parser.add_argument("--train_batch_size", default=32, type=int)
|
||||
parser.add_argument("--eval_batch_size", default=32, type=int)
|
||||
parser.add_argument("--adafactor", action="store_true")
|
||||
|
||||
|
||||
class LoggingCallback(pl.Callback):
|
||||
def on_batch_end(self, trainer, pl_module):
|
||||
lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
|
||||
lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())}
|
||||
pl_module.logger.log_metrics(lrs)
|
||||
|
||||
def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
|
||||
rank_zero_info("***** Validation results *****")
|
||||
metrics = trainer.callback_metrics
|
||||
# Log results
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info(f"{key} = {str(metrics[key])}\n")
|
||||
|
||||
def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
|
||||
rank_zero_info("***** Test results *****")
|
||||
metrics = trainer.callback_metrics
|
||||
# Log and save results to file
|
||||
output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
|
||||
with open(output_test_results_file, "w") as writer:
|
||||
for key in sorted(metrics):
|
||||
if key not in ["log", "progress_bar"]:
|
||||
rank_zero_info(f"{key} = {str(metrics[key])}\n")
|
||||
writer.write(f"{key} = {str(metrics[key])}\n")
|
||||
|
||||
|
||||
def add_generic_args(parser, root_dir) -> None:
|
||||
# To allow all pl args uncomment the following line
|
||||
# parser = pl.Trainer.add_argparse_args(parser)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O2",
|
||||
help=(
|
||||
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
|
||||
"See details at https://nvidia.github.io/apex/amp.html"
|
||||
),
|
||||
)
|
||||
parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
|
||||
parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
dest="accumulate_grad_batches",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
|
||||
)
|
||||
|
||||
|
||||
def generic_train(
|
||||
model: BaseTransformer,
|
||||
args: argparse.Namespace,
|
||||
early_stopping_callback=None,
|
||||
logger=True, # can pass WandbLogger() here
|
||||
extra_callbacks=[],
|
||||
checkpoint_callback=None,
|
||||
logging_callback=None,
|
||||
**extra_train_kwargs,
|
||||
):
|
||||
pl.seed_everything(args.seed)
|
||||
|
||||
# init model
|
||||
odir = Path(model.hparams.output_dir)
|
||||
odir.mkdir(exist_ok=True)
|
||||
|
||||
# add custom checkpoints
|
||||
if checkpoint_callback is None:
|
||||
checkpoint_callback = pl.callbacks.ModelCheckpoint(
|
||||
filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
|
||||
)
|
||||
if early_stopping_callback:
|
||||
extra_callbacks.append(early_stopping_callback)
|
||||
if logging_callback is None:
|
||||
logging_callback = LoggingCallback()
|
||||
|
||||
train_params = {}
|
||||
|
||||
# TODO: remove with PyTorch 1.6 since pl uses native amp
|
||||
if args.fp16:
|
||||
train_params["precision"] = 16
|
||||
train_params["amp_level"] = args.fp16_opt_level
|
||||
|
||||
if args.gpus > 1:
|
||||
train_params["distributed_backend"] = "ddp"
|
||||
|
||||
train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
|
||||
train_params["accelerator"] = extra_train_kwargs.get("accelerator")
|
||||
train_params["profiler"] = extra_train_kwargs.get("profiler")
|
||||
|
||||
trainer = pl.Trainer.from_argparse_args(
|
||||
args,
|
||||
weights_summary=None,
|
||||
callbacks=[logging_callback] + extra_callbacks,
|
||||
logger=logger,
|
||||
checkpoint_callback=checkpoint_callback,
|
||||
**train_params,
|
||||
)
|
||||
|
||||
if args.do_train:
|
||||
trainer.fit(model)
|
||||
|
||||
return trainer
|
||||
@ -1,21 +0,0 @@
|
||||
tensorboard
|
||||
scikit-learn
|
||||
seqeval
|
||||
psutil
|
||||
sacrebleu
|
||||
rouge-score
|
||||
tensorflow_datasets
|
||||
matplotlib
|
||||
git-python==1.0.3
|
||||
faiss-cpu
|
||||
streamlit
|
||||
elasticsearch
|
||||
nltk
|
||||
pandas
|
||||
datasets >= 1.1.3
|
||||
fire
|
||||
pytest<8.0.1
|
||||
conllu
|
||||
sentencepiece != 0.1.92
|
||||
protobuf
|
||||
ray
|
||||
@ -1,201 +0,0 @@
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
from argparse import Namespace
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
|
||||
from transformers import glue_compute_metrics as compute_metrics
|
||||
from transformers import glue_convert_examples_to_features as convert_examples_to_features
|
||||
from transformers import glue_output_modes, glue_tasks_num_labels
|
||||
from transformers import glue_processors as processors
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class GLUETransformer(BaseTransformer):
|
||||
mode = "sequence-classification"
|
||||
|
||||
def __init__(self, hparams):
|
||||
if isinstance(hparams, dict):
|
||||
hparams = Namespace(**hparams)
|
||||
hparams.glue_output_mode = glue_output_modes[hparams.task]
|
||||
num_labels = glue_tasks_num_labels[hparams.task]
|
||||
|
||||
super().__init__(hparams, num_labels, self.mode)
|
||||
|
||||
def forward(self, **inputs):
|
||||
return self.model(**inputs)
|
||||
|
||||
def training_step(self, batch, batch_idx):
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
|
||||
if self.config.model_type not in ["distilbert", "bart"]:
|
||||
inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
|
||||
|
||||
outputs = self(**inputs)
|
||||
loss = outputs[0]
|
||||
|
||||
lr_scheduler = self.trainer.lr_schedulers[0]["scheduler"]
|
||||
tensorboard_logs = {"loss": loss, "rate": lr_scheduler.get_last_lr()[-1]}
|
||||
return {"loss": loss, "log": tensorboard_logs}
|
||||
|
||||
def prepare_data(self):
|
||||
"Called to initialize data. Use the call to construct features"
|
||||
args = self.hparams
|
||||
processor = processors[args.task]()
|
||||
self.labels = processor.get_labels()
|
||||
|
||||
for mode in ["train", "dev"]:
|
||||
cached_features_file = self._feature_file(mode)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
examples = (
|
||||
processor.get_dev_examples(args.data_dir)
|
||||
if mode == "dev"
|
||||
else processor.get_train_examples(args.data_dir)
|
||||
)
|
||||
features = convert_examples_to_features(
|
||||
examples,
|
||||
self.tokenizer,
|
||||
max_length=args.max_seq_length,
|
||||
label_list=self.labels,
|
||||
output_mode=args.glue_output_mode,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
|
||||
"Load datasets. Called after prepare data."
|
||||
|
||||
# We evaluate on the dev set to compare against benchmarks without having to submit to the GLUE server
|
||||
mode = "dev" if mode == "test" else mode
|
||||
|
||||
cached_features_file = self._feature_file(mode)
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file, weights_only=True)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||
if self.hparams.glue_output_mode == "classification":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
elif self.hparams.glue_output_mode == "regression":
|
||||
all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
|
||||
|
||||
return DataLoader(
|
||||
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
|
||||
batch_size=batch_size,
|
||||
shuffle=shuffle,
|
||||
)
|
||||
|
||||
def validation_step(self, batch, batch_idx):
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
|
||||
if self.config.model_type not in ["distilbert", "bart"]:
|
||||
inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
|
||||
|
||||
outputs = self(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
|
||||
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
|
||||
|
||||
def _eval_end(self, outputs) -> tuple:
|
||||
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean().detach().cpu().item()
|
||||
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
|
||||
|
||||
if self.hparams.glue_output_mode == "classification":
|
||||
preds = np.argmax(preds, axis=1)
|
||||
elif self.hparams.glue_output_mode == "regression":
|
||||
preds = np.squeeze(preds)
|
||||
|
||||
out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
results = {"val_loss": val_loss_mean, **compute_metrics(self.hparams.task, preds, out_label_ids)}
|
||||
|
||||
ret = dict(results.items())
|
||||
ret["log"] = results
|
||||
return ret, preds_list, out_label_list
|
||||
|
||||
def validation_epoch_end(self, outputs: list) -> dict:
|
||||
ret, preds, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
def test_epoch_end(self, outputs) -> dict:
|
||||
ret, predictions, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
# `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
|
||||
return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
BaseTransformer.add_model_specific_args(parser, root_dir)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--task",
|
||||
default="",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The GLUE task to run",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpus",
|
||||
default=0,
|
||||
type=int,
|
||||
help="The number of GPUs allocated for this, it is by default 0 meaning none",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
add_generic_args(parser, os.getcwd())
|
||||
parser = GLUETransformer.add_model_specific_args(parser, os.getcwd())
|
||||
args = parser.parse_args()
|
||||
|
||||
# If output_dir not provided, a folder will be generated in pwd
|
||||
if args.output_dir is None:
|
||||
args.output_dir = os.path.join(
|
||||
"./results",
|
||||
f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
|
||||
)
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
model = GLUETransformer(args)
|
||||
trainer = generic_train(model, args)
|
||||
|
||||
# Optionally, predict on dev set and write to output_dir
|
||||
if args.do_predict:
|
||||
checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
|
||||
model = model.load_from_checkpoint(checkpoints[-1])
|
||||
return trainer.test(model)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,34 +0,0 @@
|
||||
# Install example requirements
|
||||
pip install -r ../requirements.txt
|
||||
|
||||
# Download glue data
|
||||
python3 ../../utils/download_glue_data.py
|
||||
|
||||
export TASK=mrpc
|
||||
export DATA_DIR=./glue_data/MRPC/
|
||||
export MAX_LENGTH=128
|
||||
export LEARNING_RATE=2e-5
|
||||
export BERT_MODEL=bert-base-cased
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SEED=2
|
||||
export OUTPUT_DIR_NAME=mrpc-pl-bert
|
||||
export CURRENT_DIR=${PWD}
|
||||
export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
|
||||
|
||||
# Make output directory if it doesn't exist
|
||||
mkdir -p $OUTPUT_DIR
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \
|
||||
--task $TASK \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--learning_rate $LEARNING_RATE \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_predict
|
||||
@ -1,216 +0,0 @@
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
from argparse import Namespace
|
||||
from importlib import import_module
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from lightning_base import BaseTransformer, add_generic_args, generic_train
|
||||
from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
|
||||
from torch.nn import CrossEntropyLoss
|
||||
from torch.utils.data import DataLoader, TensorDataset
|
||||
from utils_ner import TokenClassificationTask
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class NERTransformer(BaseTransformer):
|
||||
"""
|
||||
A training module for NER. See BaseTransformer for the core options.
|
||||
"""
|
||||
|
||||
mode = "token-classification"
|
||||
|
||||
def __init__(self, hparams):
|
||||
if isinstance(hparams, dict):
|
||||
hparams = Namespace(**hparams)
|
||||
module = import_module("tasks")
|
||||
try:
|
||||
token_classification_task_clazz = getattr(module, hparams.task_type)
|
||||
self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
|
||||
except AttributeError:
|
||||
raise ValueError(
|
||||
f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
|
||||
f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
|
||||
)
|
||||
self.labels = self.token_classification_task.get_labels(hparams.labels)
|
||||
self.pad_token_label_id = CrossEntropyLoss().ignore_index
|
||||
super().__init__(hparams, len(self.labels), self.mode)
|
||||
|
||||
def forward(self, **inputs):
|
||||
return self.model(**inputs)
|
||||
|
||||
def training_step(self, batch, batch_num):
|
||||
"Compute loss and log."
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if self.config.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if self.config.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don't use token_type_ids
|
||||
|
||||
outputs = self(**inputs)
|
||||
loss = outputs[0]
|
||||
# tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
|
||||
return {"loss": loss}
|
||||
|
||||
def prepare_data(self):
|
||||
"Called to initialize data. Use the call to construct features"
|
||||
args = self.hparams
|
||||
for mode in ["train", "dev", "test"]:
|
||||
cached_features_file = self._feature_file(mode)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file, weights_only=True)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", args.data_dir)
|
||||
examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
|
||||
features = self.token_classification_task.convert_examples_to_features(
|
||||
examples,
|
||||
self.labels,
|
||||
args.max_seq_length,
|
||||
self.tokenizer,
|
||||
cls_token_at_end=bool(self.config.model_type == "xlnet"),
|
||||
cls_token=self.tokenizer.cls_token,
|
||||
cls_token_segment_id=2 if self.config.model_type == "xlnet" else 0,
|
||||
sep_token=self.tokenizer.sep_token,
|
||||
sep_token_extra=False,
|
||||
pad_on_left=bool(self.config.model_type == "xlnet"),
|
||||
pad_token=self.tokenizer.pad_token_id,
|
||||
pad_token_segment_id=self.tokenizer.pad_token_type_id,
|
||||
pad_token_label_id=self.pad_token_label_id,
|
||||
)
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
def get_dataloader(self, mode: int, batch_size: int, shuffle: bool = False) -> DataLoader:
|
||||
"Load datasets. Called after prepare data."
|
||||
cached_features_file = self._feature_file(mode)
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file, weights_only=True)
|
||||
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
|
||||
all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
|
||||
if features[0].token_type_ids is not None:
|
||||
all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
|
||||
else:
|
||||
all_token_type_ids = torch.tensor([0 for f in features], dtype=torch.long)
|
||||
# HACK: placeholder token_type_ids (this will be removed soon)
|
||||
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
|
||||
return DataLoader(
|
||||
TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids), batch_size=batch_size
|
||||
)
|
||||
|
||||
def validation_step(self, batch, batch_nb):
|
||||
"""Compute validation""" ""
|
||||
inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
|
||||
if self.config.model_type != "distilbert":
|
||||
inputs["token_type_ids"] = (
|
||||
batch[2] if self.config.model_type in ["bert", "xlnet"] else None
|
||||
) # XLM and RoBERTa don't use token_type_ids
|
||||
outputs = self(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
preds = logits.detach().cpu().numpy()
|
||||
out_label_ids = inputs["labels"].detach().cpu().numpy()
|
||||
return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
|
||||
|
||||
def _eval_end(self, outputs):
|
||||
"Evaluation called for both Val and Test"
|
||||
val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
|
||||
preds = np.concatenate([x["pred"] for x in outputs], axis=0)
|
||||
preds = np.argmax(preds, axis=2)
|
||||
out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
|
||||
|
||||
label_map = dict(enumerate(self.labels))
|
||||
out_label_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
preds_list = [[] for _ in range(out_label_ids.shape[0])]
|
||||
|
||||
for i in range(out_label_ids.shape[0]):
|
||||
for j in range(out_label_ids.shape[1]):
|
||||
if out_label_ids[i, j] != self.pad_token_label_id:
|
||||
out_label_list[i].append(label_map[out_label_ids[i][j]])
|
||||
preds_list[i].append(label_map[preds[i][j]])
|
||||
|
||||
results = {
|
||||
"val_loss": val_loss_mean,
|
||||
"accuracy_score": accuracy_score(out_label_list, preds_list),
|
||||
"precision": precision_score(out_label_list, preds_list),
|
||||
"recall": recall_score(out_label_list, preds_list),
|
||||
"f1": f1_score(out_label_list, preds_list),
|
||||
}
|
||||
|
||||
ret = dict(results.items())
|
||||
ret["log"] = results
|
||||
return ret, preds_list, out_label_list
|
||||
|
||||
def validation_epoch_end(self, outputs):
|
||||
# when stable
|
||||
ret, preds, targets = self._eval_end(outputs)
|
||||
logs = ret["log"]
|
||||
return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
def test_epoch_end(self, outputs):
|
||||
# updating to test_epoch_end instead of deprecated test_end
|
||||
ret, predictions, targets = self._eval_end(outputs)
|
||||
|
||||
# Converting to the dict required by pl
|
||||
# https://github.com/PyTorchLightning/pytorch-lightning/blob/master/\
|
||||
# pytorch_lightning/trainer/logging.py#L139
|
||||
logs = ret["log"]
|
||||
# `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
|
||||
return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
|
||||
|
||||
@staticmethod
|
||||
def add_model_specific_args(parser, root_dir):
|
||||
# Add NER specific options
|
||||
BaseTransformer.add_model_specific_args(parser, root_dir)
|
||||
parser.add_argument(
|
||||
"--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=128,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--labels",
|
||||
default="",
|
||||
type=str,
|
||||
help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gpus",
|
||||
default=0,
|
||||
type=int,
|
||||
help="The number of GPUs allocated for this, it is by default 0 meaning none",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
add_generic_args(parser, os.getcwd())
|
||||
parser = NERTransformer.add_model_specific_args(parser, os.getcwd())
|
||||
args = parser.parse_args()
|
||||
model = NERTransformer(args)
|
||||
trainer = generic_train(model, args)
|
||||
|
||||
if args.do_predict:
|
||||
# See https://github.com/huggingface/transformers/issues/3159
|
||||
# pl uses this default format to create a checkpoint:
|
||||
# https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
|
||||
# /pytorch_lightning/callbacks/model_checkpoint.py#L322
|
||||
checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
|
||||
model = model.load_from_checkpoint(checkpoints[-1])
|
||||
trainer.test(model)
|
||||
@ -1,44 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# for seqeval metrics import
|
||||
pip install -r ../requirements.txt
|
||||
|
||||
## The relevant files are currently on a shared Google
|
||||
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
|
||||
## Monitor for changes and eventually migrate to use the `datasets` library
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-base-multilingual-cased
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$" | sort | uniq > labels.txt
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SEED=1
|
||||
|
||||
export OUTPUT_DIR_NAME=germeval-model
|
||||
export CURRENT_DIR=${PWD}
|
||||
export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
|
||||
mkdir -p $OUTPUT_DIR
|
||||
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--gpus 1 \
|
||||
--do_train \
|
||||
--do_predict
|
||||
@ -1,39 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
if ! [ -f ./dev.txt ]; then
|
||||
echo "Download dev dataset...."
|
||||
curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./test.txt ]; then
|
||||
echo "Download test dataset...."
|
||||
curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
|
||||
fi
|
||||
|
||||
if ! [ -f ./train.txt ]; then
|
||||
echo "Download train dataset...."
|
||||
curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
|
||||
fi
|
||||
|
||||
export MAX_LENGTH=200
|
||||
export BERT_MODEL=bert-base-uncased
|
||||
export OUTPUT_DIR=postagger-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
|
||||
# Add parent directory to python path to access lightning_base.py
|
||||
export PYTHONPATH="../":"${PYTHONPATH}"
|
||||
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--task_type POS \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--train_batch_size $BATCH_SIZE \
|
||||
--seed $SEED \
|
||||
--gpus 1 \
|
||||
--do_train \
|
||||
--do_predict
|
||||
@ -1,126 +0,0 @@
|
||||
#### Fine-tuning BERT on SQuAD1.0 with relative position embeddings
|
||||
|
||||
The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model
`google-bert/bert-base-uncased` was pretrained with the default absolute position embeddings. We provide the following
pretrained models, which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as the original
BERT model, but with different relative position embeddings.
|
||||
|
||||
* `zhiheng-huang/bert-base-uncased-embedding-relative-key`, trained from scratch with the relative embedding proposed by
Shaw et al., [Self-Attention with Relative Position Representations](https://huggingface.co/papers/1803.02155)
* `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4
in Huang et al., [Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
* `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from
`google-bert/bert-large-uncased-whole-word-masking` for 3 additional epochs with relative embedding method 4 in Huang et al.,
[Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
|
||||
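These checkpoints load like any other model on the Hub. The snippet below is a minimal sketch of pulling one of the relative-position checkpoints listed above before fine-tuning; it only uses the standard `transformers` auto classes, and the printed embedding type is an expectation rather than something verified here.

```python
from transformers import AutoConfig, AutoModelForQuestionAnswering, AutoTokenizer

# Any checkpoint from the list above can be substituted here.
model_name = "zhiheng-huang/bert-base-uncased-embedding-relative-key-query"

config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name, config=config)

# BERT-style configs expose the embedding scheme, e.g. "relative_key_query".
print(config.position_embedding_type)
```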
|
||||
|
||||
##### Base models fine-tuning
|
||||
|
||||
```bash
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 512 \
|
||||
--doc_stride 128 \
|
||||
--output_dir relative_squad \
|
||||
--per_device_eval_batch_size=60 \
|
||||
--per_device_train_batch_size=6
|
||||
```
|
||||
Training with the above command leads to the following results, boosting the default BERT F1 score from 88.52 to 90.54.
|
||||
|
||||
```bash
|
||||
'exact': 83.6802270577105, 'f1': 90.54772098174814
|
||||
```
|
||||
|
||||
Changing `max_seq_length` from 512 to 384 in the above command leads to an F1 score of 90.34. Replacing the above
model `zhiheng-huang/bert-base-uncased-embedding-relative-key-query` with
`zhiheng-huang/bert-base-uncased-embedding-relative-key` leads to an F1 score of 89.51. Training on a single GPU
instead of 8 leads to an F1 score of 90.71.
|
||||
|
||||
##### Large models fine-tuning
|
||||
|
||||
```bash
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 512 \
|
||||
--doc_stride 128 \
|
||||
--output_dir relative_squad \
|
||||
--per_gpu_eval_batch_size=6 \
|
||||
--per_gpu_train_batch_size=2 \
|
||||
--gradient_accumulation_steps 3
|
||||
```
|
||||
Training with the above command leads to the f1 score of 93.52, which is slightly better than the f1 score of 93.15 for
|
||||
`google-bert/bert-large-uncased-whole-word-masking`.
|
||||
|
||||
#### Distributed training
|
||||
|
||||
Here is an example using distributed training on 8 V100 GPUs and the BERT whole-word-masking uncased model to reach an F1 > 93 on SQuAD1.1:
|
||||
|
||||
```bash
|
||||
torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
|
||||
--model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
|
||||
--dataset_name squad \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--learning_rate 3e-5 \
|
||||
--num_train_epochs 2 \
|
||||
--max_seq_length 384 \
|
||||
--doc_stride 128 \
|
||||
--output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
|
||||
--per_device_eval_batch_size=3 \
|
||||
--per_device_train_batch_size=3
|
||||
```
|
||||
|
||||
Training with the previously defined hyper-parameters yields the following results:
|
||||
|
||||
```bash
|
||||
f1 = 93.15
|
||||
exact_match = 86.91
|
||||
```
|
||||
|
||||
This fine-tuned model is available as a checkpoint under the reference
|
||||
[`google-bert/bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
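To sanity-check that checkpoint without re-running fine-tuning, it can be queried directly. The snippet below is a minimal sketch using the standard `pipeline` API; the question and context strings are made up for illustration.

```python
from transformers import pipeline

qa = pipeline(
    "question-answering",
    model="google-bert/bert-large-uncased-whole-word-masking-finetuned-squad",
)

# Toy example; any SQuAD-style (question, context) pair works.
result = qa(
    question="What data were the models pre-trained on?",
    context="The models were pre-trained on BooksCorpus and English Wikipedia.",
)
print(result["answer"], round(result["score"], 3))
```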
|
||||
|
||||
## Results
|
||||
|
||||
A larger batch size may improve performance at the cost of more memory.
|
||||
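One way to reason about this trade-off: the effective batch size is the per-device batch size times the number of devices times the gradient accumulation steps, so accumulation steps can stand in for memory. A small sketch of the arithmetic for the two fine-tuning commands above (assuming 8 GPUs, as in those examples):

```python
def effective_batch_size(per_device: int, n_devices: int, accum_steps: int = 1) -> int:
    # Effective batch = per-device batch * number of devices * gradient accumulation steps.
    return per_device * n_devices * accum_steps

# Base-model command: --per_device_train_batch_size=6 on 8 GPUs, no accumulation.
print(effective_batch_size(6, 8))      # 48
# Large-model command: --per_gpu_train_batch_size=2 on 8 GPUs, --gradient_accumulation_steps 3.
print(effective_batch_size(2, 8, 3))   # 48
```

Both commands therefore optimize with the same effective batch of 48; only the per-device memory footprint differs.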
|
||||
##### Results for SQuAD1.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
"exact": 85.45884578997162,
|
||||
"f1": 92.5974600601065,
|
||||
"total": 10570,
|
||||
"HasAns_exact": 85.45884578997162,
|
||||
"HasAns_f1": 92.59746006010651,
|
||||
"HasAns_total": 10570
|
||||
}
|
||||
```
|
||||
|
||||
##### Results for SQuAD2.0 with the previously defined hyper-parameters:
|
||||
|
||||
```python
|
||||
{
|
||||
"exact": 80.4177545691906,
|
||||
"f1": 84.07154997729623,
|
||||
"total": 11873,
|
||||
"HasAns_exact": 76.73751686909581,
|
||||
"HasAns_f1": 84.05558584352873,
|
||||
"HasAns_total": 5928,
|
||||
"NoAns_exact": 84.0874684608915,
|
||||
"NoAns_f1": 84.0874684608915,
|
||||
"NoAns_total": 5945
|
||||
}
|
||||
```
|
||||
@ -1,824 +0,0 @@
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
|
||||
|
||||
import argparse
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
import timeit
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
MODEL_FOR_QUESTION_ANSWERING_MAPPING,
|
||||
WEIGHTS_NAME,
|
||||
AutoConfig,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
squad_convert_examples_to_features,
|
||||
)
|
||||
from transformers.data.metrics.squad_metrics import (
|
||||
compute_predictions_log_probs,
|
||||
compute_predictions_logits,
|
||||
squad_evaluate,
|
||||
)
|
||||
from transformers.data.processors.squad import SquadResult, SquadV1Processor, SquadV2Processor
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def to_list(tensor):
|
||||
return tensor.tolist()
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
"""Train the model"""
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer = SummaryWriter()
|
||||
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
# Check if saved optimizer or scheduler states exist
|
||||
if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
|
||||
os.path.join(args.model_name_or_path, "scheduler.pt")
|
||||
):
|
||||
# Load in optimizer and scheduler states
|
||||
optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt"), weights_only=True))
|
||||
scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt"), weights_only=True))
|
||||
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
if args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 1
|
||||
epochs_trained = 0
|
||||
steps_trained_in_current_epoch = 0
|
||||
# Check if continuing training from a checkpoint
|
||||
if os.path.exists(args.model_name_or_path):
|
||||
try:
|
||||
# set global_step to global_step of last saved checkpoint from model path
|
||||
checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
|
||||
global_step = int(checkpoint_suffix)
|
||||
epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
|
||||
|
||||
logger.info(" Continuing training from checkpoint, will skip to saved global_step")
|
||||
logger.info(" Continuing training from epoch %d", epochs_trained)
|
||||
logger.info(" Continuing training from global step %d", global_step)
|
||||
logger.info(" Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
|
||||
except ValueError:
|
||||
logger.info(" Starting fine-tuning.")
|
||||
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(
|
||||
epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
|
||||
)
|
||||
# Added here for reproducibility
|
||||
set_seed(args)
|
||||
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
# Skip past any already trained steps if resuming training
|
||||
if steps_trained_in_current_epoch > 0:
|
||||
steps_trained_in_current_epoch -= 1
|
||||
continue
|
||||
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
"start_positions": batch[3],
|
||||
"end_positions": batch[4],
|
||||
}
|
||||
|
||||
if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
|
||||
if args.version_2_with_negative:
|
||||
inputs.update({"is_impossible": batch[7]})
|
||||
if hasattr(model, "config") and hasattr(model.config, "lang2id"):
|
||||
inputs.update(
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
|
||||
outputs = model(**inputs)
|
||||
# model outputs are always tuple in transformers (see doc)
|
||||
loss = outputs[0]
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
if args.fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
else:
|
||||
loss.backward()
|
||||
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
if args.fp16:
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
else:
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
# Log metrics
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Only evaluate when single GPU otherwise metrics may not average well
|
||||
if args.local_rank == -1 and args.evaluate_during_training:
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar(f"eval_{key}", value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
# Save model checkpoint
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_pretrained(output_dir)
|
||||
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
|
||||
torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
|
||||
logger.info("Saving optimizer and scheduler states to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
train_iterator.close()
|
||||
break
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer.close()
|
||||
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
|
||||
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
|
||||
# Note that DistributedSampler samples randomly
|
||||
eval_sampler = SequentialSampler(dataset)
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# multi-gpu evaluate
|
||||
if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Eval!
|
||||
logger.info(f"***** Running evaluation {prefix} *****")
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
all_results = []
|
||||
start_time = timeit.default_timer()
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
|
||||
with torch.no_grad():
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
"token_type_ids": batch[2],
|
||||
}
|
||||
|
||||
if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
|
||||
del inputs["token_type_ids"]
|
||||
|
||||
feature_indices = batch[3]
|
||||
|
||||
# XLNet and XLM use more arguments for their predictions
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
inputs.update({"cls_index": batch[4], "p_mask": batch[5]})
|
||||
# for lang_id-sensitive xlm models
|
||||
if hasattr(model, "config") and hasattr(model.config, "lang2id"):
|
||||
inputs.update(
|
||||
{"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
|
||||
)
|
||||
outputs = model(**inputs)
|
||||
|
||||
for i, feature_index in enumerate(feature_indices):
|
||||
eval_feature = features[feature_index.item()]
|
||||
unique_id = int(eval_feature.unique_id)
|
||||
|
||||
output = [to_list(output[i]) for output in outputs.to_tuple()]
|
||||
|
||||
# Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler"
|
||||
# models only use two.
|
||||
if len(output) >= 5:
|
||||
start_logits = output[0]
|
||||
start_top_index = output[1]
|
||||
end_logits = output[2]
|
||||
end_top_index = output[3]
|
||||
cls_logits = output[4]
|
||||
|
||||
result = SquadResult(
|
||||
unique_id,
|
||||
start_logits,
|
||||
end_logits,
|
||||
start_top_index=start_top_index,
|
||||
end_top_index=end_top_index,
|
||||
cls_logits=cls_logits,
|
||||
)
|
||||
|
||||
else:
|
||||
start_logits, end_logits = output
|
||||
result = SquadResult(unique_id, start_logits, end_logits)
|
||||
|
||||
all_results.append(result)
|
||||
|
||||
evalTime = timeit.default_timer() - start_time
|
||||
logger.info(" Evaluation done in total %f secs (%f sec per example)", evalTime, evalTime / len(dataset))
|
||||
|
||||
# Compute predictions
|
||||
output_prediction_file = os.path.join(args.output_dir, f"predictions_{prefix}.json")
|
||||
output_nbest_file = os.path.join(args.output_dir, f"nbest_predictions_{prefix}.json")
|
||||
|
||||
if args.version_2_with_negative:
|
||||
output_null_log_odds_file = os.path.join(args.output_dir, f"null_odds_{prefix}.json")
|
||||
else:
|
||||
output_null_log_odds_file = None
|
||||
|
||||
# XLNet and XLM use a more complex post-processing procedure
|
||||
if args.model_type in ["xlnet", "xlm"]:
|
||||
start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
|
||||
end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
|
||||
|
||||
predictions = compute_predictions_log_probs(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
start_n_top,
|
||||
end_n_top,
|
||||
args.version_2_with_negative,
|
||||
tokenizer,
|
||||
args.verbose_logging,
|
||||
)
|
||||
else:
|
||||
predictions = compute_predictions_logits(
|
||||
examples,
|
||||
features,
|
||||
all_results,
|
||||
args.n_best_size,
|
||||
args.max_answer_length,
|
||||
args.do_lower_case,
|
||||
output_prediction_file,
|
||||
output_nbest_file,
|
||||
output_null_log_odds_file,
|
||||
args.verbose_logging,
|
||||
args.version_2_with_negative,
|
||||
args.null_score_diff_threshold,
|
||||
tokenizer,
|
||||
)
|
||||
|
||||
# Compute the F1 and exact scores.
|
||||
results = squad_evaluate(examples, predictions)
|
||||
return results
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0] and not evaluate:
|
||||
# Make sure only the first process in distributed training processes the dataset; the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_dir = args.data_dir if args.data_dir else "."
|
||||
cached_features_file = os.path.join(
|
||||
input_dir,
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
|
||||
# Init features and dataset from cache if it exists
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features_and_dataset = torch.load(cached_features_file, weights_only=True)
|
||||
features, dataset, examples = (
|
||||
features_and_dataset["features"],
|
||||
features_and_dataset["dataset"],
|
||||
features_and_dataset["examples"],
|
||||
)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_dir)
|
||||
|
||||
if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
|
||||
try:
|
||||
import tensorflow_datasets as tfds
|
||||
except ImportError:
|
||||
raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.")
|
||||
|
||||
if args.version_2_with_negative:
|
||||
logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")
|
||||
|
||||
tfds_examples = tfds.load("squad")
|
||||
examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
|
||||
else:
|
||||
processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
|
||||
if evaluate:
|
||||
examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
|
||||
else:
|
||||
examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
|
||||
|
||||
features, dataset = squad_convert_examples_to_features(
|
||||
examples=examples,
|
||||
tokenizer=tokenizer,
|
||||
max_seq_length=args.max_seq_length,
|
||||
doc_stride=args.doc_stride,
|
||||
max_query_length=args.max_query_length,
|
||||
is_training=not evaluate,
|
||||
return_dataset="pt",
|
||||
threads=args.threads,
|
||||
)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
|
||||
|
||||
if args.local_rank == 0 and not evaluate:
|
||||
# Make sure only the first process in distributed training processes the dataset; the others will use the cache
|
||||
torch.distributed.barrier()
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--model_type",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_TYPES),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--data_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input data dir. Should contain the .json files for the task."
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--train_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input training file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
help="The input evaluation file. If a data dir is specified, will look for the file there"
|
||||
+ "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
default="",
|
||||
type=str,
|
||||
help="Where do you want to store the pre-trained models downloaded from huggingface.co",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--version_2_with_negative",
|
||||
action="store_true",
|
||||
help="If true, the SQuAD examples contain some that do not have an answer.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--null_score_diff_threshold",
|
||||
type=float,
|
||||
default=0.0,
|
||||
help="If null_score - best_non_null is greater than the threshold predict null.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum total input sequence length after WordPiece tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--doc_stride",
|
||||
default=128,
|
||||
type=int,
|
||||
help="When splitting up a long document into chunks, how much stride to take between chunks.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_query_length",
|
||||
default=64,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum number of tokens for the question. Questions longer than this will "
|
||||
"be truncated to this length."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Run evaluation during training at each logging step."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument(
|
||||
"--n_best_size",
|
||||
default=20,
|
||||
type=int,
|
||||
help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_answer_length",
|
||||
default=30,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum length of an answer that can be generated. This is needed because the start "
|
||||
"and end predictions are not conditioned on one another."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--verbose_logging",
|
||||
action="store_true",
|
||||
help=(
|
||||
"If true, all of the warnings related to data processing will be printed. "
|
||||
"A number of warnings are expected for a normal SQuAD evaluation."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--lang_id",
|
||||
default=0,
|
||||
type=int,
|
||||
help=(
|
||||
"language id of input for language-specific xlm models (see"
|
||||
" tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)"
|
||||
),
|
||||
)
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=500, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help=(
|
||||
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
|
||||
"See details at https://nvidia.github.io/apex/amp.html"
|
||||
),
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
|
||||
parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.doc_stride >= args.max_seq_length - args.max_query_length:
|
||||
logger.warning(
|
||||
"WARNING - You've set a doc stride which may be superior to the document length in some "
|
||||
"examples. This could result in errors when building features from the examples. Please reduce the doc "
|
||||
"stride or increase the maximum length to ensure the features are correctly built."
|
||||
)
|
||||
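# Worked example of the check above (illustrative values, assuming the usual SQuAD defaults):
# with max_seq_length=384 and max_query_length=64, any doc_stride >= 320 triggers the warning,
# while the common default of doc_stride=128 does not.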
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
# Setup CUDA, GPU & distributed training
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
if args.local_rank not in [-1, 0]:
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
args.model_type = args.model_type.lower()
|
||||
config = AutoConfig.from_pretrained(
|
||||
args.config_name if args.config_name else args.model_name_or_path,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
do_lower_case=args.do_lower_case,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
|
||||
)
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||
args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=args.cache_dir if args.cache_dir else None,
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
# Make sure only the first process in distributed training will download model & vocab
|
||||
torch.distributed.barrier()
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
|
||||
# Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
|
||||
# remove the need for this code, but it is still valid.
|
||||
if args.fp16:
|
||||
try:
|
||||
import apex
|
||||
|
||||
apex.amp.register_half_function(torch, "einsum")
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
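# For instance (illustrative tensor names, not from this script): after the registration above,
# torch.einsum("bsh,oh->bso", hidden_states, weight) on CUDA tensors runs in fp16 under
# opt level O1 instead of being promoted to fp32.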
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
# Take care of distributed/parallel training
|
||||
model_to_save = model.module if hasattr(model, "module") else model
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir) # , force_download=True)
|
||||
|
||||
# SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
|
||||
# So we use use_fast=False here for now, until Fast-tokenizer-compatible examples are available
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
if args.do_train:
|
||||
logger.info("Loading checkpoints saved during training for evaluation")
|
||||
checkpoints = [args.output_dir]
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = [
|
||||
os.path.dirname(c)
|
||||
for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
]
|
||||
|
||||
else:
|
||||
logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint) # , force_download=True)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
|
||||
results.update(result)
|
||||
|
||||
logger.info(f"Results: {results}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,174 +0,0 @@
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fine-tuning the library models for question-answering."""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForQuestionAnswering,
|
||||
AutoTokenizer,
|
||||
DataCollatorWithPadding,
|
||||
HfArgumentParser,
|
||||
SquadDataset,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
)
|
||||
from transformers import SquadDataTrainingArguments as DataTrainingArguments
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
use_fast: bool = field(default=False, metadata={"help": "Set this flag to use fast tokenization."})
|
||||
# If you want to tweak more attributes on your tokenizer, you should do it in a distinct script,
|
||||
# or just modify its tokenizer_config.json.
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_process_index,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.parallel_mode.value == "distributed"),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_process_index):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Prepare Question-Answering task
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
|
||||
)
|
||||
model = AutoModelForQuestionAnswering.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
# Get datasets
|
||||
is_language_sensitive = hasattr(model.config, "lang2id")
|
||||
train_dataset = (
|
||||
SquadDataset(
|
||||
data_args, tokenizer=tokenizer, is_language_sensitive=is_language_sensitive, cache_dir=model_args.cache_dir
|
||||
)
|
||||
if training_args.do_train
|
||||
else None
|
||||
)
|
||||
eval_dataset = (
|
||||
SquadDataset(
|
||||
data_args,
|
||||
tokenizer=tokenizer,
|
||||
mode="dev",
|
||||
is_language_sensitive=is_language_sensitive,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
|
||||
# Data collator
|
||||
data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
data_collator=data_collator,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
trainer.train(
|
||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||
)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_master():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,47 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import torch
|
||||
|
||||
from transformers import CamembertForMaskedLM, CamembertTokenizer
|
||||
|
||||
|
||||
def fill_mask(masked_input, model, tokenizer, topk=5):
|
||||
# Adapted from https://github.com/pytorch/fairseq/blob/master/fairseq/models/roberta/hub_interface.py
|
||||
assert masked_input.count("<mask>") == 1
|
||||
input_ids = torch.tensor(tokenizer.encode(masked_input, add_special_tokens=True)).unsqueeze(0) # Batch size 1
|
||||
logits = model(input_ids)[0]  # The prediction scores (logits) are the first element of the output tuple
|
||||
masked_index = (input_ids.squeeze() == tokenizer.mask_token_id).nonzero().item()
|
||||
logits = logits[0, masked_index, :]
|
||||
prob = logits.softmax(dim=0)
|
||||
values, indices = prob.topk(k=topk, dim=0)
|
||||
topk_predicted_token_bpe = " ".join(
|
||||
[tokenizer.convert_ids_to_tokens(indices[i].item()) for i in range(len(indices))]
|
||||
)
|
||||
masked_token = tokenizer.mask_token
|
||||
topk_filled_outputs = []
|
||||
for index, predicted_token_bpe in enumerate(topk_predicted_token_bpe.split(" ")):
|
||||
predicted_token = predicted_token_bpe.replace("\u2581", " ")
|
||||
if f" {masked_token}" in masked_input:
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(f" {masked_token}", predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
else:
|
||||
topk_filled_outputs.append(
|
||||
(
|
||||
masked_input.replace(masked_token, predicted_token),
|
||||
values[index].item(),
|
||||
predicted_token,
|
||||
)
|
||||
)
|
||||
return topk_filled_outputs
|
||||
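# Sketch of the return value, derived from the code above: a list of
# (filled_sentence, probability, predicted_token) tuples, one per top-k prediction, e.g.
# [("Le camembert est délicieux :)", 0.49, "délicieux"), ...] (values shown are illustrative only).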
|
||||
|
||||
tokenizer = CamembertTokenizer.from_pretrained("almanach/camembert-base")
|
||||
model = CamembertForMaskedLM.from_pretrained("almanach/camembert-base")
|
||||
model.eval()
|
||||
|
||||
masked_input = "Le camembert est <mask> :)"
|
||||
print(fill_mask(masked_input, model, tokenizer, topk=3))
|
||||
@ -1,147 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import argparse
|
||||
import json
|
||||
|
||||
from ltp import LTP
|
||||
|
||||
from transformers import BertTokenizer
|
||||
|
||||
|
||||
def _is_chinese_char(cp):
|
||||
"""Checks whether CP is the codepoint of a CJK character."""
|
||||
# This defines a "chinese character" as anything in the CJK Unicode block:
|
||||
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
|
||||
#
|
||||
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
|
||||
# despite its name. The modern Korean Hangul alphabet is a different block,
|
||||
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
|
||||
# space-separated words, so they are not treated specially and handled
|
||||
# like all of the other languages.
|
||||
if (
|
||||
(cp >= 0x4E00 and cp <= 0x9FFF)
|
||||
or (cp >= 0x3400 and cp <= 0x4DBF)
|
||||
or (cp >= 0x20000 and cp <= 0x2A6DF)
|
||||
or (cp >= 0x2A700 and cp <= 0x2B73F)
|
||||
or (cp >= 0x2B740 and cp <= 0x2B81F)
|
||||
or (cp >= 0x2B820 and cp <= 0x2CEAF)
|
||||
or (cp >= 0xF900 and cp <= 0xFAFF)
|
||||
or (cp >= 0x2F800 and cp <= 0x2FA1F)
|
||||
):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def is_chinese(word: str):
|
||||
# word like '180' or '身高' or '神'
|
||||
for char in word:
|
||||
char = ord(char)
|
||||
if not _is_chinese_char(char):
|
||||
return 0
|
||||
return 1
|
||||
|
||||
|
||||
def get_chinese_word(tokens: list[str]):
|
||||
word_set = set()
|
||||
|
||||
for token in tokens:
|
||||
chinese_word = len(token) > 1 and is_chinese(token)
|
||||
if chinese_word:
|
||||
word_set.add(token)
|
||||
word_list = list(word_set)
|
||||
return word_list
|
||||
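# Illustrative example (hypothetical input): get_chinese_word(["身高", "180", "的", "人"]) returns
# ["身高"] -- only tokens longer than one character that consist entirely of Chinese characters are kept.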
|
||||
|
||||
def add_sub_symbol(bert_tokens: list[str], chinese_word_set: set[str]):
|
||||
if not chinese_word_set:
|
||||
return bert_tokens
|
||||
max_word_len = max(len(w) for w in chinese_word_set)
|
||||
|
||||
bert_word = bert_tokens
|
||||
start, end = 0, len(bert_word)
|
||||
while start < end:
|
||||
single_word = True
|
||||
if is_chinese(bert_word[start]):
|
||||
l = min(end - start, max_word_len)
|
||||
for i in range(l, 1, -1):
|
||||
whole_word = "".join(bert_word[start : start + i])
|
||||
if whole_word in chinese_word_set:
|
||||
for j in range(start + 1, start + i):
|
||||
bert_word[j] = "##" + bert_word[j]
|
||||
start = start + i
|
||||
single_word = False
|
||||
break
|
||||
if single_word:
|
||||
start += 1
|
||||
return bert_word
|
||||
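# Illustrative example (hypothetical tokens): with bert_tokens = ["我", "喜", "欢", "你"] and
# chinese_word_set = {"喜欢"}, the trailing character of the LTP word "喜欢" is re-marked as a
# subword and the result is ["我", "喜", "##欢", "你"].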
|
||||
|
||||
def prepare_ref(lines: list[str], ltp_tokenizer: LTP, bert_tokenizer: BertTokenizer):
|
||||
ltp_res = []
|
||||
|
||||
for i in range(0, len(lines), 100):
|
||||
res = ltp_tokenizer.seg(lines[i : i + 100])[0]
|
||||
res = [get_chinese_word(r) for r in res]
|
||||
ltp_res.extend(res)
|
||||
assert len(ltp_res) == len(lines)
|
||||
|
||||
bert_res = []
|
||||
for i in range(0, len(lines), 100):
|
||||
res = bert_tokenizer(lines[i : i + 100], add_special_tokens=True, truncation=True, max_length=512)
|
||||
bert_res.extend(res["input_ids"])
|
||||
assert len(bert_res) == len(lines)
|
||||
|
||||
ref_ids = []
|
||||
for input_ids, chinese_word in zip(bert_res, ltp_res):
|
||||
input_tokens = []
|
||||
for id in input_ids:
|
||||
token = bert_tokenizer._convert_id_to_token(id)
|
||||
input_tokens.append(token)
|
||||
input_tokens = add_sub_symbol(input_tokens, chinese_word)
|
||||
ref_id = []
|
||||
# We only save the positions of Chinese subwords that start with ##, which means they are part of a whole word.
|
||||
for i, token in enumerate(input_tokens):
|
||||
if token[:2] == "##":
|
||||
clean_token = token[2:]
|
||||
# save chinese tokens' pos
|
||||
if len(clean_token) == 1 and _is_chinese_char(ord(clean_token)):
|
||||
ref_id.append(i)
|
||||
ref_ids.append(ref_id)
|
||||
|
||||
assert len(ref_ids) == len(bert_res)
|
||||
|
||||
return ref_ids
|
||||
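# Sketch of the output, assuming the BERT tokenizer splits Chinese text character by character:
# for the line "我喜欢你" with LTP word "喜欢", the BERT input tokens are
# ["[CLS]", "我", "喜", "欢", "你", "[SEP]"], add_sub_symbol rewrites index 3 as "##欢",
# and the ref ids recorded for that line are [3].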
|
||||
|
||||
def main(args):
|
||||
# For Chinese (Ro)BERTa, the best result comes from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
|
||||
# If we want to fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
|
||||
with open(args.file_name, encoding="utf-8") as f:
|
||||
data = f.readlines()
|
||||
data = [line.strip() for line in data if len(line) > 0 and not line.isspace()] # avoid delimiter like '\u2029'
|
||||
ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
|
||||
bert_tokenizer = BertTokenizer.from_pretrained(args.bert)
|
||||
|
||||
ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)
|
||||
|
||||
with open(args.save_path, "w", encoding="utf-8") as f:
|
||||
data = [json.dumps(ref) + "\n" for ref in ref_ids]
|
||||
f.writelines(data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="prepare_chinese_ref")
|
||||
parser.add_argument(
|
||||
"--file_name",
|
||||
type=str,
|
||||
default="./resources/chinese-demo.txt",
|
||||
help="file need process, same as training data in lm",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ltp", type=str, default="./resources/ltp", help="resources for LTP tokenizer, usually a path"
|
||||
)
|
||||
parser.add_argument("--bert", type=str, default="./resources/robert", help="resources for Bert tokenizer")
|
||||
parser.add_argument("--save_path", type=str, default="./resources/ref.txt", help="path to save res")
|
||||
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
@ -1,363 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, CTRL, BERT, RoBERTa, XLNet).
|
||||
GPT, GPT-2 and CTRL are fine-tuned using a causal language modeling (CLM) loss. BERT and RoBERTa are fine-tuned
|
||||
using a masked language modeling (MLM) loss. XLNet is fine-tuned using a permutation language modeling (PLM) loss.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from glob import glob
|
||||
from typing import Optional
|
||||
|
||||
from torch.utils.data import ConcatDataset
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_WITH_LM_HEAD_MAPPING,
|
||||
AutoConfig,
|
||||
AutoModelWithLMHead,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
DataCollatorForPermutationLanguageModeling,
|
||||
DataCollatorForWholeWordMask,
|
||||
HfArgumentParser,
|
||||
LineByLineTextDataset,
|
||||
LineByLineWithRefDataset,
|
||||
PreTrainedTokenizer,
|
||||
TextDataset,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
|
||||
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
|
||||
"""
|
||||
|
||||
model_name_or_path: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": (
|
||||
"The model checkpoint for weights initialization. Leave None if you want to train a model from"
|
||||
" scratch."
|
||||
)
|
||||
},
|
||||
)
|
||||
model_type: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
train_data_file: Optional[str] = field(
|
||||
default=None, metadata={"help": "The input training data file (a text file)."}
|
||||
)
|
||||
train_data_files: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"help": (
|
||||
"The input training data files (multiple files in glob format). "
|
||||
"Very often splitting large files to smaller files can prevent tokenizer going out of memory"
|
||||
)
|
||||
},
|
||||
)
|
||||
eval_data_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
|
||||
)
|
||||
train_ref_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input train ref data file for whole word mask in Chinese."},
|
||||
)
|
||||
eval_ref_file: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "An optional input eval ref data file for whole word mask in Chinese."},
|
||||
)
|
||||
line_by_line: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
|
||||
)
|
||||
|
||||
mlm: bool = field(
|
||||
default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
|
||||
)
|
||||
whole_word_mask: bool = field(default=False, metadata={"help": "Whether or not to use whole word masking."})
|
||||
mlm_probability: float = field(
|
||||
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
|
||||
)
|
||||
plm_probability: float = field(
|
||||
default=1 / 6,
|
||||
metadata={
|
||||
"help": (
|
||||
"Ratio of length of a span of masked tokens to surrounding context length for permutation language"
|
||||
" modeling."
|
||||
)
|
||||
},
|
||||
)
|
||||
max_span_length: int = field(
|
||||
default=5, metadata={"help": "Maximum length of a span of masked tokens for permutation language modeling."}
|
||||
)
|
||||
|
||||
block_size: int = field(
|
||||
default=-1,
|
||||
metadata={
|
||||
"help": (
|
||||
"Optional input sequence length after tokenization. "
|
||||
"The training dataset will be truncated in block of this size for training."
|
||||
"Default to the model max input length for single sentence inputs (take into account special tokens)."
|
||||
)
|
||||
},
|
||||
)
|
||||
overwrite_cache: bool = field(
|
||||
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
|
||||
)
|
||||
|
||||
|
||||
def get_dataset(
|
||||
args: DataTrainingArguments,
|
||||
tokenizer: PreTrainedTokenizer,
|
||||
evaluate: bool = False,
|
||||
cache_dir: Optional[str] = None,
|
||||
):
|
||||
def _dataset(file_path, ref_path=None):
|
||||
if args.line_by_line:
|
||||
if ref_path is not None:
|
||||
if not args.whole_word_mask or not args.mlm:
|
||||
raise ValueError("You need to set world whole masking and mlm to True for Chinese Whole Word Mask")
|
||||
return LineByLineWithRefDataset(
|
||||
tokenizer=tokenizer,
|
||||
file_path=file_path,
|
||||
block_size=args.block_size,
|
||||
ref_path=ref_path,
|
||||
)
|
||||
|
||||
return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=args.block_size)
|
||||
else:
|
||||
return TextDataset(
|
||||
tokenizer=tokenizer,
|
||||
file_path=file_path,
|
||||
block_size=args.block_size,
|
||||
overwrite_cache=args.overwrite_cache,
|
||||
cache_dir=cache_dir,
|
||||
)
|
||||
|
||||
if evaluate:
|
||||
return _dataset(args.eval_data_file, args.eval_ref_file)
|
||||
elif args.train_data_files:
|
||||
return ConcatDataset([_dataset(f) for f in glob(args.train_data_files)])
|
||||
else:
|
||||
return _dataset(args.train_data_file, args.train_ref_file)
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
if data_args.eval_data_file is None and training_args.do_eval:
|
||||
raise ValueError(
|
||||
"Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
|
||||
"or remove the --do_eval argument."
|
||||
)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_process_index,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.parallel_mode.value == "distributed"),
|
||||
training_args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_process_index):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
if model_args.config_name:
|
||||
config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
|
||||
elif model_args.model_name_or_path:
|
||||
config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
|
||||
else:
|
||||
config = CONFIG_MAPPING[model_args.model_type]()
|
||||
logger.warning("You are instantiating a new config instance from scratch.")
|
||||
|
||||
if model_args.tokenizer_name:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, cache_dir=model_args.cache_dir)
|
||||
elif model_args.model_name_or_path:
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
|
||||
else:
|
||||
raise ValueError(
|
||||
"You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another"
|
||||
" script, save it,and load it from here, using --tokenizer_name"
|
||||
)
|
||||
|
||||
if model_args.model_name_or_path:
|
||||
model = AutoModelWithLMHead.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=bool(".ckpt" in model_args.model_name_or_path),
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
else:
|
||||
logger.info("Training new model from scratch")
|
||||
model = AutoModelWithLMHead.from_config(config)
|
||||
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
|
||||
if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
|
||||
raise ValueError(
|
||||
"BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the "
|
||||
"--mlm flag (masked language modeling)."
|
||||
)
|
||||
|
||||
if data_args.block_size <= 0:
|
||||
data_args.block_size = tokenizer.max_len
|
||||
# Our input block size will be the max possible for the model
|
||||
else:
|
||||
data_args.block_size = min(data_args.block_size, tokenizer.max_len)
|
||||
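# Note: `tokenizer.max_len` is the tokenizer's maximum model input length (the attribute was later
# renamed `model_max_length`), so leaving block_size at -1 uses the largest block the model accepts.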
|
||||
# Get datasets
|
||||
|
||||
train_dataset = (
|
||||
get_dataset(data_args, tokenizer=tokenizer, cache_dir=model_args.cache_dir) if training_args.do_train else None
|
||||
)
|
||||
eval_dataset = (
|
||||
get_dataset(data_args, tokenizer=tokenizer, evaluate=True, cache_dir=model_args.cache_dir)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
if config.model_type == "xlnet":
|
||||
data_collator = DataCollatorForPermutationLanguageModeling(
|
||||
tokenizer=tokenizer,
|
||||
plm_probability=data_args.plm_probability,
|
||||
max_span_length=data_args.max_span_length,
|
||||
)
|
||||
else:
|
||||
if data_args.mlm and data_args.whole_word_mask:
|
||||
data_collator = DataCollatorForWholeWordMask(
|
||||
tokenizer=tokenizer, mlm_probability=data_args.mlm_probability
|
||||
)
|
||||
else:
|
||||
data_collator = DataCollatorForLanguageModeling(
|
||||
tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
|
||||
)
|
||||
|
||||
# Initialize our Trainer
|
||||
trainer = Trainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
data_collator=data_collator,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
prediction_loss_only=True,
|
||||
)
|
||||
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
model_path = (
|
||||
model_args.model_name_or_path
|
||||
if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
|
||||
else None
|
||||
)
|
||||
trainer.train(model_path=model_path)
|
||||
trainer.save_model()
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
if trainer.is_world_master():
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
# Evaluation
|
||||
results = {}
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
eval_output = trainer.evaluate()
|
||||
|
||||
perplexity = math.exp(eval_output["eval_loss"])
|
||||
result = {"perplexity": perplexity}
|
||||
|
||||
output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
|
||||
if trainer.is_world_master():
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
results.update(result)
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,319 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" OpenAI GPT model fine-tuning script.
|
||||
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
||||
Itself adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
||||
|
||||
With its default values, this script fine-tunes and evaluates a pretrained OpenAI GPT on the RocStories dataset:
|
||||
python run_openai_gpt.py \
|
||||
--model_name openai-community/openai-gpt \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
|
||||
--eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
|
||||
--output_dir ../log \
|
||||
--train_batch_size 16 \
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
from transformers import (
|
||||
CONFIG_NAME,
|
||||
WEIGHTS_NAME,
|
||||
OpenAIGPTDoubleHeadsModel,
|
||||
OpenAIGPTTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def load_rocstories_dataset(dataset_path):
|
||||
"""Output a list of tuples(story, 1st continuation, 2nd continuation, label)"""
|
||||
with open(dataset_path, encoding="utf_8") as f:
|
||||
f = csv.reader(f)
|
||||
output = []
|
||||
next(f) # skip the first line
|
||||
for line in tqdm(f):
|
||||
output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))
|
||||
return output
|
||||
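# Each returned tuple has the form ("<the four context sentences joined by spaces>", "<ending 1>",
# "<ending 2>", label), where `int(line[-1]) - 1` above shifts the CSV's 1/2 answer column to a 0/1 label.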
|
||||
|
||||
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
|
||||
"""Pre-process datasets containing lists of tuples(story, 1st continuation, 2nd continuation, label)
|
||||
|
||||
into Transformer inputs of shape (n_batch, n_alternatives, length), comprising for each batch and continuation:
|
||||
input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
"""
|
||||
tensor_datasets = []
|
||||
for dataset in encoded_datasets:
|
||||
n_batch = len(dataset)
|
||||
input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
|
||||
mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
|
||||
lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
|
||||
mc_labels = np.zeros((n_batch,), dtype=np.int64)
|
||||
for (
|
||||
i,
|
||||
(story, cont1, cont2, mc_label),
|
||||
) in enumerate(dataset):
|
||||
with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
|
||||
with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
|
||||
input_ids[i, 0, : len(with_cont1)] = with_cont1
|
||||
input_ids[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_token_ids[i, 0] = len(with_cont1) - 1
|
||||
mc_token_ids[i, 1] = len(with_cont2) - 1
|
||||
lm_labels[i, 0, : len(with_cont1)] = with_cont1
|
||||
lm_labels[i, 1, : len(with_cont2)] = with_cont2
|
||||
mc_labels[i] = mc_label
|
||||
all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
|
||||
tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
|
||||
return tensor_datasets
|
||||
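# Toy example of the produced layout (hypothetical token ids): with start_token=1, delimiter_token=2,
# clf_token=3, story=[10, 11] and cont1=[20], with_cont1 = [1, 10, 11, 2, 20, 3], so
# input_ids[i, 0, :6] and lm_labels[i, 0, :6] hold these ids (the remaining lm_labels positions stay
# at -100 and are ignored by the loss) and mc_token_ids[i, 0] = 5, the index of the clf token.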
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--model_name", type=str, default="openai-community/openai-gpt", help="pretrained model name")
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model predictions and checkpoints will be written.",
|
||||
)
|
||||
parser.add_argument("--train_dataset", type=str, default="")
|
||||
parser.add_argument("--eval_dataset", type=str, default="")
|
||||
parser.add_argument("--seed", type=int, default=42)
|
||||
parser.add_argument("--num_train_epochs", type=int, default=3)
|
||||
parser.add_argument("--train_batch_size", type=int, default=8)
|
||||
parser.add_argument("--eval_batch_size", type=int, default=16)
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", type=int, default=1)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help=(
|
||||
"If > 0: set total number of training steps to perform. Override num_train_epochs."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--learning_rate", type=float, default=6.25e-5)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
|
||||
parser.add_argument("--weight_decay", type=float, default=0.01)
|
||||
parser.add_argument("--lm_coef", type=float, default=0.9)
|
||||
parser.add_argument("--n_valid", type=int, default=374)
|
||||
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
print(args)
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
n_gpu = torch.cuda.device_count()
|
||||
logger.info(f"device: {device}, n_gpu {n_gpu}")
|
||||
|
||||
if not args.do_train and not args.do_eval:
|
||||
raise ValueError("At least one of `do_train` or `do_eval` must be True.")
|
||||
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
# Load tokenizer and model
|
||||
# This loading function also adds new tokens and embeddings called `special tokens`
|
||||
# These new embeddings will be fine-tuned on the RocStories dataset
|
||||
special_tokens = ["_start_", "_delimiter_", "_classify_"]
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
|
||||
tokenizer.add_tokens(special_tokens)
|
||||
special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
|
||||
model.resize_token_embeddings(len(tokenizer))
|
||||
model.to(device)
|
||||
|
||||
# Load and encode the datasets
|
||||
def tokenize_and_encode(obj):
|
||||
"""Tokenize and encode a nested object"""
|
||||
if isinstance(obj, str):
|
||||
return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
|
||||
elif isinstance(obj, int):
|
||||
return obj
|
||||
return [tokenize_and_encode(o) for o in obj]
|
||||
|
||||
logger.info("Encoding dataset...")
|
||||
train_dataset = load_rocstories_dataset(args.train_dataset)
|
||||
eval_dataset = load_rocstories_dataset(args.eval_dataset)
|
||||
datasets = (train_dataset, eval_dataset)
|
||||
encoded_datasets = tokenize_and_encode(datasets)
|
||||
|
||||
# Compute the max input length for the Transformer
|
||||
max_length = model.config.n_positions // 2 - 2
|
||||
input_length = max(
|
||||
len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
|
||||
for dataset in encoded_datasets
|
||||
for story, cont1, cont2, _ in dataset
|
||||
)
|
||||
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
|
||||
|
||||
# Prepare inputs tensors and dataloaders
|
||||
tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
|
||||
train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]
|
||||
|
||||
train_data = TensorDataset(*train_tensor_dataset)
|
||||
train_sampler = RandomSampler(train_data)
|
||||
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
eval_data = TensorDataset(*eval_tensor_dataset)
|
||||
eval_sampler = SequentialSampler(eval_data)
|
||||
eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Prepare optimizer
|
||||
if args.do_train:
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
param_optimizer = list(model.named_parameters())
|
||||
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
|
||||
if args.do_train:
|
||||
nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
|
||||
model.train()
|
||||
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
|
||||
tr_loss = 0
|
||||
nb_tr_steps = 0
|
||||
tqdm_bar = tqdm(train_dataloader, desc="Training")
|
||||
for step, batch in enumerate(tqdm_bar):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
|
||||
loss = args.lm_coef * losses[0] + losses[1]
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
optimizer.zero_grad()
|
||||
tr_loss += loss.item()
|
||||
exp_average_loss = (
|
||||
loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
|
||||
)
|
||||
nb_tr_steps += 1
|
||||
tqdm_bar.desc = f"Training loss: {exp_average_loss:.2e} lr: {scheduler.get_lr()[0]:.2e}"
|
||||
|
||||
# Save a trained model
|
||||
if args.do_train:
|
||||
# Save a trained model, configuration and tokenizer
|
||||
model_to_save = model.module if hasattr(model, "module") else model # Only save the model itself
|
||||
|
||||
# If we save using the predefined names, we can load using `from_pretrained`
|
||||
output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
|
||||
output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
|
||||
|
||||
torch.save(model_to_save.state_dict(), output_model_file)
|
||||
model_to_save.config.to_json_file(output_config_file)
|
||||
tokenizer.save_vocabulary(args.output_dir)
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
|
||||
tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
|
||||
model.to(device)
|
||||
|
||||
if args.do_eval:
|
||||
model.eval()
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
batch = tuple(t.to(device) for t in batch)
|
||||
input_ids, mc_token_ids, lm_labels, mc_labels = batch
|
||||
with torch.no_grad():
|
||||
_, mc_loss, _, mc_logits = model(
|
||||
input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
|
||||
)
|
||||
|
||||
mc_logits = mc_logits.detach().cpu().numpy()
|
||||
mc_labels = mc_labels.to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(mc_logits, mc_labels)
|
||||
|
||||
eval_loss += mc_loss.mean().item()
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_examples += input_ids.size(0)
|
||||
nb_eval_steps += 1
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
train_loss = tr_loss / nb_tr_steps if args.do_train else None
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info(" %s = %s", key, str(result[key]))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,706 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""BERT finetuning runner.
|
||||
Finetuning the library models for multiple choice on SWAG (Bert).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import csv
|
||||
import glob
|
||||
import logging
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
from tqdm import tqdm, trange
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
WEIGHTS_NAME,
|
||||
AutoConfig,
|
||||
AutoModelForMultipleChoice,
|
||||
AutoTokenizer,
|
||||
get_linear_schedule_with_warmup,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
|
||||
|
||||
try:
|
||||
from torch.utils.tensorboard import SummaryWriter
|
||||
except ImportError:
|
||||
from tensorboardX import SummaryWriter
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SwagExample:
|
||||
"""A single training/test example for the SWAG dataset."""
|
||||
|
||||
def __init__(self, swag_id, context_sentence, start_ending, ending_0, ending_1, ending_2, ending_3, label=None):
|
||||
self.swag_id = swag_id
|
||||
self.context_sentence = context_sentence
|
||||
self.start_ending = start_ending
|
||||
self.endings = [
|
||||
ending_0,
|
||||
ending_1,
|
||||
ending_2,
|
||||
ending_3,
|
||||
]
|
||||
self.label = label
|
||||
|
||||
def __str__(self):
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self):
|
||||
attributes = [
|
||||
f"swag_id: {self.swag_id}",
|
||||
f"context_sentence: {self.context_sentence}",
|
||||
f"start_ending: {self.start_ending}",
|
||||
f"ending_0: {self.endings[0]}",
|
||||
f"ending_1: {self.endings[1]}",
|
||||
f"ending_2: {self.endings[2]}",
|
||||
f"ending_3: {self.endings[3]}",
|
||||
]
|
||||
|
||||
if self.label is not None:
|
||||
attributes.append(f"label: {self.label}")
|
||||
|
||||
return ", ".join(attributes)
|
||||
|
||||
|
||||
class InputFeatures:
|
||||
def __init__(self, example_id, choices_features, label):
|
||||
self.example_id = example_id
|
||||
self.choices_features = [
|
||||
{"input_ids": input_ids, "input_mask": input_mask, "segment_ids": segment_ids}
|
||||
for _, input_ids, input_mask, segment_ids in choices_features
|
||||
]
|
||||
self.label = label
|
||||
|
||||
|
||||
def read_swag_examples(input_file, is_training=True):
|
||||
with open(input_file, encoding="utf-8") as f:
|
||||
lines = list(csv.reader(f))
|
||||
|
||||
if is_training and lines[0][-1] != "label":
|
||||
raise ValueError("For training, the input file must contain a label column.")
|
||||
|
||||
examples = [
|
||||
SwagExample(
|
||||
swag_id=line[2],
|
||||
context_sentence=line[4],
|
||||
start_ending=line[5], # in the swag dataset, the
|
||||
# common beginning of each
|
||||
# choice is stored in "sent2".
|
||||
ending_0=line[7],
|
||||
ending_1=line[8],
|
||||
ending_2=line[9],
|
||||
ending_3=line[10],
|
||||
label=int(line[11]) if is_training else None,
|
||||
)
|
||||
for line in lines[1:] # we skip the line with the column names
|
||||
]
|
||||
|
||||
return examples
|
||||
|
||||
|
||||
def convert_examples_to_features(examples, tokenizer, max_seq_length, is_training):
|
||||
"""Loads a data file into a list of `InputBatch`s."""
|
||||
|
||||
# Swag is a multiple choice task. To perform this task using Bert,
|
||||
# we will use the formatting proposed in "Improving Language
|
||||
# Understanding by Generative Pre-Training" and suggested by
|
||||
# @jacobdevlin-google in this issue
|
||||
# https://github.com/google-research/bert/issues/38.
|
||||
#
|
||||
# Each choice will correspond to a sample on which we run the
|
||||
# inference. For a given Swag example, we will create the 4
|
||||
# following inputs:
|
||||
# - [CLS] context [SEP] choice_1 [SEP]
|
||||
# - [CLS] context [SEP] choice_2 [SEP]
|
||||
# - [CLS] context [SEP] choice_3 [SEP]
|
||||
# - [CLS] context [SEP] choice_4 [SEP]
|
||||
# The model will output a single value for each input. To get the
|
||||
# final decision of the model, we will run a softmax over these 4
|
||||
# outputs.
|
||||
features = []
|
||||
for example_index, example in tqdm(enumerate(examples)):
|
||||
context_tokens = tokenizer.tokenize(example.context_sentence)
|
||||
start_ending_tokens = tokenizer.tokenize(example.start_ending)
|
||||
|
||||
choices_features = []
|
||||
for ending_index, ending in enumerate(example.endings):
|
||||
# We create a copy of the context tokens in order to be
|
||||
# able to shrink it according to ending_tokens
|
||||
context_tokens_choice = context_tokens[:]
|
||||
ending_tokens = start_ending_tokens + tokenizer.tokenize(ending)
|
||||
# Modifies `context_tokens_choice` and `ending_tokens` in
|
||||
# place so that the total length is less than the
|
||||
# specified length. Account for [CLS], [SEP], [SEP] with
|
||||
# "- 3"
|
||||
_truncate_seq_pair(context_tokens_choice, ending_tokens, max_seq_length - 3)
|
||||
|
||||
tokens = ["[CLS]"] + context_tokens_choice + ["[SEP]"] + ending_tokens + ["[SEP]"]
|
||||
segment_ids = [0] * (len(context_tokens_choice) + 2) + [1] * (len(ending_tokens) + 1)
|
||||
|
||||
input_ids = tokenizer.convert_tokens_to_ids(tokens)
|
||||
input_mask = [1] * len(input_ids)
|
||||
|
||||
# Zero-pad up to the sequence length.
|
||||
padding = [0] * (max_seq_length - len(input_ids))
|
||||
input_ids += padding
|
||||
input_mask += padding
|
||||
segment_ids += padding
|
||||
|
||||
assert len(input_ids) == max_seq_length
|
||||
assert len(input_mask) == max_seq_length
|
||||
assert len(segment_ids) == max_seq_length
|
||||
|
||||
choices_features.append((tokens, input_ids, input_mask, segment_ids))
|
||||
|
||||
label = example.label
|
||||
if example_index < 5:
|
||||
logger.info("*** Example ***")
|
||||
logger.info(f"swag_id: {example.swag_id}")
|
||||
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
|
||||
logger.info(f"choice: {choice_idx}")
|
||||
logger.info("tokens: {}".format(" ".join(tokens)))
|
||||
logger.info("input_ids: {}".format(" ".join(map(str, input_ids))))
|
||||
logger.info("input_mask: {}".format(" ".join(map(str, input_mask))))
|
||||
logger.info("segment_ids: {}".format(" ".join(map(str, segment_ids))))
|
||||
if is_training:
|
||||
logger.info(f"label: {label}")
|
||||
|
||||
features.append(InputFeatures(example_id=example.swag_id, choices_features=choices_features, label=label))
|
||||
|
||||
return features
|
||||
|
||||
|
||||
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
|
||||
"""Truncates a sequence pair in place to the maximum length."""
|
||||
|
||||
# This is a simple heuristic which will always truncate the longer sequence
|
||||
# one token at a time. This makes more sense than truncating an equal percent
|
||||
# of tokens from each, since if one sequence is very short then each token
|
||||
# that's truncated likely contains more information than a longer sequence.
|
||||
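# Example: with max_length=6, a 5-token tokens_a and a 2-token tokens_b,
# one token is popped from tokens_a (the longer sequence) so that 4 + 2 <= 6.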
while True:
|
||||
total_length = len(tokens_a) + len(tokens_b)
|
||||
if total_length <= max_length:
|
||||
break
|
||||
if len(tokens_a) > len(tokens_b):
|
||||
tokens_a.pop()
|
||||
else:
|
||||
tokens_b.pop()
|
||||
|
||||
|
||||
def accuracy(out, labels):
|
||||
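# Returns the *count* of correct predictions, not a ratio; the caller divides by the number of examples.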
outputs = np.argmax(out, axis=1)
|
||||
return np.sum(outputs == labels)
|
||||
|
||||
|
||||
def select_field(features, field):
|
||||
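# Gathers one field (e.g. "input_ids") from every choice of every example into a [num_examples][num_choices] nested list.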
return [[choice[field] for choice in feature.choices_features] for feature in features]
|
||||
|
||||
|
||||
def set_seed(args):
|
||||
random.seed(args.seed)
|
||||
np.random.seed(args.seed)
|
||||
torch.manual_seed(args.seed)
|
||||
if args.n_gpu > 0:
|
||||
torch.cuda.manual_seed_all(args.seed)
|
||||
|
||||
|
||||
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
|
||||
|
||||
# Load data features from cache or dataset file
|
||||
input_file = args.predict_file if evaluate else args.train_file
|
||||
cached_features_file = os.path.join(
|
||||
os.path.dirname(input_file),
|
||||
"cached_{}_{}_{}".format(
|
||||
"dev" if evaluate else "train",
|
||||
list(filter(None, args.model_name_or_path.split("/"))).pop(),
|
||||
str(args.max_seq_length),
|
||||
),
|
||||
)
|
||||
if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
|
||||
logger.info("Loading features from cached file %s", cached_features_file)
|
||||
features = torch.load(cached_features_file, weights_only=True)
|
||||
else:
|
||||
logger.info("Creating features from dataset file at %s", input_file)
|
||||
examples = read_swag_examples(input_file)
|
||||
features = convert_examples_to_features(examples, tokenizer, args.max_seq_length, not evaluate)
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
logger.info("Saving features into cached file %s", cached_features_file)
|
||||
torch.save(features, cached_features_file)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training processes the dataset; the others will use the cache
|
||||
|
||||
# Convert to Tensors and build dataset
|
||||
all_input_ids = torch.tensor(select_field(features, "input_ids"), dtype=torch.long)
|
||||
all_input_mask = torch.tensor(select_field(features, "input_mask"), dtype=torch.long)
|
||||
all_segment_ids = torch.tensor(select_field(features, "segment_ids"), dtype=torch.long)
|
||||
all_label = torch.tensor([f.label for f in features], dtype=torch.long)
|
||||
|
||||
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label)
|
||||
|
||||
if output_examples:
|
||||
return dataset, examples, features
|
||||
return dataset
|
||||
|
||||
|
||||
def train(args, train_dataset, model, tokenizer):
|
||||
"""Train the model"""
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer = SummaryWriter()
|
||||
|
||||
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
|
||||
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
|
||||
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
|
||||
|
||||
if args.max_steps > 0:
|
||||
t_total = args.max_steps
|
||||
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
|
||||
else:
|
||||
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
|
||||
|
||||
# Prepare optimizer and schedule (linear warmup and decay)
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": args.weight_decay,
|
||||
},
|
||||
{"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
|
||||
]
|
||||
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
|
||||
scheduler = get_linear_schedule_with_warmup(
|
||||
optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
|
||||
)
|
||||
if args.fp16:
|
||||
try:
|
||||
from apex import amp
|
||||
except ImportError:
|
||||
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
|
||||
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
|
||||
|
||||
# multi-gpu training (should be after apex fp16 initialization)
|
||||
if args.n_gpu > 1:
|
||||
model = torch.nn.DataParallel(model)
|
||||
|
||||
# Distributed training (should be after apex fp16 initialization)
|
||||
if args.local_rank != -1:
|
||||
model = torch.nn.parallel.DistributedDataParallel(
|
||||
model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
|
||||
)
|
||||
|
||||
# Train!
|
||||
logger.info("***** Running training *****")
|
||||
logger.info(" Num examples = %d", len(train_dataset))
|
||||
logger.info(" Num Epochs = %d", args.num_train_epochs)
|
||||
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
|
||||
logger.info(
|
||||
" Total train batch size (w. parallel, distributed & accumulation) = %d",
|
||||
args.train_batch_size
|
||||
* args.gradient_accumulation_steps
|
||||
* (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
|
||||
)
|
||||
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
|
||||
logger.info(" Total optimization steps = %d", t_total)
|
||||
|
||||
global_step = 0
|
||||
tr_loss, logging_loss = 0.0, 0.0
|
||||
model.zero_grad()
|
||||
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
|
||||
set_seed(args) # Added here for reproducibility
|
||||
for _ in train_iterator:
|
||||
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
|
||||
for step, batch in enumerate(epoch_iterator):
|
||||
model.train()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2],
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[5],
|
||||
# 'p_mask': batch[6]})
|
||||
outputs = model(**inputs)
|
||||
loss = outputs[0] # model outputs are always tuple in transformers (see doc)
|
||||
|
||||
if args.n_gpu > 1:
|
||||
loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
|
||||
if args.gradient_accumulation_steps > 1:
|
||||
loss = loss / args.gradient_accumulation_steps
|
||||
|
||||
if args.fp16:
|
||||
with amp.scale_loss(loss, optimizer) as scaled_loss:
|
||||
scaled_loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
|
||||
else:
|
||||
loss.backward()
|
||||
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
|
||||
|
||||
tr_loss += loss.item()
|
||||
if (step + 1) % args.gradient_accumulation_steps == 0:
|
||||
optimizer.step()
|
||||
scheduler.step() # Update learning rate schedule
|
||||
model.zero_grad()
|
||||
global_step += 1
|
||||
|
||||
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
|
||||
# Log metrics
|
||||
if (
|
||||
args.local_rank == -1 and args.evaluate_during_training
|
||||
): # Only evaluate when single GPU otherwise metrics may not average well
|
||||
results = evaluate(args, model, tokenizer)
|
||||
for key, value in results.items():
|
||||
tb_writer.add_scalar(f"eval_{key}", value, global_step)
|
||||
tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
|
||||
tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
|
||||
logging_loss = tr_loss
|
||||
|
||||
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
|
||||
# Save model checkpoint
|
||||
output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(output_dir)
|
||||
tokenizer.save_vocabulary(output_dir)
|
||||
torch.save(args, os.path.join(output_dir, "training_args.bin"))
|
||||
logger.info("Saving model checkpoint to %s", output_dir)
|
||||
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
epoch_iterator.close()
|
||||
break
|
||||
if args.max_steps > 0 and global_step > args.max_steps:
|
||||
train_iterator.close()
|
||||
break
|
||||
|
||||
if args.local_rank in [-1, 0]:
|
||||
tb_writer.close()
|
||||
|
||||
return global_step, tr_loss / global_step
|
||||
|
||||
|
||||
def evaluate(args, model, tokenizer, prefix=""):
|
||||
dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
|
||||
|
||||
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
|
||||
# Note that DistributedSampler samples randomly
|
||||
eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
|
||||
eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
|
||||
|
||||
# Eval!
|
||||
logger.info(f"***** Running evaluation {prefix} *****")
|
||||
logger.info(" Num examples = %d", len(dataset))
|
||||
logger.info(" Batch size = %d", args.eval_batch_size)
|
||||
|
||||
eval_loss, eval_accuracy = 0, 0
|
||||
nb_eval_steps, nb_eval_examples = 0, 0
|
||||
|
||||
for batch in tqdm(eval_dataloader, desc="Evaluating"):
|
||||
model.eval()
|
||||
batch = tuple(t.to(args.device) for t in batch)
|
||||
with torch.no_grad():
|
||||
inputs = {
|
||||
"input_ids": batch[0],
|
||||
"attention_mask": batch[1],
|
||||
# 'token_type_ids': None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids
|
||||
"token_type_ids": batch[2],
|
||||
"labels": batch[3],
|
||||
}
|
||||
|
||||
# if args.model_type in ['xlnet', 'xlm']:
|
||||
# inputs.update({'cls_index': batch[4],
|
||||
# 'p_mask': batch[5]})
|
||||
outputs = model(**inputs)
|
||||
tmp_eval_loss, logits = outputs[:2]
|
||||
eval_loss += tmp_eval_loss.mean().item()
|
||||
|
||||
logits = logits.detach().cpu().numpy()
|
||||
label_ids = inputs["labels"].to("cpu").numpy()
|
||||
tmp_eval_accuracy = accuracy(logits, label_ids)
|
||||
eval_accuracy += tmp_eval_accuracy
|
||||
|
||||
nb_eval_steps += 1
|
||||
nb_eval_examples += inputs["input_ids"].size(0)
|
||||
|
||||
eval_loss = eval_loss / nb_eval_steps
|
||||
eval_accuracy = eval_accuracy / nb_eval_examples
|
||||
result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy}
|
||||
|
||||
output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
|
||||
with open(output_eval_file, "w") as writer:
|
||||
logger.info("***** Eval results *****")
|
||||
for key in sorted(result.keys()):
|
||||
logger.info("%s = %s", key, str(result[key]))
|
||||
writer.write("{} = {}\n".format(key, str(result[key])))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
# Required parameters
|
||||
parser.add_argument(
|
||||
"--train_file", default=None, type=str, required=True, help="SWAG csv for training. E.g., train.csv"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--predict_file",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="SWAG csv for predictions. E.g., val.csv or test.csv",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_name_or_path",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output_dir",
|
||||
default=None,
|
||||
type=str,
|
||||
required=True,
|
||||
help="The output directory where the model checkpoints and predictions will be written.",
|
||||
)
|
||||
|
||||
# Other parameters
|
||||
parser.add_argument(
|
||||
"--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tokenizer_name",
|
||||
default="",
|
||||
type=str,
|
||||
help="Pretrained tokenizer name or path if not the same as model_name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_seq_length",
|
||||
default=384,
|
||||
type=int,
|
||||
help=(
|
||||
"The maximum total input sequence length after tokenization. Sequences "
|
||||
"longer than this will be truncated, and sequences shorter than this will be padded."
|
||||
),
|
||||
)
|
||||
parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
|
||||
parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
|
||||
parser.add_argument(
|
||||
"--evaluate_during_training", action="store_true", help="Rul evaluation during training at each logging step."
|
||||
)
|
||||
|
||||
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
|
||||
parser.add_argument(
|
||||
"--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
|
||||
)
|
||||
parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
|
||||
parser.add_argument(
|
||||
"--gradient_accumulation_steps",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of updates steps to accumulate before performing a backward/update pass.",
|
||||
)
|
||||
parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
|
||||
parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
|
||||
parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
|
||||
parser.add_argument(
|
||||
"--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max_steps",
|
||||
default=-1,
|
||||
type=int,
|
||||
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
|
||||
)
|
||||
parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
|
||||
|
||||
parser.add_argument("--logging_steps", type=int, default=50, help="Log every X updates steps.")
|
||||
parser.add_argument("--save_steps", type=int, default=50, help="Save checkpoint every X updates steps.")
|
||||
parser.add_argument(
|
||||
"--eval_all_checkpoints",
|
||||
action="store_true",
|
||||
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
|
||||
)
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
|
||||
parser.add_argument(
|
||||
"--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
|
||||
)
|
||||
parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
|
||||
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
|
||||
parser.add_argument(
|
||||
"--fp16",
|
||||
action="store_true",
|
||||
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--fp16_opt_level",
|
||||
type=str,
|
||||
default="O1",
|
||||
help=(
|
||||
"For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
|
||||
"See details at https://nvidia.github.io/apex/amp.html"
|
||||
),
|
||||
)
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Setup distant debugging if needed
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
# Setup CUDA, GPU & distributed training
|
||||
if args.local_rank == -1 or args.no_cuda:
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
|
||||
else: # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
|
||||
torch.cuda.set_device(args.local_rank)
|
||||
device = torch.device("cuda", args.local_rank)
|
||||
torch.distributed.init_process_group(backend="nccl")
|
||||
args.n_gpu = 1
|
||||
args.device = device
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
args.local_rank,
|
||||
device,
|
||||
args.n_gpu,
|
||||
bool(args.local_rank != -1),
|
||||
args.fp16,
|
||||
)
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(args.local_rank):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
|
||||
# Set seed
|
||||
set_seed(args)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
if args.local_rank not in [-1, 0]:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
config = AutoConfig.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
|
||||
)
|
||||
model = AutoModelForMultipleChoice.from_pretrained(
|
||||
args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), config=config
|
||||
)
|
||||
|
||||
if args.local_rank == 0:
|
||||
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
|
||||
|
||||
model.to(args.device)
|
||||
|
||||
logger.info("Training/evaluation parameters %s", args)
|
||||
|
||||
# Training
|
||||
if args.do_train:
|
||||
train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
|
||||
global_step, tr_loss = train(args, train_dataset, model, tokenizer)
|
||||
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
|
||||
|
||||
# Save the trained model and the tokenizer
|
||||
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
|
||||
logger.info("Saving model checkpoint to %s", args.output_dir)
|
||||
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
|
||||
# They can then be reloaded using `from_pretrained()`
|
||||
model_to_save = (
|
||||
model.module if hasattr(model, "module") else model
|
||||
) # Take care of distributed/parallel training
|
||||
model_to_save.save_pretrained(args.output_dir)
|
||||
tokenizer.save_pretrained(args.output_dir)
|
||||
|
||||
# Good practice: save your training arguments together with the trained model
|
||||
torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
|
||||
|
||||
# Load a trained model and vocabulary that you have fine-tuned
|
||||
model = AutoModelForMultipleChoice.from_pretrained(args.output_dir)
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
|
||||
results = {}
|
||||
if args.do_eval and args.local_rank in [-1, 0]:
|
||||
if args.do_train:
|
||||
checkpoints = [args.output_dir]
|
||||
else:
|
||||
# if do_train is False and do_eval is true, load model directly from pretrained.
|
||||
checkpoints = [args.model_name_or_path]
|
||||
|
||||
if args.eval_all_checkpoints:
|
||||
checkpoints = [
|
||||
os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
|
||||
]
|
||||
|
||||
logger.info("Evaluate the following checkpoints: %s", checkpoints)
|
||||
|
||||
for checkpoint in checkpoints:
|
||||
# Reload the model
|
||||
global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
|
||||
model = AutoModelForMultipleChoice.from_pretrained(checkpoint)
|
||||
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
model.to(args.device)
|
||||
|
||||
# Evaluate
|
||||
result = evaluate(args, model, tokenizer, prefix=global_step)
|
||||
|
||||
result = {k + (f"_{global_step}" if global_step else ""): v for k, v in result.items()}
|
||||
results.update(result)
|
||||
|
||||
logger.info(f"Results: {results}")
|
||||
|
||||
return results
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,143 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""PyTorch Transformer XL model evaluation script.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
|
||||
This script with default values evaluates a pretrained Transformer-XL on WikiText 103
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import math
|
||||
import time
|
||||
|
||||
import torch
|
||||
|
||||
from transformers import TransfoXLCorpus, TransfoXLLMHeadModel
|
||||
|
||||
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description="PyTorch Transformer Language Model")
|
||||
parser.add_argument("--model_name", type=str, default="transfo-xl/transfo-xl-wt103", help="pretrained model name")
|
||||
parser.add_argument(
|
||||
"--split", type=str, default="test", choices=["all", "valid", "test"], help="which split to evaluate"
|
||||
)
|
||||
parser.add_argument("--batch_size", type=int, default=10, help="batch size")
|
||||
parser.add_argument("--tgt_len", type=int, default=128, help="number of tokens to predict")
|
||||
parser.add_argument("--ext_len", type=int, default=0, help="length of the extended context")
|
||||
parser.add_argument("--mem_len", type=int, default=1600, help="length of the retained previous heads")
|
||||
parser.add_argument("--clamp_len", type=int, default=1000, help="max positional embedding index")
|
||||
parser.add_argument("--no_cuda", action="store_true", help="Do not use CUDA even though CUA is available")
|
||||
parser.add_argument("--work_dir", type=str, required=True, help="path to the work_dir")
|
||||
parser.add_argument("--no_log", action="store_true", help="do not log the eval result")
|
||||
parser.add_argument("--same_length", action="store_true", help="set same length attention with masking")
|
||||
parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
|
||||
parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
|
||||
args = parser.parse_args()
|
||||
assert args.ext_len >= 0, "extended context length must be non-negative"
|
||||
|
||||
if args.server_ip and args.server_port:
|
||||
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
|
||||
import ptvsd
|
||||
|
||||
print("Waiting for debugger attach")
|
||||
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
|
||||
ptvsd.wait_for_attach()
|
||||
|
||||
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
|
||||
logger.info(f"device: {device}")
|
||||
|
||||
# Load a pre-processed dataset
|
||||
# You can also build the corpus yourself using TransfoXLCorpus methods
|
||||
# The pre-processing involves computing word frequencies to prepare the Adaptive input and SoftMax
|
||||
# and tokenizing the dataset
|
||||
# The pre-processed corpus is a conversion of the original Transformer-XL corpus (using the conversion script)
|
||||
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
|
||||
|
||||
va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
te_iter = corpus.get_iterator("test", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
|
||||
|
||||
# Load a pre-trained model
|
||||
model = TransfoXLLMHeadModel.from_pretrained(args.model_name)
|
||||
model.to(device)
|
||||
|
||||
logger.info(
|
||||
"Evaluating with bsz {} tgt_len {} ext_len {} mem_len {} clamp_len {}".format(
|
||||
args.batch_size, args.tgt_len, args.ext_len, args.mem_len, args.clamp_len
|
||||
)
|
||||
)
|
||||
|
||||
model.reset_memory_length(args.mem_len)
|
||||
if args.clamp_len > 0:
|
||||
model.clamp_len = args.clamp_len
|
||||
if args.same_length:
|
||||
model.same_length = True
|
||||
|
||||
###############################################################################
|
||||
# Evaluation code
|
||||
###############################################################################
|
||||
def evaluate(eval_iter):
|
||||
# Turn on evaluation mode which disables dropout.
|
||||
model.eval()
|
||||
total_len, total_loss = 0, 0.0
|
||||
start_time = time.time()
|
||||
with torch.no_grad():
|
||||
mems = None
|
||||
for idx, (data, target, seq_len) in enumerate(eval_iter):
|
||||
ret = model(data, lm_labels=target, mems=mems)
|
||||
loss, _, mems = ret
|
||||
loss = loss.mean()
|
||||
total_loss += seq_len * loss.item()
|
||||
total_len += seq_len
|
||||
total_time = time.time() - start_time
|
||||
logger.info(f"Time : {total_time:.2f}s, {1000 * total_time / (idx + 1):.2f}ms/segment")
|
||||
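# total_loss / total_len is the average negative log-likelihood per token;
# the perplexity logged below is the exponential of this value.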
return total_loss / total_len
|
||||
|
||||
# Run on test data.
|
||||
if args.split == "all":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = evaluate(va_iter)
|
||||
elif args.split == "valid":
|
||||
valid_loss = evaluate(va_iter)
|
||||
test_loss = None
|
||||
elif args.split == "test":
|
||||
test_loss = evaluate(te_iter)
|
||||
valid_loss = None
|
||||
|
||||
def format_log(loss, split):
|
||||
log_str = "| {0} loss {1:5.2f} | {0} ppl {2:9.3f} ".format(split, loss, math.exp(loss))
|
||||
return log_str
|
||||
|
||||
log_str = ""
|
||||
if valid_loss is not None:
|
||||
log_str += format_log(valid_loss, "valid")
|
||||
if test_loss is not None:
|
||||
log_str += format_log(test_loss, "test")
|
||||
|
||||
logger.info("=" * 100)
|
||||
logger.info(log_str)
|
||||
logger.info("=" * 100)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,327 +0,0 @@
|
||||
<!---
|
||||
Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
-->
|
||||
|
||||
# Sequence-to-Sequence Training and Evaluation
|
||||
|
||||
This directory contains examples for finetuning and evaluating transformers on summarization and translation tasks.
|
||||
For deprecated `bertabs` instructions, see https://github.com/huggingface/transformers-research-projects/blob/main/bertabs/README.md.
|
||||
|
||||
### Supported Architectures
|
||||
|
||||
- `BartForConditionalGeneration`
|
||||
- `MarianMTModel`
|
||||
- `PegasusForConditionalGeneration`
|
||||
- `MBartForConditionalGeneration`
|
||||
- `FSMTForConditionalGeneration`
|
||||
- `T5ForConditionalGeneration`
|
||||
|
||||
### Download the Datasets
|
||||
|
||||
#### XSUM
|
||||
|
||||
```bash
|
||||
cd examples/legacy/seq2seq
|
||||
wget https://cdn-datasets.huggingface.co/summarization/xsum.tar.gz
|
||||
tar -xzvf xsum.tar.gz
|
||||
export XSUM_DIR=${PWD}/xsum
|
||||
```
|
||||
This should make a directory called `xsum/` with files like `test.source`.
|
||||
To use your own data, copy that file's format. Each article to be summarized is on its own line.
|
||||
|
||||
#### CNN/DailyMail
|
||||
|
||||
```bash
|
||||
cd examples/legacy/seq2seq
|
||||
wget https://cdn-datasets.huggingface.co/summarization/cnn_dm_v2.tgz
|
||||
tar -xzvf cnn_dm_v2.tgz # empty lines removed
|
||||
mv cnn_cln cnn_dm
|
||||
export CNN_DIR=${PWD}/cnn_dm
|
||||
```
|
||||
This should make a directory called `cnn_dm/` with 6 files.
|
||||
|
||||
#### WMT16 English-Romanian Translation Data
|
||||
|
||||
Download with this command:
|
||||
```bash
|
||||
wget https://cdn-datasets.huggingface.co/translation/wmt_en_ro.tar.gz
|
||||
tar -xzvf wmt_en_ro.tar.gz
|
||||
export ENRO_DIR=${PWD}/wmt_en_ro
|
||||
```
|
||||
This should make a directory called `wmt_en_ro/` with 6 files.
|
||||
|
||||
#### WMT English-German
|
||||
|
||||
```bash
|
||||
wget https://cdn-datasets.huggingface.co/translation/wmt_en_de.tgz
|
||||
tar -xzvf wmt_en_de.tgz
|
||||
export DATA_DIR=${PWD}/wmt_en_de
|
||||
```
|
||||
|
||||
#### FSMT datasets (wmt)
|
||||
|
||||
Refer to the scripts starting with `eval_` under:
|
||||
https://github.com/huggingface/transformers/tree/main/scripts/fsmt
|
||||
|
||||
#### Pegasus (multiple datasets)
|
||||
|
||||
Multiple eval datasets are available for download from:
|
||||
https://github.com/stas00/porting/tree/master/datasets/pegasus
|
||||
|
||||
|
||||
#### Your Data
|
||||
|
||||
If you are using your own data, it must be formatted as one directory with 6 files:
|
||||
```
|
||||
train.source
|
||||
train.target
|
||||
val.source
|
||||
val.target
|
||||
test.source
|
||||
test.target
|
||||
```
|
||||
The `.source` files are the input, the `.target` files are the desired output.
|
||||
|
||||
### Tips and Tricks
|
||||
|
||||
General Tips:
|
||||
- Since you need to run from `examples/legacy/seq2seq` and will likely need to modify code, the easiest workflow is to fork transformers, clone your fork, and run `pip install -e .` before you get started.
|
||||
- Try `--freeze_encoder` or `--freeze_embeds` for faster training/larger batch size. (3hr per epoch with bs=8, see the "xsum_shared_task" command below)
|
||||
|
||||
- In addition to the pytorch-lightning .ckpt checkpoint, a transformers checkpoint will be saved.
|
||||
Load it with `BartForConditionalGeneration.from_pretrained(f'{output_dir}/best_tfmr')`.
|
||||
- At the moment, `--do_predict` does not work in a multi-gpu setting. You need to use `evaluate_checkpoint` or the `run_eval.py` code.
|
||||
- This warning can be safely ignored:
|
||||
> "Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-xsum and are newly initialized: ['final_logits_bias']"
|
||||
- Both finetuning and eval are 30% faster with `--fp16`.
|
||||
- Read scripts before you run them!
|
||||
|
||||
Summarization Tips:
|
||||
- (summ) 1 epoch at batch size 1 for bart-large takes 24 hours and requires 13GB GPU RAM with fp16 on an NVIDIA-V100.
|
||||
- If you want to run experiments on improving the summarization finetuning process, try the XSUM Shared Task (below). It's faster to train than CNNDM because the summaries are shorter.
|
||||
- For CNN/DailyMail, the default `val_max_target_length` and `test_max_target_length` will truncate the ground truth labels, resulting in slightly higher ROUGE scores. To get accurate ROUGE scores, you should rerun `calculate_rouge` on the `{output_dir}/test_generations.txt` file saved by `trainer.test()`.
|
||||
- `--max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 ` is a reasonable setting for XSUM.
|
||||
- `wandb` can be used by specifying `--logger_name wandb`. It is useful for reproducibility. Specify the environment variable `WANDB_PROJECT='hf_xsum'` to do the XSUM shared task.
|
||||
- If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
|
||||
(It rarely makes sense to start from `bart-large` unless you are researching finetuning methods.)
|
||||
|
||||
**Update 2018-07-18**
|
||||
Datasets: `LegacySeq2SeqDataset` will be used for all tokenizers without a `prepare_seq2seq_batch` method. Otherwise, `Seq2SeqDataset` will be used.
|
||||
Future work/help wanted: A new dataset to support multilingual tasks.
|
||||
|
||||
|
||||
### Fine-tuning using Seq2SeqTrainer
|
||||
To use `Seq2SeqTrainer` for fine-tuning you should use the `finetune_trainer.py` script. It subclasses `Trainer` to extend it for seq2seq training. Apart from the `Trainer`-related `TrainingArguments`, it shares the same argument names as the `finetune.py` file. One notable difference is that calculating generative metrics (BLEU, ROUGE) is optional and is controlled using the `--predict_with_generate` argument.
|
||||
|
||||
To see all the possible command line options, run:
|
||||
|
||||
```bash
|
||||
python finetune_trainer.py --help
|
||||
```
|
||||
|
||||
For multi-GPU training use `torchrun`, e.g. with 2 GPUs:
|
||||
```bash
|
||||
torchrun --nproc_per_node=2 finetune_trainer.py ...
|
||||
```
|
||||
|
||||
**At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
|
||||
|
||||
All `Seq2SeqTrainer`-based fine-tuning scripts are included in the `builtin_trainer` directory.
|
||||
|
||||
#### TPU Training
|
||||
`Seq2SeqTrainer` supports TPU training, with a few caveats:
|
||||
1. As the `generate` method does not work on TPU at the moment, `predict_with_generate` cannot be used. You should use `--prediction_loss_only` to only calculate the loss, and not set `--do_predict` and `--predict_with_generate`.
|
||||
2. All sequences should be padded to be of equal length to avoid extremely slow training. (`finetune_trainer.py` does this automatically when running on TPU.)
|
||||
|
||||
We provide a very simple launcher script named `xla_spawn.py` that lets you run our example scripts on multiple TPU cores without any boilerplate. Just pass a `--num_cores` flag to this script, then your regular training script with its arguments (this is similar to the `torch.distributed.launch` helper for `torch.distributed`).
|
||||
|
||||
The `builtin_trainer/finetune_tpu.sh` script provides the minimal arguments needed for TPU training.
|
||||
|
||||
The following command fine-tunes `sshleifer/student_marian_en_ro_6_3` on a TPU v3-8 and should complete one epoch in ~5-6 minutes.
|
||||
|
||||
```bash
|
||||
./builtin_trainer/train_distil_marian_enro_tpu.sh
|
||||
```
|
||||
|
||||
## Evaluation Commands
|
||||
|
||||
To create summaries for each article in the dataset, we use `run_eval.py`; here are a few commands that run eval for different tasks and models.
|
||||
If 'translation' is in your task name, the computed metric will be BLEU. Otherwise, ROUGE will be used.
|
||||
|
||||
For T5, you need to specify `--task translation_{src}_to_{tgt}` as follows:
|
||||
```bash
|
||||
export DATA_DIR=wmt_en_ro
|
||||
./run_eval.py google-t5/t5-base \
|
||||
$DATA_DIR/val.source t5_val_generations.txt \
|
||||
--reference_path $DATA_DIR/val.target \
|
||||
--score_path enro_bleu.json \
|
||||
--task translation_en_to_ro \
|
||||
--n_obs 100 \
|
||||
--device cuda \
|
||||
--fp16 \
|
||||
--bs 32
|
||||
```
|
||||
|
||||
This command works for MBART, although the BLEU score is suspiciously low.
|
||||
```bash
|
||||
export DATA_DIR=wmt_en_ro
|
||||
./run_eval.py facebook/mbart-large-en-ro $DATA_DIR/val.source mbart_val_generations.txt \
|
||||
--reference_path $DATA_DIR/val.target \
|
||||
--score_path enro_bleu.json \
|
||||
--task translation \
|
||||
--n_obs 100 \
|
||||
--device cuda \
|
||||
--fp16 \
|
||||
--bs 32
|
||||
```
|
||||
|
||||
Summarization (xsum will be very similar):
|
||||
```bash
|
||||
export DATA_DIR=cnn_dm
|
||||
./run_eval.py sshleifer/distilbart-cnn-12-6 $DATA_DIR/val.source dbart_val_generations.txt \
|
||||
--reference_path $DATA_DIR/val.target \
|
||||
--score_path cnn_rouge.json \
|
||||
--task summarization \
|
||||
--n_obs 100 \
|
||||
th 56 \
|
||||
--fp16 \
|
||||
--bs 32
|
||||
```
|
||||
|
||||
### Multi-GPU Evaluation
|
||||
Here is a command to run XSUM evaluation on 8 GPUs. It is more than linearly faster than `run_eval.py` in some cases
|
||||
because it uses SortishSampler to minimize padding. You can also use it on 1 GPU. `data_dir` must have
|
||||
`{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all command line arguments.
|
||||
|
||||
```bash
|
||||
torchrun --nproc_per_node=8 run_distributed_eval.py \
|
||||
--model_name sshleifer/distilbart-large-xsum-12-3 \
|
||||
--save_dir xsum_generations \
|
||||
--data_dir xsum \
|
||||
--fp16 # you can pass generate kwargs like num_beams here, just like run_eval.py
|
||||
```
|
||||
|
||||
Contributions that implement this command for other distributed hardware setups are welcome!
|
||||
|
||||
#### Single-GPU Eval: Tips and Tricks
|
||||
|
||||
When using `run_eval.py`, the following features can be useful:
|
||||
|
||||
* If you are running the script multiple times and want to make it easier to track which arguments produced the output, use `--dump-args`. Along with the results, it will also dump any custom params that were passed to the script. For example, if you used `--num_beams 8 --early_stopping true`, the output will be:
|
||||
```json
|
||||
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True}
|
||||
```
|
||||
|
||||
`--info` is an additional argument available for the same purpose of tracking the conditions of the experiment. It's useful for passing things that weren't in the argument list, e.g. a language pair `--info "lang:en-ru"`. If you pass `--info` without a value, it will fall back to the current date/time string, e.g. `2020-09-13 18:44:43`.
|
||||
|
||||
If using `--dump-args --info`, the output will be:
|
||||
|
||||
```json
|
||||
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': '2020-09-13 18:44:43'}
|
||||
```
|
||||
|
||||
If using `--dump-args --info "pair:en-ru chkpt=best"`, the output will be:
|
||||
|
||||
```json
|
||||
{'bleu': 26.887, 'n_obs': 10, 'runtime': 1, 'seconds_per_sample': 0.1, 'num_beams': 8, 'early_stopping': True, 'info': 'pair=en-ru chkpt=best'}
|
||||
```
|
||||
|
||||
|
||||
* If you need to perform a parametric search to find the hyperparameter values that lead to the highest BLEU score, let `run_eval_search.py` do the searching for you.
|
||||
|
||||
The script accepts the exact same arguments as `run_eval.py`, plus an additional argument `--search`. The value of `--search` is parsed, reformatted and fed to `run_eval.py` as additional args.
|
||||
|
||||
The format for the `--search` value is a simple string of hparams with colon-separated values to try, e.g.:
|
||||
```
|
||||
--search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
|
||||
```
|
||||
which will generate 12 (2*3*2) runs, one for each combination in the product of the hparam values. For example, the search above will invoke `run_eval.py` repeatedly with:
|
||||
|
||||
```
|
||||
--num_beams 5 --length_penalty 0.8 --early_stopping true
|
||||
--num_beams 5 --length_penalty 0.8 --early_stopping false
|
||||
[...]
|
||||
--num_beams 10 --length_penalty 1.2 --early_stopping false
|
||||
```
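To make the expansion concrete, the search string is simply turned into the Cartesian product of the listed values. Below is a minimal illustrative sketch of that expansion (it is not the actual `run_eval_search.py` implementation):

```python
# Illustrative only: expand a --search string into individual argument sets.
from itertools import product

search = "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
grid = {key: values.split(":") for key, values in (part.split("=") for part in search.split())}

for combo in product(*grid.values()):
    print(" ".join(f"--{name} {value}" for name, value in zip(grid, combo)))
# -> 12 lines, e.g. "--num_beams 5 --length_penalty 0.8 --early_stopping true"
```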
|
||||
|
||||
On completion, the script prints a markdown table of the results sorted by BLEU score, along with the winning arguments.
|
||||
|
||||
```
|
||||
bleu | num_beams | length_penalty | early_stopping
|
||||
----- | --------- | -------------- | --------------
|
||||
26.71 | 5 | 1.1 | 1
|
||||
26.66 | 5 | 0.9 | 1
|
||||
26.66 | 5 | 0.9 | 0
|
||||
26.41 | 5 | 1.1 | 0
|
||||
21.94 | 1 | 0.9 | 1
|
||||
21.94 | 1 | 0.9 | 0
|
||||
21.94 | 1 | 1.1 | 1
|
||||
21.94 | 1 | 1.1 | 0
|
||||
|
||||
Best score args:
|
||||
stas/wmt19-en-ru data/en-ru/val.source data/en-ru/test_translations.txt --reference_path data/en-ru/val.target --score_path data/en-ru/test_bleu.json --bs 8 --task translation --num_beams 5 --length_penalty 1.1 --early_stopping True
|
||||
```
|
||||
|
||||
If you pass `--info "some experiment-specific info"`, it will get printed before the results table. This is useful for scripting and multiple runs, so one can tell the different sets of results from each other.
|
||||
|
||||
|
||||
### Contributing
|
||||
- Follow the standard contributing guidelines and code of conduct.
|
||||
- Add tests to `test_seq2seq_examples.py`.
|
||||
- To run only the seq2seq tests, you must be in the root of the repository and run:
|
||||
```bash
|
||||
pytest examples/seq2seq/
|
||||
```
|
||||
|
||||
### Converting pytorch-lightning checkpoints
|
||||
PyTorch Lightning's `--do_predict` often fails; after you are done training, the best way to evaluate your model is to convert the checkpoint.
|
||||
|
||||
This should be done for you, with a file called `{save_dir}/best_tfmr`.
|
||||
|
||||
If that file doesn't exist but you have a lightning `.ckpt` file, you can run
|
||||
```bash
|
||||
python convert_pl_checkpoint_to_hf.py PATH_TO_CKPT randomly_initialized_hf_model_path save_dir/best_tfmr
|
||||
```
|
||||
Then run either `run_eval.py` or `run_distributed_eval.py` with `save_dir/best_tfmr` (see the previous sections).
|
||||
|
||||
|
||||
# Experimental Features
|
||||
These features are harder to use and not always useful.
|
||||
|
||||
### Dynamic Batch Size for MT
|
||||
`finetune.py` has a command line arg `--max_tokens_per_batch` that allows batches to be dynamically sized.
|
||||
This feature can only be used:
|
||||
- with fairseq installed
|
||||
- on 1 GPU
|
||||
- without sortish sampler
|
||||
- after calling `./save_len_file.py $tok $data_dir`
|
||||
|
||||
For example,
|
||||
```bash
|
||||
./save_len_file.py Helsinki-NLP/opus-mt-en-ro wmt_en_ro
|
||||
./dynamic_bs_example.sh --max_tokens_per_batch=2000 --output_dir benchmark_dynamic_bs
|
||||
```
|
||||
splits `wmt_en_ro/train` into 11,197 uneven-length batches and can finish 1 epoch in 8 minutes on a V100.
|
||||
|
||||
For comparison,
|
||||
```bash
|
||||
./dynamic_bs_example.sh --sortish_sampler --train_batch_size 48
|
||||
```
|
||||
uses 12,723 batches of length 48 and takes slightly more time (9.5 minutes).
|
||||
|
||||
The feature is still experimental, because:
|
||||
+ We can make it much more robust if we have memory-mapped/preprocessed datasets.
|
||||
+ The speedup over sortish sampler is not that large at the moment.
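For intuition about `--max_tokens_per_batch`, here is a minimal sketch of token-budget batching: sort examples by length and keep adding to a batch while `max_len_in_batch * batch_size` stays under the budget. This is illustrative only and is not the actual `finetune.py`/fairseq implementation, which also relies on the precomputed length file.

```python
# Illustrative sketch of --max_tokens_per_batch style batching (not the real implementation).
def dynamic_batches(lengths, max_tokens_per_batch):
    """Yield lists of example indices whose padded size (max length * count) fits the budget."""
    order = sorted(range(len(lengths)), key=lambda i: lengths[i], reverse=True)  # similar lengths together
    batch, batch_max = [], 0
    for idx in order:
        new_max = max(batch_max, lengths[idx])
        if batch and new_max * (len(batch) + 1) > max_tokens_per_batch:
            yield batch
            batch, batch_max = [idx], lengths[idx]
        else:
            batch.append(idx)
            batch_max = new_max
    if batch:
        yield batch


print(list(dynamic_batches([512, 480, 130, 128, 60], max_tokens_per_batch=2000)))
# -> [[0, 1, 2], [3, 4]]
```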
|
||||
@ -1,5 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
sys.path.insert(1, os.path.dirname(os.path.realpath(__file__)))
|
||||
@ -1,36 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Union
|
||||
|
||||
import fire
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def convert(src_path: str, map_location: str = "cpu", save_path: Union[str, None] = None) -> None:
|
||||
"""Convert a pytorch_model.bin or model.pt file to torch.float16 for faster downloads, less disk space."""
|
||||
state_dict = torch.load(src_path, map_location=map_location, weights_only=True)
|
||||
for k, v in tqdm(state_dict.items()):
|
||||
if not isinstance(v, torch.Tensor):
|
||||
raise TypeError("FP16 conversion only works on paths that are saved state dicts, like pytorch_model.bin")
|
||||
state_dict[k] = v.half()
|
||||
if save_path is None: # overwrite src_path
|
||||
save_path = src_path
|
||||
torch.save(state_dict, save_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(convert)
|
||||
@ -1,67 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import fire
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def download_wmt_dataset(src_lang="ro", tgt_lang="en", dataset="wmt16", save_dir=None) -> None:
|
||||
"""Download a dataset using the datasets package and save it to the format expected by finetune.py
|
||||
Format of save_dir: train.source, train.target, val.source, val.target, test.source, test.target.
|
||||
|
||||
Args:
|
||||
src_lang: <str> source language
|
||||
tgt_lang: <str> target language
|
||||
dataset: <str> wmt16, wmt17, etc. wmt16 is a good start as it's small. To get the full list run `import datasets; print([d.id for d in datasets.list_datasets() if "wmt" in d.id])`
|
||||
save_dir: <str>, where to save the datasets, defaults to f'{dataset}-{src_lang}-{tgt_lang}'
|
||||
|
||||
Usage:
|
||||
>>> download_wmt_dataset('ro', 'en', dataset='wmt16') # saves to wmt16-ro-en
|
||||
"""
|
||||
try:
|
||||
import datasets
|
||||
except (ModuleNotFoundError, ImportError):
|
||||
raise ImportError("run pip install datasets")
|
||||
pair = f"{src_lang}-{tgt_lang}"
|
||||
print(f"Converting {dataset}-{pair}")
|
||||
ds = datasets.load_dataset(dataset, pair)
|
||||
if save_dir is None:
|
||||
save_dir = f"{dataset}-{pair}"
|
||||
save_dir = Path(save_dir)
|
||||
save_dir.mkdir(exist_ok=True)
|
||||
|
||||
for split in ds:
|
||||
print(f"Splitting {split} with {ds[split].num_rows} records")
|
||||
|
||||
# to save to val.source, val.target like summary datasets
|
||||
fn = "val" if split == "validation" else split
|
||||
src_path = save_dir.joinpath(f"{fn}.source")
|
||||
tgt_path = save_dir.joinpath(f"{fn}.target")
|
||||
src_fp = src_path.open("w+")
|
||||
tgt_fp = tgt_path.open("w+")
|
||||
|
||||
# reader is the bottleneck so writing one record at a time doesn't slow things down
|
||||
for x in tqdm(ds[split]):
|
||||
ex = x["translation"]
|
||||
src_fp.write(ex[src_lang] + "\n")
|
||||
tgt_fp.write(ex[tgt_lang] + "\n")
|
||||
|
||||
print(f"Saved {dataset} dataset to {save_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(download_wmt_dataset)
|
||||
@ -1,26 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
export TPU_NUM_CORES=8
|
||||
|
||||
# the proper usage is documented in the README, you need to specify data_dir, output_dir and model_name_or_path
|
||||
# run ./finetune_tpu.sh --help to see all the possible options
|
||||
# To specify the number of cores to use, use the TPU_NUM_CORES environment variable
|
||||
python xla_spawn.py finetune_trainer.py \
|
||||
--learning_rate=3e-5 \
|
||||
--do_train --do_eval \
|
||||
--eval_strategy steps \
|
||||
--prediction_loss_only \
|
||||
--n_val 1000 \
|
||||
"$@"
|
||||
@ -1,370 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from seq2seq_trainer import Seq2SeqTrainer
|
||||
from seq2seq_training_args import Seq2SeqTrainingArguments
|
||||
|
||||
import transformers
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoModelForSeq2SeqLM,
|
||||
AutoTokenizer,
|
||||
HfArgumentParser,
|
||||
MBartTokenizer,
|
||||
MBartTokenizerFast,
|
||||
set_seed,
|
||||
)
|
||||
from transformers.trainer_utils import is_main_process
|
||||
from transformers.training_args import ParallelMode
|
||||
from utils import (
|
||||
Seq2SeqDataCollator,
|
||||
Seq2SeqDataset,
|
||||
assert_all_frozen,
|
||||
build_compute_metrics_fn,
|
||||
freeze_embeds,
|
||||
freeze_params,
|
||||
lmap,
|
||||
save_json,
|
||||
use_task_specific_params,
|
||||
write_txt_file,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class ModelArguments:
|
||||
"""
|
||||
Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
|
||||
"""
|
||||
|
||||
model_name_or_path: str = field(
|
||||
metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
|
||||
)
|
||||
config_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
|
||||
)
|
||||
tokenizer_name: Optional[str] = field(
|
||||
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
|
||||
)
|
||||
cache_dir: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
|
||||
)
|
||||
freeze_encoder: bool = field(default=False, metadata={"help": "Whether to freeze the encoder."})
|
||||
freeze_embeds: bool = field(default=False, metadata={"help": "Whether to freeze the embeddings."})
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataTrainingArguments:
|
||||
"""
|
||||
Arguments pertaining to what data we are going to input our model for training and eval.
|
||||
"""
|
||||
|
||||
data_dir: str = field(
|
||||
metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
|
||||
)
|
||||
task: Optional[str] = field(
|
||||
default="summarization",
|
||||
metadata={"help": "Task name, summarization (or summarization_{dataset} for pegasus) or translation"},
|
||||
)
|
||||
max_source_length: Optional[int] = field(
|
||||
default=1024,
|
||||
metadata={
|
||||
"help": (
|
||||
"The maximum total input sequence length after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
)
|
||||
},
|
||||
)
|
||||
max_target_length: Optional[int] = field(
|
||||
default=128,
|
||||
metadata={
|
||||
"help": (
|
||||
"The maximum total sequence length for target text after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
)
|
||||
},
|
||||
)
|
||||
val_max_target_length: Optional[int] = field(
|
||||
default=142,
|
||||
metadata={
|
||||
"help": (
|
||||
"The maximum total sequence length for validation target text after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded. "
|
||||
"This argument is also used to override the ``max_length`` param of ``model.generate``, which is used "
|
||||
"during ``evaluate`` and ``predict``."
|
||||
)
|
||||
},
|
||||
)
|
||||
test_max_target_length: Optional[int] = field(
|
||||
default=142,
|
||||
metadata={
|
||||
"help": (
|
||||
"The maximum total sequence length for test target text after tokenization. Sequences longer "
|
||||
"than this will be truncated, sequences shorter will be padded."
|
||||
)
|
||||
},
|
||||
)
|
||||
n_train: Optional[int] = field(default=-1, metadata={"help": "# training examples. -1 means use all."})
|
||||
n_val: Optional[int] = field(default=-1, metadata={"help": "# validation examples. -1 means use all."})
|
||||
n_test: Optional[int] = field(default=-1, metadata={"help": "# test examples. -1 means use all."})
|
||||
src_lang: Optional[str] = field(default=None, metadata={"help": "Source language id for translation."})
|
||||
tgt_lang: Optional[str] = field(default=None, metadata={"help": "Target language id for translation."})
|
||||
eval_beams: Optional[int] = field(default=None, metadata={"help": "# num_beams to use for evaluation."})
|
||||
ignore_pad_token_for_loss: bool = field(
|
||||
default=True,
|
||||
metadata={"help": "If only pad tokens should be ignored. This assumes that `config.pad_token_id` is defined."},
|
||||
)
|
||||
|
||||
|
||||
def handle_metrics(split, metrics, output_dir):
|
||||
"""
|
||||
Log and save metrics
|
||||
|
||||
Args:
|
||||
- split: one of train, val, test
|
||||
- metrics: metrics dict
|
||||
- output_dir: where to save the metrics
|
||||
"""
|
||||
|
||||
logger.info(f"***** {split} metrics *****")
|
||||
for key in sorted(metrics.keys()):
|
||||
logger.info(f" {key} = {metrics[key]}")
|
||||
save_json(metrics, os.path.join(output_dir, f"{split}_results.json"))
|
||||
|
||||
|
||||
def main():
|
||||
# See all possible arguments in src/transformers/training_args.py
|
||||
# or by passing the --help flag to this script.
|
||||
# We now keep distinct sets of args, for a cleaner separation of concerns.
|
||||
|
||||
parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
|
||||
|
||||
if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
|
||||
# If we pass only one argument to the script and it's the path to a json file,
|
||||
# let's parse it to get our arguments.
|
||||
model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
|
||||
else:
|
||||
model_args, data_args, training_args = parser.parse_args_into_dataclasses()
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
|
||||
)
|
||||
logger.warning(
|
||||
"Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
|
||||
training_args.local_process_index,
|
||||
training_args.device,
|
||||
training_args.n_gpu,
|
||||
bool(training_args.parallel_mode == ParallelMode.DISTRIBUTED),
|
||||
training_args.fp16,
|
||||
)
|
||||
transformers.utils.logging.enable_default_handler()
|
||||
transformers.utils.logging.enable_explicit_format()
|
||||
# Set the verbosity to info of the Transformers logger (on main process only):
|
||||
if is_main_process(training_args.local_process_index):
|
||||
transformers.utils.logging.set_verbosity_info()
|
||||
logger.info("Training/evaluation parameters %s", training_args)
|
||||
|
||||
# Set seed
|
||||
set_seed(training_args.seed)
|
||||
|
||||
# Load pretrained model and tokenizer
|
||||
#
|
||||
# Distributed training:
|
||||
# The .from_pretrained methods guarantee that only one local process can concurrently
|
||||
# download model & vocab.
|
||||
|
||||
config = AutoConfig.from_pretrained(
|
||||
model_args.config_name if model_args.config_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
|
||||
for p in extra_model_params:
|
||||
if getattr(training_args, p, None):
|
||||
assert hasattr(config, p), f"({config.__class__.__name__}) doesn't have a `{p}` attribute"
|
||||
setattr(config, p, getattr(training_args, p))
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(
|
||||
model_args.model_name_or_path,
|
||||
from_tf=".ckpt" in model_args.model_name_or_path,
|
||||
config=config,
|
||||
cache_dir=model_args.cache_dir,
|
||||
)
|
||||
|
||||
# use task specific params
|
||||
use_task_specific_params(model, data_args.task)
|
||||
|
||||
# set num_beams for evaluation
|
||||
if data_args.eval_beams is None:
|
||||
data_args.eval_beams = model.config.num_beams
|
||||
|
||||
# set decoder_start_token_id for MBart
|
||||
if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
|
||||
assert data_args.tgt_lang is not None and data_args.src_lang is not None, (
|
||||
"mBart requires --tgt_lang and --src_lang"
|
||||
)
|
||||
if isinstance(tokenizer, MBartTokenizer):
|
||||
model.config.decoder_start_token_id = tokenizer.lang_code_to_id[data_args.tgt_lang]
|
||||
else:
|
||||
model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(data_args.tgt_lang)
|
||||
|
||||
if model_args.freeze_embeds:
|
||||
freeze_embeds(model)
|
||||
if model_args.freeze_encoder:
|
||||
freeze_params(model.get_encoder())
|
||||
assert_all_frozen(model.get_encoder())
|
||||
|
||||
dataset_class = Seq2SeqDataset
|
||||
|
||||
# Get datasets
|
||||
train_dataset = (
|
||||
dataset_class(
|
||||
tokenizer,
|
||||
type_path="train",
|
||||
data_dir=data_args.data_dir,
|
||||
n_obs=data_args.n_train,
|
||||
max_target_length=data_args.max_target_length,
|
||||
max_source_length=data_args.max_source_length,
|
||||
prefix=model.config.prefix or "",
|
||||
)
|
||||
if training_args.do_train
|
||||
else None
|
||||
)
|
||||
eval_dataset = (
|
||||
dataset_class(
|
||||
tokenizer,
|
||||
type_path="val",
|
||||
data_dir=data_args.data_dir,
|
||||
n_obs=data_args.n_val,
|
||||
max_target_length=data_args.val_max_target_length,
|
||||
max_source_length=data_args.max_source_length,
|
||||
prefix=model.config.prefix or "",
|
||||
)
|
||||
if training_args.do_eval
|
||||
else None
|
||||
)
|
||||
test_dataset = (
|
||||
dataset_class(
|
||||
tokenizer,
|
||||
type_path="test",
|
||||
data_dir=data_args.data_dir,
|
||||
n_obs=data_args.n_test,
|
||||
max_target_length=data_args.test_max_target_length,
|
||||
max_source_length=data_args.max_source_length,
|
||||
prefix=model.config.prefix or "",
|
||||
)
|
||||
if training_args.do_predict
|
||||
else None
|
||||
)
|
||||
|
||||
# Initialize our Trainer
|
||||
compute_metrics_fn = (
|
||||
build_compute_metrics_fn(data_args.task, tokenizer) if training_args.predict_with_generate else None
|
||||
)
|
||||
trainer = Seq2SeqTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
data_args=data_args,
|
||||
train_dataset=train_dataset,
|
||||
eval_dataset=eval_dataset,
|
||||
data_collator=Seq2SeqDataCollator(tokenizer, data_args, model.config.decoder_start_token_id),
|
||||
compute_metrics=compute_metrics_fn,
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
|
||||
all_metrics = {}
|
||||
# Training
|
||||
if training_args.do_train:
|
||||
logger.info("*** Train ***")
|
||||
|
||||
train_result = trainer.train(
|
||||
model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
|
||||
)
|
||||
metrics = train_result.metrics
|
||||
metrics["train_n_objs"] = data_args.n_train
|
||||
|
||||
trainer.save_model() # this also saves the tokenizer
|
||||
|
||||
if trainer.is_world_process_zero():
|
||||
handle_metrics("train", metrics, training_args.output_dir)
|
||||
all_metrics.update(metrics)
|
||||
|
||||
# Need to save the state, since Trainer.save_model saves only the tokenizer with the model
|
||||
trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json"))
|
||||
|
||||
# For convenience, we also re-save the tokenizer to the same directory,
|
||||
# so that you can share your model easily on huggingface.co/models =)
|
||||
tokenizer.save_pretrained(training_args.output_dir)
|
||||
|
||||
# Evaluation
|
||||
if training_args.do_eval:
|
||||
logger.info("*** Evaluate ***")
|
||||
|
||||
metrics = trainer.evaluate(metric_key_prefix="val")
|
||||
metrics["val_n_objs"] = data_args.n_val
|
||||
metrics["val_loss"] = round(metrics["val_loss"], 4)
|
||||
|
||||
if trainer.is_world_process_zero():
|
||||
handle_metrics("val", metrics, training_args.output_dir)
|
||||
all_metrics.update(metrics)
|
||||
|
||||
if training_args.do_predict:
|
||||
logger.info("*** Predict ***")
|
||||
|
||||
test_output = trainer.predict(test_dataset=test_dataset, metric_key_prefix="test")
|
||||
metrics = test_output.metrics
|
||||
metrics["test_n_objs"] = data_args.n_test
|
||||
|
||||
if trainer.is_world_process_zero():
|
||||
metrics["test_loss"] = round(metrics["test_loss"], 4)
|
||||
handle_metrics("test", metrics, training_args.output_dir)
|
||||
all_metrics.update(metrics)
|
||||
|
||||
if training_args.predict_with_generate:
|
||||
test_preds = tokenizer.batch_decode(
|
||||
test_output.predictions, skip_special_tokens=True, clean_up_tokenization_spaces=True
|
||||
)
|
||||
test_preds = lmap(str.strip, test_preds)
|
||||
write_txt_file(test_preds, os.path.join(training_args.output_dir, "test_generations.txt"))
|
||||
|
||||
if trainer.is_world_process_zero():
|
||||
save_json(all_metrics, os.path.join(training_args.output_dir, "all_results.json"))
|
||||
|
||||
return all_metrics
|
||||
|
||||
|
||||
def _mp_fn(index):
|
||||
# For xla_spawn (TPUs)
|
||||
main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
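# Hedged usage sketch: the script file name below is an assumption (this diff does not name the file);
# the flags mirror the ModelArguments / DataTrainingArguments / Seq2SeqTrainingArguments fields defined above,
# and the model id / data dir are placeholders, not a recommended configuration:
#   python finetune_trainer.py \
#       --model_name_or_path sshleifer/distilbart-xsum-12-3 \
#       --data_dir path/to/dataset --output_dir /tmp/seq2seq_out \
#       --do_train --do_eval --predict_with_generate --n_train 500 --eval_beams 2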
@@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import fire


def minify(src_dir: str, dest_dir: str, n: int):
    """Write first n lines of each file f in src_dir to dest_dir/f"""
    src_dir = Path(src_dir)
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(exist_ok=True)
    for path in src_dir.iterdir():
        new = [x.rstrip() for x in list(path.open().readlines())][:n]
        dest_path = dest_dir.joinpath(path.name)
        print(dest_path)
        dest_path.open("w").write("\n".join(new))


if __name__ == "__main__":
    fire.Fire(minify)
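# Usage sketch, assuming this file is saved as minify.py (the paths and line count are placeholders;
# python-fire maps the positional CLI arguments onto minify()):
#   python minify.py path/to/src_dir path/to/dest_dir 5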
@@ -1,109 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
from rouge_cli import calculate_rouge_path
|
||||
|
||||
from utils import calculate_rouge
|
||||
|
||||
|
||||
PRED = [
|
||||
'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of the'
|
||||
' final seconds on board Flight 9525. The Germanwings co-pilot says he had a "previous episode of severe'
|
||||
" depression\" German airline confirms it knew of Andreas Lubitz's depression years before he took control.",
|
||||
"The Palestinian Authority officially becomes the 123rd member of the International Criminal Court. The formal"
|
||||
" accession was marked with a ceremony at The Hague, in the Netherlands. The Palestinians signed the ICC's"
|
||||
" founding Rome Statute in January. Israel and the United States opposed the Palestinians' efforts to join the"
|
||||
" body.",
|
||||
"Amnesty International releases its annual report on the death penalty. The report catalogs the use of"
|
||||
" state-sanctioned killing as a punitive measure across the globe. At least 607 people were executed around the"
|
||||
" world in 2014, compared to 778 in 2013. The U.S. remains one of the worst offenders for imposing capital"
|
||||
" punishment.",
|
||||
]
|
||||
|
||||
TGT = [
|
||||
'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .'
|
||||
' Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz'
|
||||
" had informed his Lufthansa training school of an episode of severe depression, airline says .",
|
||||
"Membership gives the ICC jurisdiction over alleged crimes committed in Palestinian territories since last June ."
|
||||
" Israel and the United States opposed the move, which could open the door to war crimes investigations against"
|
||||
" Israelis .",
|
||||
"Amnesty's annual death penalty report catalogs encouraging signs, but setbacks in numbers of those sentenced to"
|
||||
" death . Organization claims that governments around the world are using the threat of terrorism to advance"
|
||||
" executions . The number of executions worldwide has gone down by almost 22% compared with 2013, but death"
|
||||
" sentences up by 28% .",
|
||||
]
|
||||
|
||||
|
||||
def test_disaggregated_scores_are_deterministic():
|
||||
no_aggregation = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2", "rougeL"])
|
||||
assert isinstance(no_aggregation, defaultdict)
|
||||
no_aggregation_just_r2 = calculate_rouge(PRED, TGT, bootstrap_aggregation=False, rouge_keys=["rouge2"])
|
||||
assert (
|
||||
pd.DataFrame(no_aggregation["rouge2"]).fmeasure.mean()
|
||||
== pd.DataFrame(no_aggregation_just_r2["rouge2"]).fmeasure.mean()
|
||||
)
|
||||
|
||||
|
||||
def test_newline_cnn_improvement():
|
||||
k = "rougeLsum"
|
||||
score = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=[k])[k]
|
||||
score_no_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=[k])[k]
|
||||
assert score > score_no_sep
|
||||
|
||||
|
||||
def test_newline_irrelevant_for_other_metrics():
|
||||
k = ["rouge1", "rouge2", "rougeL"]
|
||||
score_sep = calculate_rouge(PRED, TGT, newline_sep=True, rouge_keys=k)
|
||||
score_no_sep = calculate_rouge(PRED, TGT, newline_sep=False, rouge_keys=k)
|
||||
assert score_sep == score_no_sep
|
||||
|
||||
|
||||
def test_single_sent_scores_dont_depend_on_newline_sep():
|
||||
pred = [
|
||||
"Her older sister, Margot Frank, died in 1945, a month earlier than previously thought.",
|
||||
'Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports .',
|
||||
]
|
||||
tgt = [
|
||||
"Margot Frank, died in 1945, a month earlier than previously thought.",
|
||||
'Prosecutor: "No videos were used in the crash investigation" German papers say they saw a cell phone video of'
|
||||
" the final seconds on board Flight 9525.",
|
||||
]
|
||||
assert calculate_rouge(pred, tgt, newline_sep=True) == calculate_rouge(pred, tgt, newline_sep=False)
|
||||
|
||||
|
||||
def test_pegasus_newline():
|
||||
pred = [
|
||||
"""" "a person who has such a video needs to immediately give it to the investigators," prosecutor says .<n> "it is a very disturbing scene," editor-in-chief of bild online tells "erin burnett: outfront" """
|
||||
]
|
||||
tgt = [
|
||||
""" Marseille prosecutor says "so far no videos were used in the crash investigation" despite media reports . Journalists at Bild and Paris Match are "very confident" the video clip is real, an editor says . Andreas Lubitz had informed his Lufthansa training school of an episode of severe depression, airline says ."""
|
||||
]
|
||||
|
||||
prev_score = calculate_rouge(pred, tgt, rouge_keys=["rougeLsum"], newline_sep=False)["rougeLsum"]
|
||||
new_score = calculate_rouge(pred, tgt, rouge_keys=["rougeLsum"])["rougeLsum"]
|
||||
assert new_score > prev_score
|
||||
|
||||
|
||||
def test_rouge_cli():
|
||||
data_dir = Path("examples/seq2seq/test_data/wmt_en_ro")
|
||||
metrics = calculate_rouge_path(data_dir.joinpath("test.source"), data_dir.joinpath("test.target"))
|
||||
assert isinstance(metrics, dict)
|
||||
metrics_default_dict = calculate_rouge_path(
|
||||
data_dir.joinpath("test.source"), data_dir.joinpath("test.target"), bootstrap_aggregation=False
|
||||
)
|
||||
assert isinstance(metrics_default_dict, defaultdict)
|
||||
@@ -1,247 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from pack_dataset import pack_data_dir
|
||||
from parameterized import parameterized
|
||||
from save_len_file import save_len_file
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from transformers.models.mbart.modeling_mbart import shift_tokens_right
|
||||
from transformers.testing_utils import TestCasePlus, slow
|
||||
from utils import FAIRSEQ_AVAILABLE, DistributedSortishSampler, LegacySeq2SeqDataset, Seq2SeqDataset
|
||||
|
||||
|
||||
BERT_BASE_CASED = "google-bert/bert-base-cased"
|
||||
PEGASUS_XSUM = "google/pegasus-xsum"
|
||||
ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
|
||||
SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
|
||||
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
||||
BART_TINY = "sshleifer/bart-tiny-random"
|
||||
MBART_TINY = "sshleifer/tiny-mbart"
|
||||
MARIAN_TINY = "sshleifer/tiny-marian-en-de"
|
||||
|
||||
|
||||
def _dump_articles(path: Path, articles: list):
|
||||
content = "\n".join(articles)
|
||||
Path(path).open("w").writelines(content)
|
||||
|
||||
|
||||
def make_test_data_dir(tmp_dir):
|
||||
for split in ["train", "val", "test"]:
|
||||
_dump_articles(os.path.join(tmp_dir, f"{split}.source"), ARTICLES)
|
||||
_dump_articles(os.path.join(tmp_dir, f"{split}.target"), SUMMARIES)
|
||||
return tmp_dir
|
||||
|
||||
|
||||
class TestAll(TestCasePlus):
|
||||
@parameterized.expand(
|
||||
[
|
||||
MBART_TINY,
|
||||
MARIAN_TINY,
|
||||
T5_TINY,
|
||||
BART_TINY,
|
||||
PEGASUS_XSUM,
|
||||
],
|
||||
)
|
||||
@slow
|
||||
def test_seq2seq_dataset_truncation(self, tok_name):
|
||||
tokenizer = AutoTokenizer.from_pretrained(tok_name)
|
||||
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
|
||||
max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
|
||||
max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
|
||||
max_src_len = 4
|
||||
max_tgt_len = 8
|
||||
assert max_len_target > max_src_len # Will be truncated
|
||||
assert max_len_source > max_src_len # Will be truncated
|
||||
src_lang, tgt_lang = "ro_RO", "de_DE" # ignored for all but mbart, but never causes error.
|
||||
train_dataset = Seq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir=tmp_dir,
|
||||
type_path="train",
|
||||
max_source_length=max_src_len,
|
||||
max_target_length=max_tgt_len, # ignored
|
||||
src_lang=src_lang,
|
||||
tgt_lang=tgt_lang,
|
||||
)
|
||||
dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
|
||||
for batch in dataloader:
|
||||
assert isinstance(batch, dict)
|
||||
assert batch["attention_mask"].shape == batch["input_ids"].shape
|
||||
# show that articles were trimmed.
|
||||
assert batch["input_ids"].shape[1] == max_src_len
|
||||
# show that targets are the same len
|
||||
assert batch["labels"].shape[1] == max_tgt_len
|
||||
if tok_name != MBART_TINY:
|
||||
continue
|
||||
# check language codes in correct place
|
||||
batch["decoder_input_ids"] = shift_tokens_right(batch["labels"], tokenizer.pad_token_id)
|
||||
assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
|
||||
assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
|
||||
assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
|
||||
assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
|
||||
|
||||
break # No need to test every batch
|
||||
|
||||
@parameterized.expand([BART_TINY, BERT_BASE_CASED])
|
||||
def test_legacy_dataset_truncation(self, tok):
|
||||
tokenizer = AutoTokenizer.from_pretrained(tok)
|
||||
tmp_dir = make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir())
|
||||
max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
|
||||
max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
|
||||
trunc_target = 4
|
||||
train_dataset = LegacySeq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir=tmp_dir,
|
||||
type_path="train",
|
||||
max_source_length=20,
|
||||
max_target_length=trunc_target,
|
||||
)
|
||||
dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
|
||||
for batch in dataloader:
|
||||
assert batch["attention_mask"].shape == batch["input_ids"].shape
|
||||
# show that articles were trimmed.
|
||||
assert batch["input_ids"].shape[1] == max_len_source
|
||||
assert 20 >= batch["input_ids"].shape[1] # trimmed significantly
|
||||
# show that targets were truncated
|
||||
assert batch["labels"].shape[1] == trunc_target # Truncated
|
||||
assert max_len_target > trunc_target # Truncated
|
||||
break # No need to test every batch
|
||||
|
||||
def test_pack_dataset(self):
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
|
||||
|
||||
tmp_dir = Path(make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()))
|
||||
orig_examples = tmp_dir.joinpath("train.source").open().readlines()
|
||||
save_dir = Path(make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()))
|
||||
pack_data_dir(tokenizer, tmp_dir, 128, save_dir)
|
||||
orig_paths = {x.name for x in tmp_dir.iterdir()}
|
||||
new_paths = {x.name for x in save_dir.iterdir()}
|
||||
packed_examples = save_dir.joinpath("train.source").open().readlines()
|
||||
# orig: [' Sam ate lunch today.\n', 'Sams lunch ingredients.']
|
||||
# desired_packed: [' Sam ate lunch today.\n Sams lunch ingredients.']
|
||||
assert len(packed_examples) < len(orig_examples)
|
||||
assert len(packed_examples) == 1
|
||||
assert len(packed_examples[0]) == sum(len(x) for x in orig_examples)
|
||||
assert orig_paths == new_paths
|
||||
|
||||
@pytest.mark.skipif(not FAIRSEQ_AVAILABLE, reason="This test requires fairseq")
|
||||
def test_dynamic_batch_size(self):
|
||||
if not FAIRSEQ_AVAILABLE:
|
||||
return
|
||||
ds, max_tokens, tokenizer = self._get_dataset(max_len=64)
|
||||
required_batch_size_multiple = 64
|
||||
batch_sampler = ds.make_dynamic_sampler(max_tokens, required_batch_size_multiple=required_batch_size_multiple)
|
||||
batch_sizes = [len(x) for x in batch_sampler]
|
||||
assert len(set(batch_sizes)) > 1 # it's not dynamic batch size if every batch is the same length
|
||||
assert sum(batch_sizes) == len(ds) # no dropped or added examples
|
||||
data_loader = DataLoader(ds, batch_sampler=batch_sampler, collate_fn=ds.collate_fn, num_workers=2)
|
||||
failures = []
|
||||
num_src_per_batch = []
|
||||
for batch in data_loader:
|
||||
src_shape = batch["input_ids"].shape
|
||||
bs = src_shape[0]
|
||||
assert bs % required_batch_size_multiple == 0 or bs < required_batch_size_multiple
|
||||
num_src_tokens = np.prod(batch["input_ids"].shape)
|
||||
num_src_per_batch.append(num_src_tokens)
|
||||
if num_src_tokens > (max_tokens * 1.1):
|
||||
failures.append(num_src_tokens)
|
||||
assert num_src_per_batch[0] == max(num_src_per_batch)
|
||||
if failures:
|
||||
raise AssertionError(f"too many tokens in {len(failures)} batches")
|
||||
|
||||
def test_sortish_sampler_reduces_padding(self):
|
||||
ds, _, tokenizer = self._get_dataset(max_len=512)
|
||||
bs = 2
|
||||
sortish_sampler = ds.make_sortish_sampler(bs, shuffle=False)
|
||||
|
||||
naive_dl = DataLoader(ds, batch_size=bs, collate_fn=ds.collate_fn, num_workers=2)
|
||||
sortish_dl = DataLoader(ds, batch_size=bs, collate_fn=ds.collate_fn, num_workers=2, sampler=sortish_sampler)
|
||||
|
||||
pad = tokenizer.pad_token_id
|
||||
|
||||
def count_pad_tokens(data_loader, k="input_ids"):
|
||||
return [batch[k].eq(pad).sum().item() for batch in data_loader]
|
||||
|
||||
assert sum(count_pad_tokens(sortish_dl, k="labels")) < sum(count_pad_tokens(naive_dl, k="labels"))
|
||||
assert sum(count_pad_tokens(sortish_dl)) < sum(count_pad_tokens(naive_dl))
|
||||
assert len(sortish_dl) == len(naive_dl)
|
||||
|
||||
def _get_dataset(self, n_obs=1000, max_len=128):
|
||||
if os.getenv("USE_REAL_DATA", None):
|
||||
data_dir = "examples/seq2seq/wmt_en_ro"
|
||||
max_tokens = max_len * 2 * 64
|
||||
if not Path(data_dir).joinpath("train.len").exists():
|
||||
save_len_file(MARIAN_TINY, data_dir)
|
||||
else:
|
||||
data_dir = "examples/seq2seq/test_data/wmt_en_ro"
|
||||
max_tokens = max_len * 4
|
||||
save_len_file(MARIAN_TINY, data_dir)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(MARIAN_TINY)
|
||||
ds = Seq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir=data_dir,
|
||||
type_path="train",
|
||||
max_source_length=max_len,
|
||||
max_target_length=max_len,
|
||||
n_obs=n_obs,
|
||||
)
|
||||
return ds, max_tokens, tokenizer
|
||||
|
||||
def test_distributed_sortish_sampler_splits_indices_between_procs(self):
|
||||
ds, max_tokens, tokenizer = self._get_dataset()
|
||||
ids1 = set(DistributedSortishSampler(ds, 256, num_replicas=2, rank=0, add_extra_examples=False))
|
||||
ids2 = set(DistributedSortishSampler(ds, 256, num_replicas=2, rank=1, add_extra_examples=False))
|
||||
assert ids1.intersection(ids2) == set()
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
MBART_TINY,
|
||||
MARIAN_TINY,
|
||||
T5_TINY,
|
||||
BART_TINY,
|
||||
PEGASUS_XSUM,
|
||||
],
|
||||
)
|
||||
def test_dataset_kwargs(self, tok_name):
|
||||
tokenizer = AutoTokenizer.from_pretrained(tok_name, use_fast=False)
|
||||
if tok_name == MBART_TINY:
|
||||
train_dataset = Seq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
|
||||
type_path="train",
|
||||
max_source_length=4,
|
||||
max_target_length=8,
|
||||
src_lang="EN",
|
||||
tgt_lang="FR",
|
||||
)
|
||||
kwargs = train_dataset.dataset_kwargs
|
||||
assert "src_lang" in kwargs and "tgt_lang" in kwargs
|
||||
else:
|
||||
train_dataset = Seq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir=make_test_data_dir(tmp_dir=self.get_auto_remove_tmp_dir()),
|
||||
type_path="train",
|
||||
max_source_length=4,
|
||||
max_target_length=8,
|
||||
)
|
||||
kwargs = train_dataset.dataset_kwargs
|
||||
assert "add_prefix_space" not in kwargs if tok_name != BART_TINY else "add_prefix_space" in kwargs
|
||||
assert len(kwargs) == 1 if tok_name == BART_TINY else len(kwargs) == 0
|
||||
@@ -1,70 +0,0 @@
|
||||
# Copyright 2020 Huggingface
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import unittest
|
||||
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
|
||||
from transformers.testing_utils import get_tests_dir, require_torch, slow, torch_device
|
||||
from utils import calculate_bleu
|
||||
|
||||
|
||||
filename = get_tests_dir() + "/test_data/fsmt/fsmt_val_data.json"
|
||||
with open(filename, encoding="utf-8") as f:
|
||||
bleu_data = json.load(f)
|
||||
|
||||
|
||||
@require_torch
|
||||
class ModelEvalTester(unittest.TestCase):
|
||||
def get_tokenizer(self, mname):
|
||||
return FSMTTokenizer.from_pretrained(mname)
|
||||
|
||||
def get_model(self, mname):
|
||||
model = FSMTForConditionalGeneration.from_pretrained(mname).to(torch_device)
|
||||
if torch_device == "cuda":
|
||||
model.half()
|
||||
return model
|
||||
|
||||
@parameterized.expand(
|
||||
[
|
||||
["en-ru", 26.0],
|
||||
["ru-en", 22.0],
|
||||
["en-de", 22.0],
|
||||
["de-en", 29.0],
|
||||
]
|
||||
)
|
||||
@slow
|
||||
def test_bleu_scores(self, pair, min_bleu_score):
|
||||
# note: this test is not testing the best performance since it only evals a small batch
|
||||
# but it should be enough to detect a regression in the output quality
|
||||
mname = f"facebook/wmt19-{pair}"
|
||||
tokenizer = self.get_tokenizer(mname)
|
||||
model = self.get_model(mname)
|
||||
|
||||
src_sentences = bleu_data[pair]["src"]
|
||||
tgt_sentences = bleu_data[pair]["tgt"]
|
||||
|
||||
batch = tokenizer(src_sentences, return_tensors="pt", truncation=True, padding="longest").to(torch_device)
|
||||
outputs = model.generate(
|
||||
input_ids=batch.input_ids,
|
||||
num_beams=8,
|
||||
)
|
||||
decoded_sentences = tokenizer.batch_decode(
|
||||
outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False
|
||||
)
|
||||
scores = calculate_bleu(decoded_sentences, tgt_sentences)
|
||||
print(scores)
|
||||
self.assertGreaterEqual(scores["bleu"], min_bleu_score)
|
||||
@@ -1,132 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
from parameterized import parameterized
|
||||
from run_eval import run_generate
|
||||
from run_eval_search import run_search
|
||||
|
||||
from transformers.testing_utils import CaptureStdout, TestCasePlus, slow
|
||||
from utils import ROUGE_KEYS
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
logger = logging.getLogger()
|
||||
|
||||
|
||||
def _dump_articles(path: Path, articles: list):
|
||||
content = "\n".join(articles)
|
||||
Path(path).open("w").writelines(content)
|
||||
|
||||
|
||||
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
||||
BART_TINY = "sshleifer/bart-tiny-random"
|
||||
MBART_TINY = "sshleifer/tiny-mbart"
|
||||
|
||||
stream_handler = logging.StreamHandler(sys.stdout)
|
||||
logger.addHandler(stream_handler)
|
||||
logging.disable(logging.CRITICAL) # remove noisy download output from tracebacks
|
||||
|
||||
|
||||
class TestTheRest(TestCasePlus):
|
||||
def run_eval_tester(self, model):
|
||||
input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
|
||||
output_file_name = input_file_name.parent / "utest_output.txt"
|
||||
assert not output_file_name.exists()
|
||||
articles = [" New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County."]
|
||||
_dump_articles(input_file_name, articles)
|
||||
|
||||
score_path = str(Path(self.get_auto_remove_tmp_dir()) / "scores.json")
|
||||
task = "translation_en_to_de" if model == T5_TINY else "summarization"
|
||||
testargs = f"""
|
||||
run_eval_search.py
|
||||
{model}
|
||||
{input_file_name}
|
||||
{output_file_name}
|
||||
--score_path {score_path}
|
||||
--task {task}
|
||||
--num_beams 2
|
||||
--length_penalty 2.0
|
||||
""".split()
|
||||
|
||||
with patch.object(sys, "argv", testargs):
|
||||
run_generate()
|
||||
assert Path(output_file_name).exists()
|
||||
# os.remove(Path(output_file_name))
|
||||
|
||||
# test one model quickly (not @slow) to catch simple problems, and do extensive
# functionality testing with multiple models separately as @slow
|
||||
def test_run_eval(self):
|
||||
self.run_eval_tester(T5_TINY)
|
||||
|
||||
# any extra models should go into the list here - can be slow
|
||||
@parameterized.expand([BART_TINY, MBART_TINY])
|
||||
@slow
|
||||
def test_run_eval_slow(self, model):
|
||||
self.run_eval_tester(model)
|
||||
|
||||
# testing with 2 models to validate: 1. translation (t5) 2. summarization (mbart)
|
||||
@parameterized.expand([T5_TINY, MBART_TINY])
|
||||
@slow
|
||||
def test_run_eval_search(self, model):
|
||||
input_file_name = Path(self.get_auto_remove_tmp_dir()) / "utest_input.source"
|
||||
output_file_name = input_file_name.parent / "utest_output.txt"
|
||||
assert not output_file_name.exists()
|
||||
|
||||
text = {
|
||||
"en": ["Machine learning is great, isn't it?", "I like to eat bananas", "Tomorrow is another great day!"],
|
||||
"de": [
|
||||
"Maschinelles Lernen ist großartig, oder?",
|
||||
"Ich esse gerne Bananen",
|
||||
"Morgen ist wieder ein toller Tag!",
|
||||
],
|
||||
}
|
||||
|
||||
tmp_dir = Path(self.get_auto_remove_tmp_dir())
|
||||
score_path = str(tmp_dir / "scores.json")
|
||||
reference_path = str(tmp_dir / "val.target")
|
||||
_dump_articles(input_file_name, text["en"])
|
||||
_dump_articles(reference_path, text["de"])
|
||||
task = "translation_en_to_de" if model == T5_TINY else "summarization"
|
||||
testargs = f"""
|
||||
run_eval_search.py
|
||||
{model}
|
||||
{str(input_file_name)}
|
||||
{str(output_file_name)}
|
||||
--score_path {score_path}
|
||||
--reference_path {reference_path}
|
||||
--task {task}
|
||||
""".split()
|
||||
testargs.extend(["--search", "num_beams=1:2 length_penalty=0.9:1.0"])
|
||||
|
||||
with patch.object(sys, "argv", testargs):
|
||||
with CaptureStdout() as cs:
|
||||
run_search()
|
||||
expected_strings = [" num_beams | length_penalty", model, "Best score args"]
|
||||
un_expected_strings = ["Info"]
|
||||
if "translation" in task:
|
||||
expected_strings.append("bleu")
|
||||
else:
|
||||
expected_strings.extend(ROUGE_KEYS)
|
||||
for w in expected_strings:
|
||||
assert w in cs.out
|
||||
for w in un_expected_strings:
|
||||
assert w not in cs.out
|
||||
assert Path(output_file_name).exists()
|
||||
os.remove(Path(output_file_name))
|
||||
@@ -1,55 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
# Because of their complexity, multi-GPU tests could impact other tests, and to aid debugging we keep them in a separate module.
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from transformers.testing_utils import TestCasePlus, execute_subprocess_async, get_gpu_count, require_torch_gpu, slow
|
||||
|
||||
from .utils import load_json
|
||||
|
||||
|
||||
class TestSummarizationDistillerMultiGPU(TestCasePlus):
|
||||
@classmethod
|
||||
def setUpClass(cls):
|
||||
return cls
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
def test_distributed_eval(self):
|
||||
output_dir = self.get_auto_remove_tmp_dir()
|
||||
args = f"""
|
||||
--model_name Helsinki-NLP/opus-mt-en-ro
|
||||
--save_dir {output_dir}
|
||||
--data_dir {self.test_file_dir_str}/test_data/wmt_en_ro
|
||||
--num_beams 2
|
||||
--task translation
|
||||
""".split()
|
||||
|
||||
# we want this test to run even if there is only one GPU, but if there are more we use them all
|
||||
n_gpu = get_gpu_count()
|
||||
distributed_args = f"""
|
||||
-m torch.distributed.launch
|
||||
--nproc_per_node={n_gpu}
|
||||
{self.test_file_dir}/run_distributed_eval.py
|
||||
""".split()
|
||||
cmd = [sys.executable] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
metrics_save_path = os.path.join(output_dir, "test_bleu.json")
|
||||
metrics = load_json(metrics_save_path)
|
||||
# print(metrics)
|
||||
self.assertGreaterEqual(metrics["bleu"], 25)
|
||||
@@ -1,38 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import tempfile
import unittest

from transformers.models.marian.convert_marian_tatoeba_to_pytorch import DEFAULT_REPO, TatoebaConverter
from transformers.testing_utils import slow
from transformers.utils import cached_property


@unittest.skipUnless(os.path.exists(DEFAULT_REPO), "Tatoeba directory does not exist.")
class TatoebaConversionTester(unittest.TestCase):
    @cached_property
    def resolver(self):
        tmp_dir = tempfile.mkdtemp()
        return TatoebaConverter(save_dir=tmp_dir)

    @slow
    def test_resolver(self):
        self.resolver.convert_models(["heb-eng"])

    @slow
    def test_model_card(self):
        content, mmeta = self.resolver.write_model_card("opus-mt-he-en", dry_run=True)
        assert mmeta["long_pair"] == "heb-eng"
@@ -1,87 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""Fill examples with bitext up to max_tokens without breaking up examples.
|
||||
[['I went', 'yo fui'],
|
||||
['to the store', 'a la tienda']
|
||||
]
|
||||
=> ['I went to the store', 'yo fui a la tienda']
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
def pack_examples(tok, src_examples, tgt_examples, max_tokens=1024):
|
||||
finished_src, finished_tgt = [], []
|
||||
|
||||
sorted_examples = list(zip(src_examples, tgt_examples))
|
||||
new_src, new_tgt = sorted_examples[0]
|
||||
|
||||
def is_too_big(strang):
|
||||
return tok(strang, return_tensors="pt").input_ids.shape[1] > max_tokens
|
||||
|
||||
for src, tgt in tqdm(sorted_examples[1:]):
|
||||
cand_src = new_src + " " + src
|
||||
cand_tgt = new_tgt + " " + tgt
|
||||
if is_too_big(cand_src) or is_too_big(cand_tgt): # can't fit, finalize example
|
||||
finished_src.append(new_src)
|
||||
finished_tgt.append(new_tgt)
|
||||
new_src, new_tgt = src, tgt
|
||||
else: # can fit, keep adding
|
||||
new_src, new_tgt = cand_src, cand_tgt
|
||||
|
||||
# cleanup
|
||||
if new_src:
|
||||
assert new_tgt
|
||||
finished_src.append(new_src)
|
||||
finished_tgt.append(new_tgt)
|
||||
return finished_src, finished_tgt
|
||||
|
||||
|
||||
def pack_data_dir(tok, data_dir: Path, max_tokens, save_path):
|
||||
save_path = Path(save_path)
|
||||
save_path.mkdir(exist_ok=True)
|
||||
for split in ["train"]:
|
||||
src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
|
||||
src_docs = [x.rstrip() for x in Path(src_path).open()]
|
||||
tgt_docs = [x.rstrip() for x in Path(tgt_path).open()]
|
||||
packed_src, packed_tgt = pack_examples(tok, src_docs, tgt_docs, max_tokens)
|
||||
print(f"packed {split} split from {len(src_docs)} examples -> {len(packed_src)}.")
|
||||
Path(save_path / f"{split}.source").open("w").write("\n".join(packed_src))
|
||||
Path(save_path / f"{split}.target").open("w").write("\n".join(packed_tgt))
|
||||
for split in ["val", "test"]:
|
||||
src_path, tgt_path = data_dir / f"{split}.source", data_dir / f"{split}.target"
|
||||
shutil.copyfile(src_path, save_path / f"{split}.source")
|
||||
shutil.copyfile(tgt_path, save_path / f"{split}.target")
|
||||
|
||||
|
||||
def packer_cli():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--tok_name", type=str, help="like facebook/bart-large-cnn,google-t5/t5-base, etc.")
|
||||
parser.add_argument("--max_seq_len", type=int, default=128)
|
||||
parser.add_argument("--data_dir", type=str)
|
||||
parser.add_argument("--save_path", type=str)
|
||||
args = parser.parse_args()
|
||||
tokenizer = AutoTokenizer.from_pretrained(args.tok_name)
|
||||
return pack_data_dir(tokenizer, Path(args.data_dir), args.max_seq_len, args.save_path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
packer_cli()
|
||||
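# Usage sketch, assuming this file is saved as pack_dataset.py (the values are placeholders;
# the flags mirror the argparse definitions in packer_cli above):
#   python pack_dataset.py --tok_name facebook/bart-large-cnn --max_seq_len 128 \
#       --data_dir path/to/wmt_en_ro --save_path path/to/packed_wmt_en_ro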
@@ -1,20 +0,0 @@
tensorboard
scikit-learn
seqeval
psutil
sacrebleu
rouge-score
tensorflow_datasets
matplotlib
git-python==1.0.3
faiss-cpu
streamlit
elasticsearch
nltk
pandas
datasets >= 1.1.3
fire
pytest<8.0.1
conllu
sentencepiece != 0.1.92
protobuf
@@ -1,65 +0,0 @@
### Motivation
Without post-processing, the English->Romanian mbart-large-en-ro model gets a BLEU score of 26.8 on the WMT data.
With post-processing, it can score 37.
Here is the post-processing code, stolen from @mjpost in this [issue](https://github.com/pytorch/fairseq/issues/1758)


### Instructions

Note: You need to have your test_generations.txt before you start this process.
(1) Set up `mosesdecoder` and `wmt16-scripts`
```bash
cd $HOME
git clone git@github.com:moses-smt/mosesdecoder.git
cd mosesdecoder
git clone git@github.com:rsennrich/wmt16-scripts.git
```

(2) Define a function for post-processing.
It removes diacritics and applies the standard Moses/WMT16 normalization and tokenization steps.
```bash
ro_post_process () {
  sys=$1
  ref=$2
  export MOSES_PATH=$HOME/mosesdecoder
  REPLACE_UNICODE_PUNCT=$MOSES_PATH/scripts/tokenizer/replace-unicode-punctuation.perl
  NORM_PUNC=$MOSES_PATH/scripts/tokenizer/normalize-punctuation.perl
  REM_NON_PRINT_CHAR=$MOSES_PATH/scripts/tokenizer/remove-non-printing-char.perl
  REMOVE_DIACRITICS=$MOSES_PATH/wmt16-scripts/preprocess/remove-diacritics.py
  NORMALIZE_ROMANIAN=$MOSES_PATH/wmt16-scripts/preprocess/normalise-romanian.py
  TOKENIZER=$MOSES_PATH/scripts/tokenizer/tokenizer.perl

  lang=ro
  for file in $sys $ref; do
    cat $file \
    | $REPLACE_UNICODE_PUNCT \
    | $NORM_PUNC -l $lang \
    | $REM_NON_PRINT_CHAR \
    | $NORMALIZE_ROMANIAN \
    | $REMOVE_DIACRITICS \
    | $TOKENIZER -no-escape -l $lang \
    > $(basename $file).tok
  done
  # compute BLEU
  cat $(basename $sys).tok | sacrebleu -tok none -s none -b $(basename $ref).tok
}
```

(3) Call the function on test_generations.txt and test.target
For example,
```bash
ro_post_process enro_finetune/test_generations.txt wmt_en_ro/test.target
```
This will print a new BLEU score and write a new file called `test_generations.tok` with the post-processed outputs.
@@ -1,31 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fire

from utils import calculate_rouge, save_json


def calculate_rouge_path(pred_path, tgt_path, save_path=None, **kwargs):
    """Kwargs will be passed to calculate_rouge"""
    pred_lns = [x.strip() for x in open(pred_path)]
    tgt_lns = [x.strip() for x in open(tgt_path)][: len(pred_lns)]
    metrics = calculate_rouge(pred_lns, tgt_lns, **kwargs)
    if save_path is not None:
        save_json(metrics, save_path, indent=None)
    return metrics  # these print nicely


if __name__ == "__main__":
    fire.Fire(calculate_rouge_path)
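# Usage sketch, assuming this file is saved as rouge_cli.py (the paths are placeholders;
# python-fire forwards extra flags such as --save_path to calculate_rouge_path):
#   python rouge_cli.py test_generations.txt test.target --save_path rouge_scores.json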
@@ -1,261 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import shutil
|
||||
import time
|
||||
from json import JSONDecodeError
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
from utils import (
|
||||
Seq2SeqDataset,
|
||||
calculate_bleu,
|
||||
calculate_rouge,
|
||||
chunks,
|
||||
lmap,
|
||||
load_json,
|
||||
parse_numeric_n_bool_cl_kwargs,
|
||||
save_json,
|
||||
use_task_specific_params,
|
||||
write_txt_file,
|
||||
)
|
||||
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
def eval_data_dir(
|
||||
data_dir,
|
||||
save_dir: str,
|
||||
model_name: str,
|
||||
bs: int = 8,
|
||||
max_source_length: int = 1024,
|
||||
type_path="val",
|
||||
n_obs=None,
|
||||
fp16=False,
|
||||
task="summarization",
|
||||
local_rank=None,
|
||||
num_return_sequences=1,
|
||||
dataset_kwargs: Optional[dict] = None,
|
||||
prefix="",
|
||||
**generate_kwargs,
|
||||
) -> dict:
|
||||
"""Run evaluation on part of the data for one gpu and save to {save_dir}/rank_{rank}_output.json"""
|
||||
model_name = str(model_name)
|
||||
assert local_rank is not None
|
||||
torch.distributed.init_process_group(backend="nccl", rank=local_rank)
|
||||
|
||||
save_dir = Path(save_dir)
|
||||
save_path = save_dir.joinpath(f"rank_{local_rank}_output.json")
|
||||
torch.cuda.set_device(local_rank)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).cuda()
|
||||
if fp16:
|
||||
model = model.half()
|
||||
# determine if we need to increase num_beams
|
||||
use_task_specific_params(model, task) # update config with task specific params
|
||||
num_beams = generate_kwargs.pop("num_beams", model.config.num_beams) # AttributeError risk?
|
||||
if num_return_sequences > num_beams:
|
||||
num_beams = num_return_sequences
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
logger.info(f"Inferred tokenizer type: {tokenizer.__class__}") # if this is wrong, check config.model_type.
|
||||
|
||||
if max_source_length is None:
|
||||
max_source_length = tokenizer.model_max_length
|
||||
if prefix is None:
|
||||
prefix = prefix or getattr(model.config, "prefix", "") or ""
|
||||
ds = Seq2SeqDataset(
|
||||
tokenizer,
|
||||
data_dir,
|
||||
max_source_length,
|
||||
max_target_length=1024,
|
||||
type_path=type_path,
|
||||
n_obs=n_obs,
|
||||
prefix=prefix,
|
||||
**dataset_kwargs,
|
||||
)
|
||||
# I set shuffle=True for a more accurate progress bar.
|
||||
# If all the longest samples are first, the prog bar estimate is too high at the beginning.
|
||||
sampler = ds.make_sortish_sampler(bs, distributed=True, add_extra_examples=False, shuffle=True)
|
||||
data_loader = DataLoader(ds, sampler=sampler, batch_size=bs, collate_fn=ds.collate_fn)
|
||||
results = []
|
||||
for batch in tqdm(data_loader):
|
||||
summaries = model.generate(
|
||||
input_ids=batch["input_ids"].to(model.device),
|
||||
attention_mask=batch["attention_mask"].to(model.device),
|
||||
num_return_sequences=num_return_sequences,
|
||||
num_beams=num_beams,
|
||||
**generate_kwargs,
|
||||
)
|
||||
preds = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
||||
ids = batch["ids"]
|
||||
if num_return_sequences > 1:
|
||||
preds = chunks(preds, num_return_sequences) # batch size chunks, each of size num_return_seq
|
||||
for i, pred in enumerate(preds):
|
||||
results.append({"pred": pred, "id": ids[i].item()})
|
||||
save_json(results, save_path)
|
||||
return results, sampler.num_replicas
|
||||
|
||||
|
||||
def run_generate():
|
||||
parser = argparse.ArgumentParser(
|
||||
epilog="Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate"
|
||||
)
|
||||
parser.add_argument("--data_dir", type=str, help="like cnn_dm/test.source")
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
help="like facebook/bart-large-cnn,google-t5/t5-base, etc.",
|
||||
default="sshleifer/distilbart-xsum-12-3",
|
||||
)
|
||||
parser.add_argument("--save_dir", type=str, help="where to save", default="tmp_gen")
|
||||
parser.add_argument("--max_source_length", type=int, default=None)
|
||||
parser.add_argument(
|
||||
"--type_path", type=str, default="test", help="which subset to evaluate typically train/val/test"
|
||||
)
|
||||
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
|
||||
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
|
||||
parser.add_argument(
|
||||
"--local_rank", type=int, default=-1, required=False, help="should be passed by distributed.launch"
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"--n_obs", type=int, default=None, required=False, help="How many observations. Defaults to all."
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_return_sequences", type=int, default=1, required=False, help="How many sequences to return"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--sync_timeout",
|
||||
type=int,
|
||||
default=600,
|
||||
required=False,
|
||||
help="How long should master process wait for other processes to finish.",
|
||||
)
|
||||
parser.add_argument("--src_lang", type=str, default=None, required=False)
|
||||
parser.add_argument("--tgt_lang", type=str, default=None, required=False)
|
||||
parser.add_argument(
|
||||
"--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
|
||||
)
|
||||
parser.add_argument("--fp16", action="store_true")
|
||||
parser.add_argument("--debug", action="store_true")
|
||||
start_time = time.time()
|
||||
args, rest = parser.parse_known_args()
|
||||
generate_kwargs = parse_numeric_n_bool_cl_kwargs(rest)
|
||||
if generate_kwargs and args.local_rank <= 0:
|
||||
print(f"parsed the following generate kwargs: {generate_kwargs}")
|
||||
json_save_dir = Path(args.save_dir + "_tmp")
|
||||
Path(json_save_dir).mkdir(exist_ok=True) # this handles locking.
|
||||
intermediate_files = list(json_save_dir.glob("rank_*.json"))
|
||||
if intermediate_files:
|
||||
raise ValueError(f"Found files at {json_save_dir} please move or remove them.")
|
||||
# In theory, a node could finish and save before another node hits this. If this happens, we can address later.
|
||||
dataset_kwargs = {}
|
||||
if args.src_lang is not None:
|
||||
dataset_kwargs["src_lang"] = args.src_lang
|
||||
if args.tgt_lang is not None:
|
||||
dataset_kwargs["tgt_lang"] = args.tgt_lang
|
||||
|
||||
Path(args.save_dir).mkdir(exist_ok=True)
|
||||
results, num_replicas = eval_data_dir(
|
||||
args.data_dir,
|
||||
json_save_dir,
|
||||
args.model_name,
|
||||
type_path=args.type_path,
|
||||
bs=args.bs,
|
||||
fp16=args.fp16,
|
||||
task=args.task,
|
||||
local_rank=args.local_rank,
|
||||
n_obs=args.n_obs,
|
||||
max_source_length=args.max_source_length,
|
||||
num_return_sequences=args.num_return_sequences,
|
||||
prefix=args.prefix,
|
||||
dataset_kwargs=dataset_kwargs,
|
||||
**generate_kwargs,
|
||||
)
|
||||
|
||||
if args.local_rank <= 0:
|
||||
save_dir = Path(args.save_dir)
|
||||
save_dir.mkdir(exist_ok=True)
|
||||
partial_results = gather_results_from_each_node(num_replicas, json_save_dir, args.sync_timeout)
|
||||
preds = combine_partial_results(partial_results)
|
||||
if args.num_return_sequences > 1:
|
||||
save_path = save_dir.joinpath("pseudolabel_results.json")
|
||||
print(f"Saving aggregated results at {save_path}, intermediate in {json_save_dir}/")
|
||||
save_json(preds, save_path)
|
||||
return
|
||||
tgt_file = Path(args.data_dir).joinpath(args.type_path + ".target")
|
||||
with open(tgt_file) as f:
|
||||
labels = [x.rstrip() for x in f][: len(preds)]
|
||||
|
||||
# Calculate metrics, save metrics, and save _generations.txt
|
||||
calc_bleu = "translation" in args.task
|
||||
score_fn = calculate_bleu if calc_bleu else calculate_rouge
|
||||
metric_name = "bleu" if calc_bleu else "rouge"
|
||||
metrics: dict = score_fn(preds, labels)
|
||||
metrics["n_obs"] = len(preds)
|
||||
runtime = time.time() - start_time
|
||||
metrics["seconds_per_sample"] = round(runtime / metrics["n_obs"], 4)
|
||||
metrics["n_gpus"] = num_replicas
|
||||
# TODO(@stas00): add whatever metadata to metrics
|
||||
metrics_save_path = save_dir.joinpath(f"{args.type_path}_{metric_name}.json")
|
||||
save_json(metrics, metrics_save_path, indent=None)
|
||||
print(metrics)
|
||||
write_txt_file(preds, save_dir.joinpath(f"{args.type_path}_generations.txt"))
|
||||
if args.debug:
|
||||
write_txt_file(labels, save_dir.joinpath(f"{args.type_path}.target"))
|
||||
else:
|
||||
shutil.rmtree(json_save_dir)
|
||||
|
||||
|
||||
def combine_partial_results(partial_results) -> list:
|
||||
"""Concatenate partial results into one file, then sort it by id."""
|
||||
records = []
|
||||
for partial_result in partial_results:
|
||||
records.extend(partial_result)
|
||||
records = sorted(records, key=lambda x: x["id"])
|
||||
preds = [x["pred"] for x in records]
|
||||
return preds
|
||||
|
||||
|
||||
def gather_results_from_each_node(num_replicas, save_dir, timeout) -> list[dict[str, list]]:
|
||||
# WAIT FOR lots of .json files
|
||||
start_wait = time.time()
|
||||
logger.info("waiting for all nodes to finish")
|
||||
json_data = None
|
||||
while (time.time() - start_wait) < timeout:
|
||||
json_files = list(save_dir.glob("rank_*.json"))
|
||||
if len(json_files) < num_replicas:
|
||||
continue
|
||||
try:
|
||||
# make sure all json files are fully saved
|
||||
json_data = lmap(load_json, json_files)
|
||||
return json_data
|
||||
except JSONDecodeError:
|
||||
continue
|
||||
raise TimeoutError("Rank 0 gave up on waiting for other processes")
|
||||
# Unreachable
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Usage for MT:
|
||||
run_generate()
|
||||
@ -1,184 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import time
|
||||
import warnings
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
from utils import calculate_bleu, calculate_rouge, chunks, parse_numeric_n_bool_cl_kwargs, use_task_specific_params
|
||||
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
DEFAULT_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
|
||||
def generate_summaries_or_translations(
|
||||
examples: list[str],
|
||||
out_file: str,
|
||||
model_name: str,
|
||||
batch_size: int = 8,
|
||||
device: str = DEFAULT_DEVICE,
|
||||
fp16=False,
|
||||
task="summarization",
|
||||
prefix=None,
|
||||
**generate_kwargs,
|
||||
) -> dict:
|
||||
"""Save model.generate results to <out_file>, and return how long it took."""
|
||||
fout = Path(out_file).open("w", encoding="utf-8")
|
||||
model_name = str(model_name)
|
||||
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
|
||||
if fp16:
|
||||
model = model.half()
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
logger.info(f"Inferred tokenizer type: {tokenizer.__class__}") # if this is wrong, check config.model_type.
|
||||
|
||||
start_time = time.time()
|
||||
# update config with task specific params
|
||||
use_task_specific_params(model, task)
|
||||
if prefix is None:
|
||||
prefix = getattr(model.config, "prefix", "") or ""
|
||||
for examples_chunk in tqdm(list(chunks(examples, batch_size))):
|
||||
examples_chunk = [prefix + text for text in examples_chunk]
|
||||
batch = tokenizer(examples_chunk, return_tensors="pt", truncation=True, padding="longest").to(device)
|
||||
summaries = model.generate(
|
||||
input_ids=batch.input_ids,
|
||||
attention_mask=batch.attention_mask,
|
||||
**generate_kwargs,
|
||||
)
|
||||
dec = tokenizer.batch_decode(summaries, skip_special_tokens=True, clean_up_tokenization_spaces=False)
|
||||
for hypothesis in dec:
|
||||
fout.write(hypothesis + "\n")
|
||||
fout.flush()
|
||||
fout.close()
|
||||
runtime = int(time.time() - start_time) # seconds
|
||||
n_obs = len(examples)
|
||||
return {"n_obs": n_obs, "runtime": runtime, "seconds_per_sample": round(runtime / n_obs, 4)}
|
||||
|
||||
|
||||
def datetime_now():
|
||||
return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
|
||||
def run_generate(verbose=True):
|
||||
"""
|
||||
|
||||
Takes input text, generates output, and then calculates BLEU (or ROUGE, depending on the task) scores against the reference.
|
||||
|
||||
The results are saved to a file and returned to the caller, and printed out unless ``verbose=False`` is passed.
|
||||
|
||||
Args:
|
||||
verbose (:obj:`bool`, `optional`, defaults to :obj:`True`): print results to stdout
|
||||
|
||||
Returns:
|
||||
a tuple: ``(scores, params)``
|
||||
- ``scores``: a dict of scores data ``{'bleu': 39.6501, 'n_obs': 2000, 'runtime': 186, 'seconds_per_sample': 0.093}``
|
||||
- ``params``: a dict of custom params, e.g. ``{'num_beams': 5, 'length_penalty': 0.8}``
|
||||
"""
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("model_name", type=str, help="like facebook/bart-large-cnn,google-t5/t5-base, etc.")
|
||||
parser.add_argument("input_path", type=str, help="like cnn_dm/test.source")
|
||||
parser.add_argument("save_path", type=str, help="where to save summaries")
|
||||
parser.add_argument("--reference_path", type=str, required=False, help="like cnn_dm/test.target")
|
||||
parser.add_argument("--score_path", type=str, required=False, default="metrics.json", help="where to save metrics")
|
||||
parser.add_argument("--device", type=str, required=False, default=DEFAULT_DEVICE, help="cuda, cuda:1, cpu etc.")
|
||||
parser.add_argument(
|
||||
"--prefix", type=str, required=False, default=None, help="will be added to the beginning of src examples"
|
||||
)
|
||||
parser.add_argument("--task", type=str, default="summarization", help="used for task_specific_params + metrics")
|
||||
parser.add_argument("--bs", type=int, default=8, required=False, help="batch size")
|
||||
parser.add_argument(
|
||||
"--n_obs", type=int, default=-1, required=False, help="How many observations. Defaults to all."
|
||||
)
|
||||
parser.add_argument("--fp16", action="store_true")
|
||||
parser.add_argument("--dump-args", action="store_true", help="print the custom hparams with the results")
|
||||
parser.add_argument(
|
||||
"--info",
|
||||
nargs="?",
|
||||
type=str,
|
||||
const=datetime_now(),
|
||||
help=(
|
||||
"use in conjunction w/ --dump-args to print with the results whatever other info you'd like, e.g."
|
||||
" lang=en-ru. If no value is passed, the current datetime string will be used."
|
||||
),
|
||||
)
|
||||
# Unspecified args like --num_beams=2 --decoder_start_token_id=4 are passed to model.generate
|
||||
args, rest = parser.parse_known_args()
|
||||
parsed_args = parse_numeric_n_bool_cl_kwargs(rest)
|
||||
if parsed_args and verbose:
|
||||
print(f"parsed the following generate kwargs: {parsed_args}")
|
||||
examples = [" " + x.rstrip() if "t5" in args.model_name else x.rstrip() for x in open(args.input_path)]
|
||||
if args.n_obs > 0:
|
||||
examples = examples[: args.n_obs]
|
||||
Path(args.save_path).parent.mkdir(exist_ok=True)
|
||||
|
||||
if args.reference_path is None and Path(args.score_path).exists():
|
||||
warnings.warn(f"score_path {args.score_path} will be overwritten unless you type ctrl-c.")
|
||||
|
||||
if args.device == "cpu" and args.fp16:
|
||||
# this mix leads to RuntimeError: "threshold_cpu" not implemented for 'Half'
|
||||
raise ValueError("Can't mix --fp16 and --device cpu")
|
||||
|
||||
runtime_metrics = generate_summaries_or_translations(
|
||||
examples,
|
||||
args.save_path,
|
||||
args.model_name,
|
||||
batch_size=args.bs,
|
||||
device=args.device,
|
||||
fp16=args.fp16,
|
||||
task=args.task,
|
||||
prefix=args.prefix,
|
||||
**parsed_args,
|
||||
)
|
||||
|
||||
if args.reference_path is None:
|
||||
return {}
|
||||
|
||||
# Compute scores
|
||||
score_fn = calculate_bleu if "translation" in args.task else calculate_rouge
|
||||
output_lns = [x.rstrip() for x in open(args.save_path)]
|
||||
reference_lns = [x.rstrip() for x in open(args.reference_path)][: len(output_lns)]
|
||||
scores: dict = score_fn(output_lns, reference_lns)
|
||||
scores.update(runtime_metrics)
|
||||
|
||||
if args.dump_args:
|
||||
scores.update(parsed_args)
|
||||
if args.info:
|
||||
scores["info"] = args.info
|
||||
|
||||
if verbose:
|
||||
print(scores)
|
||||
|
||||
if args.score_path is not None:
|
||||
json.dump(scores, open(args.score_path, "w"))
|
||||
|
||||
return scores
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Usage for MT:
|
||||
# python run_eval.py MODEL_NAME $DATA_DIR/test.source $save_dir/test_translations.txt --reference_path $DATA_DIR/test.target --score_path $save_dir/test_bleu.json --task translation $@
|
||||
run_generate(verbose=True)
|
||||
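Since `run_generate()` reads everything from `sys.argv`, it can also be driven programmatically. A minimal sketch, assuming `run_generate` is importable from this file; the model name, file paths and the extra `--num_beams` flag are placeholders rather than values this script prescribes:

```python
import sys

# Placeholder arguments: the input/reference files must exist, and any
# unrecognized flag such as --num_beams=2 is forwarded to model.generate.
sys.argv = [
    "run_eval.py",
    "sshleifer/tiny-mbart",        # model_name (placeholder)
    "wmt_en_ro/test.source",       # input_path (placeholder, must exist)
    "gens/test_translations.txt",  # save_path (placeholder)
    "--reference_path", "wmt_en_ro/test.target",
    "--score_path", "gens/test_bleu.json",
    "--task", "translation",
    "--num_beams=2",
]
scores = run_generate(verbose=True)
# scores is a dict like the one described in the docstring, e.g.
# {"bleu": ..., "n_obs": ..., "runtime": ..., "seconds_per_sample": ...}
```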
@ -1,158 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import itertools
|
||||
import operator
|
||||
import sys
|
||||
from collections import OrderedDict
|
||||
|
||||
from run_eval import datetime_now, run_generate
|
||||
|
||||
from utils import ROUGE_KEYS
|
||||
|
||||
|
||||
# A table of supported tasks and the list of scores in the order of importance to be sorted by.
|
||||
# To add a new task, simply list the score names that `run_eval.run_generate()` returns
|
||||
task_score_names = {
|
||||
"translation": ["bleu"],
|
||||
"summarization": ROUGE_KEYS,
|
||||
}
|
||||
|
||||
|
||||
def parse_search_arg(search):
|
||||
groups = search.split()
|
||||
entries = dict(g.split("=") for g in groups)
|
||||
entry_names = list(entries.keys())
|
||||
sets = [[f"--{k} {v}" for v in vs.split(":")] for k, vs in entries.items()]
|
||||
matrix = [list(x) for x in itertools.product(*sets)]
|
||||
return matrix, entry_names
|
||||
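For concreteness, this is what `parse_search_arg` returns for a small search string:

```python
matrix, entry_names = parse_search_arg("num_beams=5:10 length_penalty=0.8:1.0")
# entry_names == ["num_beams", "length_penalty"]
# matrix == [
#     ["--num_beams 5", "--length_penalty 0.8"],
#     ["--num_beams 5", "--length_penalty 1.0"],
#     ["--num_beams 10", "--length_penalty 0.8"],
#     ["--num_beams 10", "--length_penalty 1.0"],
# ]
```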
|
||||
|
||||
def run_search():
|
||||
"""
|
||||
Run parametric search over the desired hparam space with help of ``run_eval.py``.
|
||||
|
||||
All the arguments except ``--search`` are passed to ``run_eval.py`` as is. The values inside of "--search" are parsed, reformatted and fed to ``run_eval.py`` as additional args.
|
||||
|
||||
The format for the ``--search`` value is a simple string with hparams and colon separated values to try, e.g.:
|
||||
```
|
||||
--search "num_beams=5:10 length_penalty=0.8:1.0:1.2 early_stopping=true:false"
|
||||
```
|
||||
which will generate ``12`` ``(2*3*2)`` searches, one for each combination in the hparam product. For example, the search string shown above will invoke ``run_eval.py`` repeatedly with:
|
||||
|
||||
```
|
||||
--num_beams 5 --length_penalty 0.8 --early_stopping true
|
||||
--num_beams 5 --length_penalty 0.8 --early_stopping false
|
||||
[...]
|
||||
--num_beams 10 --length_penalty 1.2 --early_stopping false
|
||||
```
|
||||
|
||||
On completion, this function prints a markdown table of the results sorted by the best score (BLEU or ROUGE, depending on the task), followed by the winning arguments.
|
||||
|
||||
|
||||
"""
|
||||
prog = sys.argv[0]
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
usage=(
|
||||
"\n\nImportant: this script accepts all arguments `run_eval.py` accepts and then a few extra, therefore"
|
||||
" refer to `run_eval.py -h` for the complete list."
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
"--search",
|
||||
type=str,
|
||||
required=False,
|
||||
help='param space to search, e.g. "num_beams=5:10 length_penalty=0.8:1.0:1.2"',
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bs", type=int, default=8, required=False, help="initial batch size (may get reduced if it's too big)"
|
||||
)
|
||||
parser.add_argument("--task", type=str, help="used for task_specific_params + metrics")
|
||||
parser.add_argument(
|
||||
"--info",
|
||||
nargs="?",
|
||||
type=str,
|
||||
const=datetime_now(),
|
||||
help=(
|
||||
"add custom notes to be printed before the results table. If no value is passed, the current datetime"
|
||||
" string will be used."
|
||||
),
|
||||
)
|
||||
args, args_main = parser.parse_known_args()
|
||||
# we share some of the args
|
||||
args_main.extend(["--task", args.task])
|
||||
args_normal = [prog] + args_main
|
||||
|
||||
# to support variations like "translation_en_to_de"
|
||||
task = "translation" if "translation" in args.task else "summarization"
|
||||
|
||||
matrix, col_names = parse_search_arg(args.search)
|
||||
col_names[0:0] = task_score_names[task] # score cols first
|
||||
col_widths = {col: len(str(col)) for col in col_names}
|
||||
results = []
|
||||
for r in matrix:
|
||||
hparams = dict(x.replace("--", "").split() for x in r)
|
||||
args_exp = " ".join(r).split()
|
||||
args_exp.extend(["--bs", str(args.bs)]) # in case we need to reduce its size due to CUDA OOM
|
||||
sys.argv = args_normal + args_exp
|
||||
|
||||
# XXX: need to trap CUDA OOM and lower args.bs if that happens and retry
|
||||
|
||||
scores = run_generate(verbose=False)
|
||||
# make sure scores are first in the table
|
||||
result = OrderedDict()
|
||||
for score in task_score_names[task]:
|
||||
result[score] = scores[score]
|
||||
result.update(hparams)
|
||||
results.append(result)
|
||||
|
||||
# find widest entries
|
||||
for k, v in result.items():
|
||||
l = len(str(v))
|
||||
if l > col_widths[k]:
|
||||
col_widths[k] = l
|
||||
|
||||
results_sorted = sorted(results, key=operator.itemgetter(*task_score_names[task]), reverse=True)
|
||||
print(" | ".join([f"{col:{col_widths[col]}}" for col in col_names]))
|
||||
print(" | ".join([f"{'-' * col_widths[col]}" for col in col_names]))
|
||||
for row in results_sorted:
|
||||
print(" | ".join([f"{row[col]:{col_widths[col]}}" for col in col_names]))
|
||||
|
||||
best = results_sorted[0]
|
||||
for score in task_score_names[task]:
|
||||
del best[score]
|
||||
best_args = [f"--{k} {v}" for k, v in best.items()]
|
||||
dyn_args = ["--bs", str(args.bs)]
|
||||
if args.info:
|
||||
print(f"\nInfo: {args.info}")
|
||||
print("\nBest score args:")
|
||||
print(" ".join(args_main + best_args + dyn_args))
|
||||
|
||||
return results_sorted
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# Usage:
|
||||
# [normal run_eval_search.py cmd plus] \
|
||||
# --search="num_beams=1:5:10 length_penalty=0.8:1:1.2 early_stopping=true:false"
|
||||
#
|
||||
# Example:
|
||||
# PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_NAME \
|
||||
# $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target \
|
||||
# --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation \
|
||||
# --search="num_beams=1:5:10 length_penalty=0.8:1:1.2 early_stopping=true:false"
|
||||
run_search()
|
||||
@ -1,56 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import fire
|
||||
from torch.utils.data import DataLoader
|
||||
from tqdm import tqdm
|
||||
|
||||
from transformers import AutoTokenizer
|
||||
from utils import Seq2SeqDataset, pickle_save
|
||||
|
||||
|
||||
def save_len_file(
|
||||
tokenizer_name, data_dir, max_source_length=1024, max_target_length=1024, consider_target=False, **kwargs
|
||||
):
|
||||
"""Save max(src_len, tgt_len) for each example to allow dynamic batching."""
|
||||
tok = AutoTokenizer.from_pretrained(tokenizer_name)
|
||||
train_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="train", **kwargs)
|
||||
pad = tok.pad_token_id
|
||||
|
||||
def get_lens(ds):
|
||||
dl = tqdm(
|
||||
DataLoader(ds, batch_size=512, num_workers=8, shuffle=False, collate_fn=ds.collate_fn),
|
||||
desc=str(ds.len_file),
|
||||
)
|
||||
max_lens = []
|
||||
for batch in dl:
|
||||
src_lens = batch["input_ids"].ne(pad).sum(1).tolist()
|
||||
tgt_lens = batch["labels"].ne(pad).sum(1).tolist()
|
||||
if consider_target:
|
||||
for src, tgt in zip(src_lens, tgt_lens):
|
||||
max_lens.append(max(src, tgt))
|
||||
else:
|
||||
max_lens.extend(src_lens)
|
||||
return max_lens
|
||||
|
||||
train_lens = get_lens(train_ds)
|
||||
val_ds = Seq2SeqDataset(tok, data_dir, max_source_length, max_target_length, type_path="val", **kwargs)
|
||||
val_lens = get_lens(val_ds)
|
||||
pickle_save(train_lens, train_ds.len_file)
|
||||
pickle_save(val_lens, val_ds.len_file)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(save_len_file)
|
||||
@ -1,39 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import fire
|
||||
|
||||
from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer
|
||||
|
||||
|
||||
def save_randomly_initialized_version(config_name: str, save_dir: str, **config_kwargs):
|
||||
"""Save a randomly initialized version of a model using a pretrained config.
|
||||
Args:
|
||||
config_name: which config to use
|
||||
save_dir: where to save the resulting model and tokenizer
|
||||
config_kwargs: Passed to AutoConfig
|
||||
|
||||
Usage::
|
||||
save_randomly_initialized_version("facebook/bart-large-cnn", "distilbart_random_cnn_6_3", encoder_layers=6, decoder_layers=3, num_beams=3)
|
||||
"""
|
||||
cfg = AutoConfig.from_pretrained(config_name, **config_kwargs)
|
||||
model = AutoModelForSeq2SeqLM.from_config(cfg)
|
||||
model.save_pretrained(save_dir)
|
||||
AutoTokenizer.from_pretrained(config_name).save_pretrained(save_dir)
|
||||
return model
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fire.Fire(save_randomly_initialized_version)
|
||||
@ -1,35 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import re
|
||||
|
||||
from filelock import FileLock
|
||||
|
||||
|
||||
try:
|
||||
import nltk
|
||||
|
||||
NLTK_AVAILABLE = True
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
NLTK_AVAILABLE = False
|
||||
|
||||
if NLTK_AVAILABLE:
|
||||
with FileLock(".lock") as lock:
|
||||
nltk.download("punkt", quiet=True)
|
||||
|
||||
|
||||
def add_newline_to_end_of_each_sentence(x: str) -> str:
|
||||
"""This was added to get rougeLsum scores matching published rougeL scores for BART and PEGASUS."""
|
||||
re.sub("<n>", "", x) # remove pegasus newline char
|
||||
assert NLTK_AVAILABLE, "nltk must be installed to separate newlines between sentences. (pip install nltk)"
|
||||
return "\n".join(nltk.sent_tokenize(x))
|
||||
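A short usage sketch (requires `nltk` with the `punkt` data downloaded above; the sample text is arbitrary):

```python
print(add_newline_to_end_of_each_sentence("The first sentence. The second sentence."))
# The first sentence.
# The second sentence.
```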
@ -1,248 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import torch
|
||||
from torch import nn
|
||||
from torch.utils.data import DistributedSampler, RandomSampler
|
||||
|
||||
from transformers import PreTrainedModel, Trainer, logging
|
||||
from transformers.models.fsmt.configuration_fsmt import FSMTConfig
|
||||
from transformers.optimization import (
|
||||
Adafactor,
|
||||
get_constant_schedule,
|
||||
get_constant_schedule_with_warmup,
|
||||
get_cosine_schedule_with_warmup,
|
||||
get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
get_linear_schedule_with_warmup,
|
||||
get_polynomial_decay_schedule_with_warmup,
|
||||
)
|
||||
from transformers.trainer_pt_utils import get_tpu_sampler
|
||||
from transformers.training_args import ParallelMode
|
||||
from transformers.utils import is_torch_xla_available
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
arg_to_scheduler = {
|
||||
"linear": get_linear_schedule_with_warmup,
|
||||
"cosine": get_cosine_schedule_with_warmup,
|
||||
"cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
|
||||
"polynomial": get_polynomial_decay_schedule_with_warmup,
|
||||
"constant": get_constant_schedule,
|
||||
"constant_w_warmup": get_constant_schedule_with_warmup,
|
||||
}
|
||||
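A minimal standalone sketch of how one of these scheduler factories is used; the tiny stand-in model and the step counts are arbitrary, and `_get_lr_scheduler` below does the same wiring from `self.args`:

```python
import torch
from torch import nn

model = nn.Linear(4, 4)  # stand-in model, just to have parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

schedule_func = arg_to_scheduler["cosine"]
scheduler = schedule_func(optimizer, num_warmup_steps=10, num_training_steps=100)

for _ in range(100):
    optimizer.step()
    scheduler.step()  # learning rate warms up for 10 steps, then follows a cosine decay
```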
|
||||
|
||||
class Seq2SeqTrainer(Trainer):
|
||||
def __init__(self, config=None, data_args=None, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
if config is None:
|
||||
assert isinstance(self.model, PreTrainedModel), (
|
||||
"If no `config` is passed the model to be trained has to be of type `PreTrainedModel`, but is"
|
||||
f" {self.model.__class__}"
|
||||
)
|
||||
self.config = self.model.config
|
||||
else:
|
||||
self.config = config
|
||||
|
||||
self.data_args = data_args
|
||||
self.vocab_size = self.config.tgt_vocab_size if isinstance(self.config, FSMTConfig) else self.config.vocab_size
|
||||
|
||||
if self.args.label_smoothing != 0 or (self.data_args is not None and self.data_args.ignore_pad_token_for_loss):
|
||||
assert self.config.pad_token_id is not None, (
|
||||
"Make sure that `config.pad_token_id` is correctly defined when ignoring `pad_token` for loss"
|
||||
" calculation or doing label smoothing."
|
||||
)
|
||||
|
||||
if self.config.pad_token_id is None and self.config.eos_token_id is not None:
|
||||
logger.warning(
|
||||
f"The `config.pad_token_id` is `None`. Using `config.eos_token_id` = {self.config.eos_token_id} for"
|
||||
" padding.."
|
||||
)
|
||||
|
||||
if self.args.label_smoothing == 0:
|
||||
self.loss_fn = torch.nn.CrossEntropyLoss(ignore_index=self.config.pad_token_id)
|
||||
else:
|
||||
# dynamically import label_smoothed_nll_loss
|
||||
from utils import label_smoothed_nll_loss
|
||||
|
||||
self.loss_fn = label_smoothed_nll_loss
|
||||
|
||||
def create_optimizer_and_scheduler(self, num_training_steps: int):
|
||||
"""
|
||||
Setup the optimizer and the learning rate scheduler.
|
||||
|
||||
We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
|
||||
Trainer's init through :obj:`optimizers`, or override this method in a subclass.
|
||||
"""
|
||||
if self.optimizer is None:
|
||||
no_decay = ["bias", "LayerNorm.weight"]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
|
||||
"weight_decay": self.args.weight_decay,
|
||||
},
|
||||
{
|
||||
"params": [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)],
|
||||
"weight_decay": 0.0,
|
||||
},
|
||||
]
|
||||
if self.args.adafactor:
|
||||
optimizer_cls = Adafactor
|
||||
optimizer_kwargs = {"scale_parameter": False, "relative_step": False}
|
||||
else:
|
||||
optimizer_cls = torch.optim.AdamW
|
||||
optimizer_kwargs = {
|
||||
"betas": (self.args.adam_beta1, self.args.adam_beta2),
|
||||
"eps": self.args.adam_epsilon,
|
||||
}
|
||||
optimizer_kwargs["lr"] = self.args.learning_rate
|
||||
self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
|
||||
|
||||
if self.lr_scheduler is None:
|
||||
self.lr_scheduler = self._get_lr_scheduler(num_training_steps)
|
||||
else: # ignoring --lr_scheduler
|
||||
logger.warning("scheduler is passed to `Seq2SeqTrainer`, `--lr_scheduler` arg is ignored.")
|
||||
|
||||
def _get_lr_scheduler(self, num_training_steps):
|
||||
schedule_func = arg_to_scheduler[self.args.lr_scheduler]
|
||||
if self.args.lr_scheduler == "constant":
|
||||
scheduler = schedule_func(self.optimizer)
|
||||
elif self.args.lr_scheduler == "constant_w_warmup":
|
||||
scheduler = schedule_func(self.optimizer, num_warmup_steps=self.args.warmup_steps)
|
||||
else:
|
||||
scheduler = schedule_func(
|
||||
self.optimizer, num_warmup_steps=self.args.warmup_steps, num_training_steps=num_training_steps
|
||||
)
|
||||
return scheduler
|
||||
|
||||
def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
|
||||
if isinstance(self.train_dataset, torch.utils.data.IterableDataset):
|
||||
return None
|
||||
elif is_torch_xla_available():
|
||||
return get_tpu_sampler(self.train_dataset)
|
||||
else:
|
||||
if self.args.sortish_sampler:
|
||||
self.train_dataset.make_sortish_sampler(
|
||||
self.args.per_device_train_batch_size,
|
||||
distributed=(self.args.parallel_mode == ParallelMode.DISTRIBUTED),
|
||||
)
|
||||
|
||||
return (
|
||||
RandomSampler(self.train_dataset)
|
||||
if self.args.local_process_index == -1
|
||||
else DistributedSampler(self.train_dataset)
|
||||
)
|
||||
|
||||
def _compute_loss(self, model, inputs, labels):
|
||||
if self.args.label_smoothing == 0:
|
||||
if self.data_args is not None and self.data_args.ignore_pad_token_for_loss:
|
||||
# force training to ignore pad token
|
||||
logits = model(**inputs, use_cache=False)[0]
|
||||
loss = self.loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
|
||||
else:
|
||||
# compute usual loss via models
|
||||
loss, logits = model(**inputs, labels=labels, use_cache=False)[:2]
|
||||
else:
|
||||
# compute label smoothed loss
|
||||
logits = model(**inputs, use_cache=False)[0]
|
||||
lprobs = torch.nn.functional.log_softmax(logits, dim=-1)
|
||||
loss, _ = self.loss_fn(lprobs, labels, self.args.label_smoothing, ignore_index=self.config.pad_token_id)
|
||||
return loss, logits
|
||||
|
||||
def compute_loss(self, model, inputs):
|
||||
labels = inputs.pop("labels")
|
||||
loss, _ = self._compute_loss(model, inputs, labels)
|
||||
return loss
|
||||
|
||||
def prediction_step(
|
||||
self,
|
||||
model: nn.Module,
|
||||
inputs: dict[str, Union[torch.Tensor, Any]],
|
||||
prediction_loss_only: bool,
|
||||
ignore_keys: Optional[list[str]] = None,
|
||||
) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
"""
|
||||
Perform an evaluation step on :obj:`model` using :obj:`inputs`.
|
||||
|
||||
Subclass and override to inject custom behavior.
|
||||
|
||||
Args:
|
||||
model (:obj:`nn.Module`):
|
||||
The model to evaluate.
|
||||
inputs (:obj:`dict[str, Union[torch.Tensor, Any]]`):
|
||||
The inputs and targets of the model.
|
||||
|
||||
The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
|
||||
argument :obj:`labels`. Check your model's documentation for all accepted arguments.
|
||||
prediction_loss_only (:obj:`bool`):
|
||||
Whether or not to return the loss only.
|
||||
|
||||
Return:
|
||||
tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]:
|
||||
A tuple with the loss, logits and labels (each being optional).
|
||||
"""
|
||||
inputs = self._prepare_inputs(inputs)
|
||||
|
||||
gen_kwargs = {
|
||||
"max_length": self.data_args.val_max_target_length
|
||||
if self.data_args is not None
|
||||
else self.config.max_length,
|
||||
"num_beams": self.data_args.eval_beams if self.data_args is not None else self.config.num_beams,
|
||||
}
|
||||
|
||||
if self.args.predict_with_generate and not self.args.prediction_loss_only:
|
||||
generated_tokens = self.model.generate(
|
||||
inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
**gen_kwargs,
|
||||
)
|
||||
# in case the batch is shorter than max length, the output should be padded
|
||||
if generated_tokens.shape[-1] < gen_kwargs["max_length"]:
|
||||
generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_kwargs["max_length"])
|
||||
|
||||
labels = inputs.pop("labels")
|
||||
with torch.no_grad():
|
||||
# compute loss on predict data
|
||||
loss, logits = self._compute_loss(model, inputs, labels)
|
||||
|
||||
loss = loss.mean().detach()
|
||||
if self.args.prediction_loss_only:
|
||||
return (loss, None, None)
|
||||
|
||||
logits = generated_tokens if self.args.predict_with_generate else logits
|
||||
|
||||
if labels.shape[-1] < gen_kwargs["max_length"]:
|
||||
labels = self._pad_tensors_to_max_len(labels, gen_kwargs["max_length"])
|
||||
|
||||
return (loss, logits, labels)
|
||||
|
||||
def _pad_tensors_to_max_len(self, tensor, max_length):
|
||||
# If PAD token is not defined at least EOS token has to be defined
|
||||
pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else self.config.eos_token_id
|
||||
|
||||
if pad_token_id is None:
|
||||
raise ValueError(
|
||||
"Make sure that either `config.pad_token_id` or `config.eos_token_id` is defined if tensor has to be"
|
||||
f" padded to `max_length`={max_length}"
|
||||
)
|
||||
|
||||
padded_tensor = pad_token_id * torch.ones(
|
||||
(tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device
|
||||
)
|
||||
padded_tensor[:, : tensor.shape[-1]] = tensor
|
||||
return padded_tensor
|
||||
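The padding logic in `_pad_tensors_to_max_len`, traced on a toy batch; the pad token id 0 is arbitrary here, whereas in the trainer it comes from `config.pad_token_id` (or `config.eos_token_id` as a fallback):

```python
import torch

pad_token_id = 0
tensor = torch.tensor([[5, 6, 7], [8, 9, 10]])
max_length = 5

padded = pad_token_id * torch.ones((tensor.shape[0], max_length), dtype=tensor.dtype)
padded[:, : tensor.shape[-1]] = tensor
# padded == tensor([[ 5,  6,  7,  0,  0],
#                   [ 8,  9, 10,  0,  0]])
```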
@ -1,60 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from seq2seq_trainer import arg_to_scheduler
|
||||
|
||||
from transformers import TrainingArguments
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class Seq2SeqTrainingArguments(TrainingArguments):
|
||||
"""
|
||||
Parameters:
|
||||
label_smoothing (:obj:`float`, `optional`, defaults to 0):
|
||||
The label smoothing epsilon to apply (if not zero).
|
||||
sortish_sampler (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to use SortishSampler or not. It sorts the inputs according to length in order to minimize the padding size.
|
||||
predict_with_generate (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
Whether to use generate to calculate generative metrics (ROUGE, BLEU).
|
||||
"""
|
||||
|
||||
label_smoothing: Optional[float] = field(
|
||||
default=0.0, metadata={"help": "The label smoothing epsilon to apply (if not zero)."}
|
||||
)
|
||||
sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."})
|
||||
predict_with_generate: bool = field(
|
||||
default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."}
|
||||
)
|
||||
adafactor: bool = field(default=False, metadata={"help": "whether to use adafactor"})
|
||||
encoder_layerdrop: Optional[float] = field(
|
||||
default=None, metadata={"help": "Encoder layer dropout probability. Goes into model.config."}
|
||||
)
|
||||
decoder_layerdrop: Optional[float] = field(
|
||||
default=None, metadata={"help": "Decoder layer dropout probability. Goes into model.config."}
|
||||
)
|
||||
dropout: Optional[float] = field(default=None, metadata={"help": "Dropout probability. Goes into model.config."})
|
||||
attention_dropout: Optional[float] = field(
|
||||
default=None, metadata={"help": "Attention dropout probability. Goes into model.config."}
|
||||
)
|
||||
lr_scheduler: Optional[str] = field(
|
||||
default="linear",
|
||||
metadata={"help": f"Which lr scheduler to use. Selected in {sorted(arg_to_scheduler.keys())}"},
|
||||
)
|
||||
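These arguments are normally filled like any other `TrainingArguments`, e.g. via `HfArgumentParser`. A minimal sketch; the program name and flag values are arbitrary examples, not defaults of this class:

```python
import sys

from transformers import HfArgumentParser

parser = HfArgumentParser(Seq2SeqTrainingArguments)
sys.argv = [
    "finetune_trainer.py",  # placeholder program name
    "--output_dir", "out",
    "--label_smoothing", "0.1",
    "--sortish_sampler",
    "--predict_with_generate",
    "--lr_scheduler", "cosine_w_warmup",
]
(training_args,) = parser.parse_args_into_dataclasses()
# training_args.label_smoothing == 0.1, training_args.lr_scheduler == "cosine_w_warmup"
```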
@ -1,32 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
|
||||
|
||||
pairs = [
|
||||
["en", "ru"],
|
||||
["ru", "en"],
|
||||
["en", "de"],
|
||||
["de", "en"],
|
||||
]
|
||||
|
||||
n_objs = 8
|
||||
|
||||
|
||||
def get_all_data(pairs, n_objs):
|
||||
text = {}
|
||||
for src, tgt in pairs:
|
||||
pair = f"{src}-{tgt}"
|
||||
cmd = f"sacrebleu -t wmt19 -l {pair} --echo src".split()
|
||||
src_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
|
||||
cmd = f"sacrebleu -t wmt19 -l {pair} --echo ref".split()
|
||||
tgt_lines = subprocess.run(cmd, stdout=subprocess.PIPE).stdout.decode("utf-8").splitlines()
|
||||
text[pair] = {"src": src_lines[:n_objs], "tgt": tgt_lines[:n_objs]}
|
||||
return text
|
||||
|
||||
|
||||
text = get_all_data(pairs, n_objs)
|
||||
filename = "./fsmt_val_data.json"
|
||||
with open(filename, "w", encoding="utf-8") as f:
|
||||
json.dump(text, f, indent=2, ensure_ascii=False)
|
||||
@ -1,90 +0,0 @@
|
||||
{
|
||||
"en-ru": {
|
||||
"src": [
|
||||
"Welsh AMs worried about 'looking like muppets'",
|
||||
"There is consternation among some AMs at a suggestion their title should change to MWPs (Member of the Welsh Parliament).",
|
||||
"It has arisen because of plans to change the name of the assembly to the Welsh Parliament.",
|
||||
"AMs across the political spectrum are worried it could invite ridicule.",
|
||||
"One Labour AM said his group was concerned \"it rhymes with Twp and Pwp.\"",
|
||||
"For readers outside of Wales: In Welsh twp means daft and pwp means poo.",
|
||||
"A Plaid AM said the group as a whole was \"not happy\" and has suggested alternatives.",
|
||||
"A Welsh Conservative said his group was \"open minded\" about the name change, but noted it was a short verbal hop from MWP to Muppet."
|
||||
],
|
||||
"tgt": [
|
||||
"Члены Национальной ассамблеи Уэльса обеспокоены, что \"выглядят как куклы\"",
|
||||
"Некоторые члены Национальной ассамблеи Уэльса в ужасе от предложения о том, что их наименование должно измениться на MPW (члены Парламента Уэльса).",
|
||||
"Этот вопрос был поднят в связи с планами по переименованию ассамблеи в Парламент Уэльса.",
|
||||
"Члены Национальной ассамблеи Уэльса всего политического спектра обеспокоены, что это может породить насмешки.",
|
||||
"Один из лейбористских членов Национальной ассамблеи Уэльса сказал, что его партия обеспокоена тем, что \"это рифмуется с Twp и Pwp\".",
|
||||
"Для читателей за предлами Уэльса: по-валлийски twp означает \"глупый\", а pwp означает \"какашка\".",
|
||||
"Член Национальной ассамблеи от Плайд сказал, что эта партия в целом \"не счастлива\" и предложил альтернативы.",
|
||||
"Представитель Консервативной партии Уэльса сказал, что его партия \"открыта\" к переименованию, но отметил, что между WMP и Muppet небольшая разница в произношении."
|
||||
]
|
||||
},
|
||||
"ru-en": {
|
||||
"src": [
|
||||
"Названо число готовящихся к отправке в Донбасс новобранцев из Украины",
|
||||
"Официальный представитель Народной милиции самопровозглашенной Луганской Народной Республики (ЛНР) Андрей Марочко заявил, что зимой 2018-2019 года Украина направит в Донбасс не менее 3 тыс. новобранцев.",
|
||||
"По его словам, таким образом Киев планирует \"хоть как-то доукомплектовать подразделения\".",
|
||||
"\"Нежелание граждан Украины проходить службу в рядах ВС Украины, массовые увольнения привели к низкой укомплектованности подразделений\", - рассказал Марочко, которого цитирует \"РИА Новости\".",
|
||||
"Он также не исключил, что реальные цифры призванных в армию украинцев могут быть увеличены в случае необходимости.",
|
||||
"В 2014-2017 годах Киев начал так называемую антитеррористическую операцию (АТО), которую позже сменили на операцию объединенных сил (ООС).",
|
||||
"Предполагалось, что эта мера приведет к усилению роли украинских силовиков в урегулировании ситуации.",
|
||||
"В конце августа 2018 года ситуация в Донбассе обострилась из-за убийства главы ДНР Александра Захарченко."
|
||||
],
|
||||
"tgt": [
|
||||
"The number of new Ukrainian recruits ready to go to Donbass has become public",
|
||||
"Official representative of the peoples’ militia of the self-proclaimed Lugansk People’s Republic Andrey Marochko claimed that Ukrainian will send at least 3 thousand new recruits to Donbass in winter 2018-2019.",
|
||||
"This is how Kyiv tries “at least somehow to staff the units,” he said.",
|
||||
"“The unwillingness of Ukrainian citizens to serve in the Ukraine’s military forces, mass resignments lead to low understaffing,” said Marochko cited by RIA Novosti.",
|
||||
"Also, he doesn’t exclude that the real numbers of conscripts in the Ukrainian army can be raised is necessary.",
|
||||
"In 2014-2017, Kyiv started so-called antiterrorist operation, that ws later changed to the united forces operation.",
|
||||
"This measure was supposed to strengthen the role of the Ukrainian military in settling the situation.",
|
||||
"In the late August 2018, the situation in Donbass escalated as the DNR head Aleksandr Zakharchenko was killed."
|
||||
]
|
||||
},
|
||||
"en-de": {
|
||||
"src": [
|
||||
"Welsh AMs worried about 'looking like muppets'",
|
||||
"There is consternation among some AMs at a suggestion their title should change to MWPs (Member of the Welsh Parliament).",
|
||||
"It has arisen because of plans to change the name of the assembly to the Welsh Parliament.",
|
||||
"AMs across the political spectrum are worried it could invite ridicule.",
|
||||
"One Labour AM said his group was concerned \"it rhymes with Twp and Pwp.\"",
|
||||
"For readers outside of Wales: In Welsh twp means daft and pwp means poo.",
|
||||
"A Plaid AM said the group as a whole was \"not happy\" and has suggested alternatives.",
|
||||
"A Welsh Conservative said his group was \"open minded\" about the name change, but noted it was a short verbal hop from MWP to Muppet."
|
||||
],
|
||||
"tgt": [
|
||||
"Walisische Ageordnete sorgen sich \"wie Dödel auszusehen\"",
|
||||
"Es herrscht Bestürzung unter einigen Mitgliedern der Versammlung über einen Vorschlag, der ihren Titel zu MWPs (Mitglied der walisischen Parlament) ändern soll.",
|
||||
"Der Grund dafür waren Pläne, den Namen der Nationalversammlung in Walisisches Parlament zu ändern.",
|
||||
"Mitglieder aller Parteien der Nationalversammlung haben Bedenken, dass sie sich dadurch Spott aussetzen könnten.",
|
||||
"Ein Labour-Abgeordneter sagte, dass seine Gruppe \"sich mit Twp und Pwp reimt\".",
|
||||
"Hinweis für den Leser: „twp“ im Walisischen bedeutet „bescheuert“ und „pwp“ bedeutet „Kacke“.",
|
||||
"Ein Versammlungsmitglied von Plaid Cymru sagte, die Gruppe als Ganzes sei \"nicht glücklich\" und hat Alternativen vorgeschlagen.",
|
||||
"Ein walisischer Konservativer sagte, seine Gruppe wäre „offen“ für eine Namensänderung, wies aber darauf hin, dass es von „MWP“ (Mitglied des Walisischen Parlaments) nur ein kurzer verbaler Sprung zu „Muppet“ ist."
|
||||
]
|
||||
},
|
||||
"de-en": {
|
||||
"src": [
|
||||
"Schöne Münchnerin 2018: Schöne Münchnerin 2018 in Hvar: Neun Dates",
|
||||
"Von az, aktualisiert am 04.05.2018 um 11:11",
|
||||
"Ja, sie will...",
|
||||
"\"Schöne Münchnerin\" 2018 werden!",
|
||||
"Am Nachmittag wartet erneut eine Überraschung auf unsere Kandidatinnen: sie werden das romantische Candlelight-Shooting vor der MY SOLARIS nicht alleine bestreiten, sondern an der Seite von Male-Model Fabian!",
|
||||
"Hvar - Flirten, kokettieren, verführen - keine einfachen Aufgaben für unsere Mädchen.",
|
||||
"Insbesondere dann, wenn in Deutschland ein Freund wartet.",
|
||||
"Dennoch liefern die neun \"Schöne Münchnerin\"-Kandidatinnen beim Shooting mit People-Fotograf Tuan ab und trotzen Wind, Gischt und Regen wie echte Profis."
|
||||
],
|
||||
"tgt": [
|
||||
"The Beauty of Munich 2018: the Beauty of Munich 2018 in Hvar: Nine dates",
|
||||
"From A-Z, updated on 04/05/2018 at 11:11",
|
||||
"Yes, she wants to...",
|
||||
"to become \"The Beauty of Munich\" in 2018!",
|
||||
"In the afternoon there is another surprise waiting for our contestants: they will be competing for the romantic candlelight photo shoot at MY SOLARIS not alone, but together with a male-model Fabian!",
|
||||
"Hvar with its flirting, coquetting, and seduction is not an easy task for our girls.",
|
||||
"Especially when there is a boyfriend waiting in Germany.",
|
||||
"Despite dealing with wind, sprays and rain, the nine contestants of \"The Beauty of Munich\" behaved like real professionals at the photo shoot with People-photographer Tuan."
|
||||
]
|
||||
}
|
||||
}
|
||||
@ -1,20 +0,0 @@
|
||||
UN Chief Says There Is No Military Solution in Syria Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people. The U.N. chief again urged all parties, including the divided U.N. Security Council, to unite and support inclusive negotiations to find a political solution. Ban told a news conference Wednesday that he plans to meet with foreign ministers of the five permanent council nations - the U.S., Russia, China, Britain and France - on the sidelines of the General Assembly's ministerial session later this month to discuss Syria.
|
||||
He expressed regret that divisions in the council and among the Syrian people and regional powers "made this situation unsolvable." Ban urged the five permanent members to show the solidarity and unity they did in achieving an Iran nuclear deal in addressing the Syria crisis. 8 Poll Numbers That Show Donald Trump Is For Real Some have tried to label him a flip-flopper. Others have dismissed him as a joke. And some are holding out for an implosion. But no matter how some Republicans are trying to drag Donald Trump down from atop the polls, it hasn't worked (yet).
|
||||
Ten of the last 11 national polls have shown Donald Trump's lead at double digits, and some are starting to ask seriously what it means for the real estate mogul's nomination chances. Of course, it's still early in the election cycle. None of this is to say that Trump is likely to win the Republican nomination. Pundits point out that at this time in 2011, Rick Perry's lead was giving way to a rising Herman Cain, neither of whom won even one state in the nomination process. And there are many reasons he would struggle in a general election. But outside groups like Jeb Bush's Super PAC and the economic conservative group Club for Growth are recognizing Trump's staying power and beginning to unload their dollars to topple him.
|
||||
Here are some recent poll numbers that suggest that the real estate mogul isn't just a passing phase: Trump's favorability ratings have turned 180 degrees. Right before Donald Trump announced his candidacy in mid-June, a Monmouth University poll showed only two in 10 Republicans had a positive view of the real estate mogul. By mid-July, it was 40 percent. In early August, it was 52 percent. Now, six in 10 Republicans have a favorable view of Donald Trump. Roughly three in 10 say they have a negative view. And these numbers hold up in early states. A Quinnipiac poll in Iowa last week found that 60 percent of Republicans there had a favorable view of Trump.
|
||||
Two-thirds of GOP voters would be happy with Trump as the nominee. In a CNN/ORC poll last week, 67 percent of Republicans said they would be either "enthusiastic" or "satisfied" if Trump were the nominee. Only two in 10 say they would be "upset" if he were the nominee. Only Ben Carson generates roughly the same level of enthusiasm as Trump (43 percent say they would be "enthusiastic" vs. 40 percent who say the same of Trump). The next closest in enthusiasm? Marco Rubio with only 21 percent.
|
||||
On the flip side, 47 percent of Republican voters say they would be "dissatisfied" or "upset" if establishment favorite Jeb Bush becomes the nominee. A majority of Republicans don't see Trump's temperament as a problem. While Donald Trump has been widely criticized for his bombast and insults, 52 percent of leaned Republican voters nationwide think that the real estate mogul has the right temperament to be president, according to Monday's ABC News/Washington Post poll. The same number holds in the first-in-the-nation caucus state of Iowa, where the same 52 percent of Republicans think he has the personality to be commander in chief, according to Quinnipiac last week.
|
||||
Still, 44 percent think he doesn't have the personality to serve effectively, and almost six in 10 independents say his temperament does not belong in the White House, according to ABC/Post. Republican voters are getting used to the idea. When they put on their pundit hats, Republican voters think Trump is for real. When asked who is most likely to win the GOP nomination, four in 10 said Trump was the best bet, according to a CNN/ORC poll out last week. That's a change from when four in 10 placed their money on Jeb Bush in late July. Full disclosure: GOP voters haven't had the clearest crystal ball in the past.
|
||||
At this time last cycle, four in 10 Republicans picked Rick Perry to win the nomination, vs. only 28 percent for eventual nominee Mitt Romney. Still, it shows that a plurality of GOP voters see Trump's campaign as plausible. Even if Republicans rallied around another candidate, Trump still beats almost everyone. Some pundits point out that the splintered field is likely contributing to Trump's lead, while anti-Trump support is be spread diffusely among more than a dozen other candidates. But a Monmouth University poll in early September shows that, in a hypothetical head-to-head matchup between Trump and most other Republican candidates, Trump almost always garners majority support.
|
||||
He leads Carly Fiorina by 13 points, Marco Rubio by 14 points, Walker by 15 points, Jeb Bush by 19 points, and, finally, Rand Paul, John Kasich and Chris Christie by 33 points each. He's in a dead heat with Ted Cruz. The only candidate who beats him? Ben Carson would lead the businessman by a wide 19 points in a hypothetical head-to-head. A bare majority of Donald Trump's supporters say they've made up their minds. A new CBS/NYT poll out on Tuesday shows that just more than half of voters who support Trump say they have locked in their votes. Obviously, a lot can happen to change that, and no one can really say they would never change their mind.
|
||||
46 percent said they are leaving the door open to switching candidates. Still, Trump's strongest competition at the moment is from fellow outsider neurosurgeon Ben Carson, but voters who say they have made up their minds are twice as likely to go for Trump. Six in 10 Republicans say they agree with Trump on immigration. Even since Donald Trump called immigrants from Mexico "rapists" in his campaign announcement speech two months ago, immigration has been front and center in the 2016 conversation. Some are worried that Trump's bombast will drive crucial Hispanic voters away from the Republican Party and damage rebranding efforts.
|
||||
But according to Monday's new ABC/Post poll, six in 10 Republicans say they agree with Trump on immigration issues. So as long as immigration remains in the spotlight, it seems Donald Trump will remain too. Frustration with government is climbing to new highs. Donald Trump and Ben Carson now account for roughly half of the support from Republican voters, largely due to their outsider status. Six in 10 Republicans in Monday's new ABC/Post poll say they want a political outsider over someone with government experience. And they are angry at Washington, too.
|
||||
A Des Moines Register/Bloomberg poll in Iowa from two weeks ago shows that three in four Iowa Republicans are frustrated with Republicans in Congress, with 54 percent "unsatisfied" and 21 percent "mad as hell." Jeremy Corbyn to make debut at Prime Minister's Questions Since his election, Mr Corbyn's debut at PMQs has been keenly awaited New Labour leader Jeremy Corbyn is to make his debut at Prime Minister's Questions later, taking on David Cameron for the first time.
|
||||
Mr Corbyn will rise to ask the first of his six allotted questions shortly after midday, with his performance likely to be closely scrutinised by the media and Labour MPs. He has called for "less theatre and more facts" at the weekly showpiece. He has also said he could skip some sessions, leaving them to colleagues. The encounter will be the first parliamentary test of Mr Corbyn's leadership, coming after his appointment of a shadow cabinet and his speech to the TUC annual congress on Tuesday.
|
||||
Meanwhile, the Labour leader's decision to stand in silence during the singing of the national anthem at a service on Tuesday to mark the 75th anniversary of the Battle of Britain has attracted criticism from a number of Tory MPs and is the focus of several front page stories in the newspapers. Mr Corbyn's decision not to sing the national anthem has attracted attention A spokesman for Mr Corbyn said he had "stood in respectful silence" and did recognise the "heroism of the Royal Air Force in the Battle of Britain."
|
||||
But a member of Mr Corbyn's shadow cabinet, Owen Smith, told BBC Two's Newsnight programme he would have advised the Labour leader to sing the national anthem "irrespective" of his belief that the monarchy should be abolished. Nearly a dozen shadow ministers have refused to serve in Mr Corbyn's top team, citing differences over the economy, defence and foreign affairs, while less than a sixth of the parliamentary party originally backed him as leader. BBC political correspondent Robin Brant says policy differences are also "stacking up" within Labour following Mr Corbyn's appointment over its position on the European Union and the government's cap on benefits.
|
||||
Mr Corbyn told the TUC conference Labour was putting forward amendments to remove the whole idea of a cap altogether. Hours later Mr Smith, the shadow work and pensions secretary, said the party was "very clear" that it was only opposing government plans to reduce the level of cap from £26,000 to £23,000. Mr Corbyn will be the fifth Labour leader that David Cameron has faced across the despatch box over the past decade since he became Tory leader. The Labour leader, who has promised a different approach to politics, says he has "crowd sourced" ideas for questions to ask Mr Cameron and has been given more than 30,000 suggestions.
|
||||
The Islington North MP has said PMQs is too confrontational and that he will refrain from both "repartee" and trading barbs, instead vowing to focus on serious issues such as poverty, inequality and the challenges facing young people. Mr Corbyn has said that Angela Eagle, the shadow business secretary, will deputise for him at PMQs when he does not attend - for instance when Mr Cameron is travelling abroad. He has also floated the idea of allowing other colleagues to take the floor on occasion, saying he had approached the Commons Speaker John Bercow to discuss the issue.
|
||||
When he became leader in 2005, Mr Cameron said he wanted to move away from the "Punch and Judy" style of politics often associated with PMQs but admitted some years later that he had failed. Since it was first televised in 1990, PMQs has been seen as a key barometer of a leader's judgement, their command of the Commons and their standing among their fellow MPs although critics have argued it has become a caricature and is in need of far-reaching reforms. 'Shot in Joburg': Homeless youth trained as photographers Downtown Johannesburg is a tough place to be homeless.
|
||||
But one group of former street children have found a way to learn a skill and make a living. "I was shot in Joburg" is a non-profit studio that teaches homeless youngsters how to take photographs of their neighbourhood and make a profit from it. BBC News went to meet one of the project's first graduates. JD Sports boss says higher wages could hurt expansion JD Sports Executive Chairman Peter Cowgill says a higher minimum wage for UK workers could mean "more spending power in the pockets of potential consumers." But that spending power is unlikely to outweigh the higher labour costs at his firm, he says.
|
||||
The costs could hit JD Sports' expansion plans, he added, which could mean fewer extra jobs. Thanasi Kokkinakis backed by Tennis Australia president Steve Healy Thanasi Kokkinakis deserves kudos rather than criticism for his behaviour. Thanasi Kokkinakis has been the collateral damage in the recent storm around his friend Nick Kyrgios and deserves kudos rather than criticism for his own behaviour, according to Tennis Australia president Steve Healy.
|
||||
@ -1,20 +0,0 @@
|
||||
Șeful ONU declară că nu există soluții militare în Siria Secretarul General Ban Ki-moon afirmă că răspunsul său la suportul militar al Rusiei pentru Siria este că „nu există o soluție militară” la conflictul care durează de aproape cinci ani iar mai multe arme nu ar face decât să agraveze violența și suferința a milioane de oameni. Șeful ONU a solicitat din nou tuturor părților, inclusiv Consiliului de securitate ONU divizat să se unifice și să susțină negocierile pentru a găsi o soluție politică. Ban a declarat miercuri în cadrul unei conferințe că intenționează să se întâlnească luna aceasta cu miniștrii de externe din cinci țări permanent prezente în consiliu - SUA, Rusia, China, Anglia și Franța - pe marginea sesiunii ministeriale a Adunării Generale pentru a discuta despre Siria.
|
||||
Ban și-a exprimat regretul că divizările în consiliu și între poporul sirian și puterile regionale „au făcut această situație de nerezolvat”. Ban le-a cerut celor cinci membri permanenți să dea dovadă de solidaritatea și unitatea arătate atunci când au reușit să încheie un acord referitor la armele nucleare ale Iranului, abordând astfel criza din Siria. 8 cifre din sondaje care arată că Donald Trump are șanse reale Unii au încercat să îl eticheteze ca politician „flip-flop”. Alții l-au numit o glumă. Iar alții așteaptă implozia. Însă indiferent de modul în care unii republicani încearcă să îl dărâme pe Donald Trump din vârful sondajelor, nu a funcționat (încă).
|
||||
Zece din ultimele 11 sondaje naționale au arătat că Donald Trump conduce cu un procent din două cifre iar unele voci încep să se întrebe serios ce înseamnă acest lucru pentru șansele de numire ale mogulului imobiliar. Desigur, este încă prematur. Nimic din toate acestea nu spune că Trump va câștiga cursa pentru nominalizarea republicanilor. Pundits arată că, în aceeași perioadă a anului 2011, avansul lui Rick Perry îi făcea loc lui Herman Cain în sondaje, dar niciunul dintre ei nu a câștigat în vreun stat în cursa de nominalizare. Iar motivele pentru care s-ar lupta din greu la alegerile generale sunt numeroase. Însă grupurile din exterior precum Super PAC al lui Jeb Bush și grupul conservator economic Club for Growth admit puterea lui Trump și încep să îl susțină cu bani.
|
||||
În continuare vă prezentăm câteva cifre din sondaje recente care sugerează că mogulul imobiliar nu este doar ceva trecător: Cifrele care indică susținerea față de Trump s-au întors la 180 grade. Chiar înainte ca Donald Trump să își anunțe candidatura, la mijlocul lui iunie, un sondaj realizat de Universitatea din Monmouth arăta că doar doi din 10 republicani aveau o părere pozitivă despre mogulul imobiliar. Până la mijlocul lui iulie, procentul a urcat la 40%. La începutul lui august, era 52%. În prezent, șase din 10 republicani au o părere favorabilă despre Donald Trump. Aproximativ trei din 10 declară că au o părere negativă. Aceste cifre se mențin. Un sondaj realizat săptămâna trecută de Quinnipiac în Iowa a concluzionat că 60% dintre republicanii din regiune au o părere favorabilă despre Trump.
|
||||
Două treimi dintre alegătorii GOP ar fi fericiți dacă Trump ar câștiga cursa pentru nominalizare. Într-un sondaj realizat săptămâna trecută de CNN/ORC, 67% dintre republicani au declarat că ar fi „entuziasmați” sau „mulțumiți” dacă Trump ar câștiga cursa pentru nominalizare. Doar doi din 10 declară că ar fi „supărați” dacă Trump ar câștiga cursa pentru nominalizare. Doar Ben Carson generează aproximativ același nivel de entuziasm ca Trump (43% declară că ar fi „entuziasmați” față de 40% care declară același lucru despre Trump). Cel mai aproape în ceea ce privește entuziasmul? Marco Rubio, cu doar 21%.
|
||||
De partea cealaltă, 47% dintre alegătorii republicani afirmă că ar fi „nemulțumiți” sau „supărați” dacă favoritul Jeb Bush câștigă cursa pentru nominalizare. Majoritatea republicanilor nu consideră temperamentul lui Trump o problemă. Deși Donald Trump a fost puternic criticat pentru insultele aduse și stilul său bombastic, 52% dintre alegătorii republicani la nivel național consideră că mogulul imobiliar are temperamentul potrivit pentru a fi președinte, conform sondajului realizat luni de ABC News/Washington Post. Regăsim aceleași cifre în statul Iowa, unde tot 52% dintre republicani cred că Trump are personalitatea potrivită pentru a fi conducător, conform sondajului realizat săptămâna trecută de Quinnipiac.
|
||||
Totuși, 44% sunt de părere că nu are personalitatea necesară pentru a acționa eficient și aproape șase din 10 independenți afirmă că temperamentul său nu are ce căuta la Casa Albă, conform ABC/Post. Alegătorii republicani se obișnuiesc cu ideea. Atunci când iau atitudinea de intelectuali, alegătorii republicani consideră că Trump este autentic. Conform unui sondaj realizat săptămâna trecută de CNN/ORC, la întrebarea cine are cele mai multe șanse să câștige cursa pentru nominalizare GOP, patru din 10 au declarat că Trump. Situația s-a schimbat față de finalul lui iulie, când patru din 10 ar fi pariat pe Jeb Bush. Informare completă: în trecut, alegătorii GOP nu au citit foarte bine viitorul.
|
||||
În aceeași perioadă a ultimelor alegeri, patru din 10 republicani l-au ales pe Rick Perry în cursa pentru nominalizare, față de doar 28% pentru Mitt Romney. Însă, aceste cifre arată că majoritatea alegătorilor GOP consideră plauzibilă campania lui Trump. Chiar dacă republicanii sau repliat spre un alt candidat. Trump încă se află în fruntea tuturor. Unele voci spun că situația divizată va contribui probabil la victoria lui Trump, în timp ce susținerea contra lui Trump se va împărți la mai mult de doisprezece candidați. Însă un sondaj derulat la începutul lui septembrie de Universitatea din Monmouth arată că, în situația ipotetică a unei colaborări între Trump și majoritatea celorlalți candidați republicani, aproape întotdeauna Trump va beneficia de susținerea majoritară.
|
||||
Trump se află la distanță de 13 puncte de Carly Fiorina, la 14 puncte de Marco Rubio, la 15 puncte de Walker, la 19 puncte de Jeb Bush și, în cele din urmă, la câte 33 de puncte față de Rand Paul, John Kasich și Chris Christie. Este aproape la egalitate cu Ted Cruz. Singurul candidat care îl învinge? Ben Carson l-ar învinge pe omul de afaceri cu 19 puncte într-o confruntare ipotetică de unu la unu. Majoritatea susținătorilor lui Donald Trump declară că s-au decis. Un nou sondaj realizat marți de CBS/NYT arată că peste jumătate dintre alegătorii care îl susțin pe Trump declară că nu își schimbă opțiunea de vot. Evident, se pot întâmpla multe în acest sens și nimeni nu poate spune că aceștia nu se vor răzgândi niciodată.
|
||||
46% afirmă că lasă portița deschisă posibilității de a-și schimba opțiunea. Cu toate acestea, cel mai important adversar al lui Trump este în prezent neurochirurgul Ben Carson, însă este de două ori mai probabil ca alegătorii care declară că s-au decis să voteze cu Trump. Șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. De când Donald Trump i-a numit pe imigranții din Mexic „violatori” în discursul de deschidere a campaniei sale, în urmă cu două luni, imigrarea a fost subiectul central în campania pentru 2016. Unii sunt îngrijorați că stilul bombastic al lui Trump va duce la o scindare între alegătorii hispanici importanți și Partidul Republican și va prejudicia eforturile de rebranding.
|
||||
Însă, conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că sunt de acord cu Trump în problema imigrării. Așa că, se pare că atâta timp cât problema imigrării rămâne în lumina reflectoarelor, la fel va rămâne și Doland Trump. Frustrarea față de autorități atinge noi culmi. Donald Trump și Ben Carson sunt acum susținuți de aproape jumătate dintre alegătorii republicani, în mare parte datorită statutului lor de outsideri. Conform sondajului realizat luni de ABC/Post, șase din 10 republicani afirmă că preferă un outsider politic în detrimentul cuiva cu experiență în guvernare. Oamenii sunt de asemenea supărați pe autoritățile de la Washington.
|
||||
Un sondaj derulat în urmă cu două săptămâni în Iowa de către Des Moines Register/Bloomberg arată că trei din patru republicani din Iowa sunt frustrați de prestația republicanilor din COngres, 54% declarându-se „nemulțumiți” iar 21% „nervoși la culme”. Jeremy Corbyn își face debutul la Prime Minister's Questions Încă de la alegerea sa, debutul domnului Corbyn la PMQs a fost îndelung așteptat Noul lider al Partidului Laburist, Jeremy Corbyn, își va face mai târziu debutul la Prime Minister's Questions, confruntându-se pentru prima dată cu David Cameron.
|
||||
Dl Corbyn va adresa primele dintre cele șase întrebări la care are dreptul la scurt timp după prânz; prestația sa va fi probabil analizată îndeaproape de mass-media și parlamentarii laburiști. În cadrul aparițiilor săptămânale, el a cerut „mai puțin teatru și mai multe fapte”. A declarat de asemenea că poate renunța la câteva participări și că le cedează colegilor săi. Confruntarea va fi primul test parlamentar al Dl Corbyn în poziție de lider, venind după ce a numit un „cabinet fantomă” și după discursul pe care l-a ținut marți la congresul anual TUC.
|
||||
Între timp, decizia liderului Partidului laburist de a păstra tăcerea la rostirea imnului național în cadrul unei slujbe ținute marți cu ocazia aniversării a 75 de ani de la Bătălia Angliei a atras critici din partea unor parlamentari conservatori și a ținut prima pagină a ziarelor. Decizia domnului Corbyn de a nu cânta imnul național a atras atenția Un purtător de cuvânt al Dl Corbyn a declarat că acesta „a păstrat tăcerea în mod respectuos” și a recunoscut „eroismul Forțelor aeriene britanice în Bătălia Angliei.”
|
||||
Însă un membru al cabinetului fantomă al Dl Corbyn, Owen Smith, a declarat pentru emisiunea Two's Newsnight transmisă de BBC că i-ar fi recomandat liderului laburist să cânte imnul național „indiferent” de credința sa că monarhia ar trebui abolită. În jur de doisprezece miniștri din cabinetul fantomă au refuzat să facă parte din echipa de frunte a Dl Corbyn, argumentând prin diferențe de opinie legate de economie, apărare și externe, în timp ce mai puțin de o șesime din partidul parlamentar l-a susținut ca lider. Corespondentul politic al BBC, Robin Brant, declară că diferențele de politică „se cumulează” în Partidul Laburist după numirea domnului Corbyn referitor la poziția sa față de Uniunea Europeană și limita de beneficii.
|
||||
Dl Corbyn a declarat la conferința TUC că Partidul Laburist va aduce modificări prin care se va elimina integral ideea limitării. Câteva ore mai târziu, Dl Smith, Ministrul Muncii și Pensiilor, a declarat că partidul „este foarte clar” în opoziția exclusivă față de planurile guvernului de a reduce nivelul „cap” de la 26.000 lire la 23.000 lire. Dl Corbyn va fi al cincilea lider laburist cu care se confruntă David Cameron la tribună în ultimul deceniu, de când a preluat conducerea Partidului Conservator. Liderul laburist, care a promis o abordare diferită a politicii, spune că are idei „din surse externe” pentru întrebări pe care să i le adreseze Domnului Cameron și că a primit peste 30.000 de sugestii.
|
||||
Parlamentarul Islington North a afirmat că PMQs implică un nivel de confruntare prea înalt și că se va abține de la replici și atacuri, angajându-se să se concentreze în schimb pe probleme serioase precum sărăcia, inegalitatea și provocările cu care se confruntă tinerii. Dl Corbyn a declarat că Angela Eagle, Ministrul de finanțe, îi va ține locul la PMQs atunci când el nu poate participa - de exemplu atunci când Dl Cameron se deplasează în străinătate. A exprimat de asemenea ideea că va permite altor colegi să ia cuvântul ocazional, spunând că l-a abordat pe Președintele Camerei Deputaților, John Bercow, pentru a discuta acest aspect.
|
||||
În 2005, când a preluat conducerea, Dl Cameron a declarat că dorește să renunțe la stilul politic „Punch and Judy” asociat adesea cu PMQs însă a recunoscut câțiva ani mai târziu că nu a reușit în demersul său. De la prima transmisie, în 1990, PMQs a fost considerată un barometru cheie al raționamentului unui lider, al modului în care acesta conduce Camera Deputaților și a poziției sale în rândul colegilor parlamentari, deși criticii afirmă a ca devenit o caricatură și că are nevoie de o reformare profundă. „Cadru în Joburg”: Tineri fără adăpost beneficiază de cursuri de fotografie Este dificil să fii un om fără adăpost în Johannesburg.
|
||||
Însă un grup de oameni care au trăit pe străzi în copilărie au găsit un mod de a învăța o meserie și de a-și câștiga traiul. „I was shot în Joburg” este un studio non-profit care îi învață pe tinerii fără adăpost să facă fotografii ale zonelor în care trăiesc și să câștige bani din asta. BBC News s-a întâlnit cu unul dintre primii absolvenți ai proiectului. Șeful JD Sports spune că salariile mai mari ar putea dăuna extinderii Președintele JD Sports, Peter Cowgill, declară că o creștere a salariului minim în Marea Britanie ar putea însemna „o putere de cumpărare mai mare în buzunarele potențialilor consumatori.” Este însă puțin probabil ca respectiva putere de cumpărare să depășească costurile mai mari pentru forța de muncă în cadrul firmei, afirmă el.
|
||||
Costurile ar putea avea impact asupra planurilor de extindere ale JD Sports, a adăugat el, ceea ce ar putea însemna mai puține locuri de muncă noi. Thanasi Kokkinakis susținut de președintele Tennis Australia, Steve Healy Thanasi Kokkinakis ar merita să fie lăudat și nu criticat pentru comportamentul său. Thanasi Kokkinakis a fost victimă colaterală în „furtuna” creată în jurul prietenului său, Nick Kyrgios, iar comportamentul său merită mai degrabă cuvinte de laudă și nu critică, în opinia președintelui Tennis Australia, Steve Healy.
|
||||
Binary file not shown.
@@ -1,11 +0,0 @@
|
||||
Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes
|
||||
Membership of Parliament: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Verification of credentials: see Minutes Documents received: see Minutes Written statements and oral questions (tabling): see Minutes Petitions: see Minutes Texts of agreements forwarded by the Council: see Minutes Action taken on Parliament's resolutions: see Minutes Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 7.45 p.m.)
|
||||
Election of Vice-Presidents of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 12.40 p.m. and resumed at 3.00 p.m.) Election of Quaestors of the European Parliament (deadline for submitting nominations): see Minutes (The sitting was suspended at 3.25 p.m. and resumed at 6.00 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 6.15 p.m.) Opening of the sitting (The sitting was opened at 9.35 a.m.) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes
|
||||
Membership of committees (deadline for tabling amendments): see Minutes (The sitting was suspended at 7 p.m. and resumed at 9 p.m.) Agenda for next sitting: see Minutes Closure of sitting (The sitting was suspended at 23.25 p.m.) Documents received: see Minutes Communication of Council common positions: see Minutes (The sitting was suspended at 11.35 a.m. and resumed for voting time at noon) Approval of Minutes of previous sitting: see Minutes Committee of Inquiry into the crisis of the Equitable Life Assurance Society (extension of mandate): see Minutes
|
||||
Announcement by the President: see Minutes 1. Membership of committees (vote) 2. Amendment of the ACP-EC Partnership Agreement (vote) 4. Certification of train drivers operating locomotives and trains on the railway system in the Community (vote) 6. Law applicable to non-contractual obligations ("ROME II") (vote) 8. Seventh and eighth annual reports on arms exports (vote) Corrections to votes and voting intentions: see Minutes Membership of committees and delegations: see Minutes Request for waiver of parliamentary immunity: see Minutes Decisions concerning certain documents: see Minutes
|
||||
Written statements for entry
|
||||
Written statements for entry in the register (Rule 116): see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes Adjournment of the session I declare the session of the European Parliament adjourned. (The sitting was closed at 1 p.m.) Approval of Minutes of previous sitting: see Minutes Membership of Parliament: see Minutes Request for the defence of parliamentary immunity: see Minutes Appointments to committees (proposal by the Conference of Presidents): see Minutes Documents received: see Minutes Texts of agreements forwarded by the Council: see Minutes
|
||||
Action taken on Parliament's resolutions: see Minutes Oral questions and written statements (tabling): see Minutes Written statements (Rule 116): see Minutes Agenda: see Minutes 1. Appointments to parliamentary committees (vote): see Minutes Voting time Agenda for next sitting: see Minutes Closure of sitting (The sitting was closed at 12 midnight) Opening of the sitting (The sitting was opened at 09.05) Documents received: see Minutes Approval of Minutes of previous sitting: see Minutes 1. Protection of passengers against displaced luggage (vote) 2.
|
||||
Approval of motor vehicles with regard to the forward field of vision of the driver (vote) 3. EC-Korea Agreement on scientific and technological cooperation (vote) 4. Mainstreaming sustainability in development cooperation policies (vote) 5. Draft Amending Budget No 1/2007 (vote) 7. EC-Gabon Fisheries Partnership (vote) 10. Limitation periods in cross-border disputes involving personal injuries and fatal accidents (vote) 12. Strategy for a strengthened partnership with the Pacific Islands (vote) 13. The European private company statute (vote) That concludes the vote.
|
||||
Corrections to votes and voting intentions: see Minutes Assignment conferred on a Member: see Minutes Membership of committees and delegations: see Minutes Decisions concerning certain documents: see Minutes Forwarding of texts adopted during the sitting: see Minutes Dates for next sittings: see Minutes
|
||||
Written statements for entry
|
||||
@@ -1,11 +0,0 @@
|
||||
Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal
|
||||
Componenţa Parlamentului: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Verificarea prerogativelor: a se vedea procesul-verbal Depunere de documente: a se vedea procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Petiţii: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal Cursul dat rezoluţiilor Parlamentului: a se vedea procesul-verbal Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Se levanta la sesión a las 19.45 horas)
|
||||
Alegerea vicepreşedinţilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 12.40 Uhr unterbrochen und um 15.00 Uhr wiederaufgenommen). Alegerea chestorilor Parlamentului European (termenul de depunere a candidaturilor): consultaţi procesul-verbal (Die Sitzung wird um 15.25 Uhr unterbrochen und um 18.00 Uhr wiederaufgenommen). Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 18.15 Uhr geschlossen.) Deschiderea şedinţei (Die Sitzung wird um 9.35 Uhr eröffnet.) Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal
|
||||
Componenţa comisiilor (termenul de depunere a amendamentelor): consultaţi procesul-verbal (La seduta, sospesa alle 19.00, è ripresa alle 21.00) Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (Die Sitzung wird um 23.25 Uhr geschlossen.) Depunerea documentelor: a se vedea procesul-verbal Comunicarea poziţiilor comune ale Parlamentului: a se vedea procesul-verbal (La séance, suspendue à 11h35 dans l'attente de l'Heure des votes, est reprise à midi) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Comisia de anchetă privind criza societăţii de asigurări "Equitable Life” (prelungirea mandatului): consultaţi procesul-verbal
|
||||
Comunicarea Preşedintelui: consultaţi procesul-verbal 1. Componenţa comisiilor (vot) 2. Modificarea Acordului de parteneriat ACP-CE ("Acordul de la Cotonou”) (vot) 4. Certificarea mecanicilor de locomotivă care conduc locomotive şi trenuri în sistemul feroviar comunitar (vot) 6. Legea aplicabilă obligaţiilor necontractuale ("Roma II”) (vot) 8. Al şaptelea şi al optulea raport anual privind exportul de armament (vot) Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Cerere de ridicare a imunităţii parlamentare: consultaţi procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal
|
||||
Declaraţii scrise înscrise
|
||||
Declaraţii scrise înscrise în registru (articolul 116 din Regulamentul de procedură): a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal Întreruperea sesiunii Dichiaro interrotta la sessione del Parlamento europeo. (La seduta è tolta alle 13.00) Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal Componenţa Parlamentului: a se vedea procesul-verbal Cerere de apărare a imunităţii parlamentare: consultaţi procesul-verbal Numiri în comisii (propunerea Conferinţei preşedinţilor): consultaţi procesul-verbal Depunerea documentelor: a se vedea procesul-verbal Transmiterea de către Consiliu a textelor acordurilor: a se vedea procesul-verbal
|
||||
Continuări ale rezoluţiilor Parlamentului: consultaţi procesul-verbal Declaraţii scrise şi întrebări orale (depunere): consultaţi procesul-verbal Declaraţii scrise (articolul 116 din Regulamentul de procedură) Ordinea de zi: a se vedea procesul-verbal 1. Numiri în comisiile parlamentare (vot): consultaţi procesul-verbal Timpul afectat votului Ordinea de zi a următoarei şedinţe: a se vedea procesul-verbal Ridicarea şedinţei (La seduta è tolta alle 24.00) Deschiderea şedinţei (The sitting was opened at 09.05) Depunerea documentelor: a se vedea procesul-verbal Aprobarea procesului-verbal al şedinţei precedente: a se vedea procesul-verbal 1. Protecţia pasagerilor împotriva deplasării bagajelor (vot) 2.
|
||||
Omologarea vehiculelor cu motor cu privire la câmpul de vizibilitate înainte al conducătorului auto (vot) 3. Acordul CE-Coreea de cooperare ştiinţifică şi tehnologică (vot) 4. Integrarea durabilităţii în politicile de cooperare pentru dezvoltare (vot) 5. Proiect de buget rectificativ nr.1/2007 (vot) 7. Acordul de parteneriat în domeniul pescuitului între Comunitatea Europeană şi Republica Gaboneză (vot) 10. Termenele de prescripţie aplicabile în cadrul litigiilor transfrontaliere cu privire la vătămările corporale şi accidentele mortale (vot) 12. Relaţiile UE cu insulele din Pacific: Strategie pentru un parteneriat consolidat (vot) 13. Statutul societăţii private europene (vot) Damit ist die Abstimmungsstunde beendet.
|
||||
Corectările voturilor şi intenţiile de vot: a se vedea procesul-verbal Misiune încredinţată unui deputat: consultaţi procesul-verbal Componenţa comisiilor şi a delegaţiilor: a se vedea procesul-verbal Decizii privind anumite documente: a se vedea procesul-verbal Transmiterea textelor adoptate în cursul prezentei şedinţe: a se vedea procesul-verbal Calendarul următoarelor şedinţe: a se vedea procesul-verbal
|
||||
Declaraţii scrise înscrise
|
||||
Binary file not shown.
@@ -1,16 +0,0 @@
|
||||
Brazil's Former Presidential Chief-of-Staff to Stand Trial A federal judge on Tuesday accepted the charges filed against Brazil's former presidential chief of staff for his alleged involvement in a massive corruption scheme at state-owned oil company Petrobras. The federal prosecutor's office said Jose Dirceu will face trial on the corruption, racketeering and money laundering charges filed earlier this month. Fourteen other people will also be tried, including Joao Vaccari Neto, the former treasurer of Brazil's governing Workers' Party and Renato de Souza Duque, Petrobras' former head of corporate services.
|
||||
Dirceu is the most senior member of the ruling Workers' Party to be taken into custody in connection with the scheme. Dirceu served as former President Luiz Inacio Lula da Silva's chief of staff between 2003 and 2005. He was arrested early August in his home, where he already was under house arrest serving an 11-year sentence for his involvement in a cash-for-votes scheme in Congress more than 10 years ago. Prosecutors have said that Dirceu masterminded the kickback scheme at Petrobras, accepted bribes while in office and continued to receive payments from contractors after he was jailed in late 2013 for the vote-buying scandal.
|
||||
According to prosecutors, the scheme at Petrobras involved roughly $2 billion in bribes and other illegal funds. Some of that money was allegedly funneled back to campaign coffers of the ruling party and its allies. It also allegedly included the payment of bribes to Petrobras executives in return for inflated contracts. 'Miraculous' recovery for Peshawar massacre schoolboy A teenager paralysed after being shot four times in Pakistan's deadliest terror attack has made a "miraculous" recovery following treatment in the UK. Muhammad Ibrahim Khan, 13, had been told by doctors in Pakistan that he would never walk again.
|
||||
At least 140 people, mostly children, were killed when gunmen stormed Peshawar's Army Public School last December. Muhammad, who arrived in London last month for surgery, is being discharged from hospital later. Exactly nine months ago, on an ordinary Tuesday morning, Muhammad sat in his first aid class listening to his teachers intently. At the same time seven gunmen disguised in security uniforms were entering the Army Public School. They were strapped with explosives and had one simple mission in mind: Kill every man, woman and child they came across. "I can't forget what happened that day," Muhammad says with a severe stare.
|
||||
We were sitting in the auditorium, we were asking questions... and then we heard heavy gunfire outside. The terrorists moved inside and they started killing - our teacher was burned alive. Muhammad described pulling four other pupils out of the auditorium as the carnage unfolded. He said he then heard his friend, Hamza calling to him. He said, 'oh brother save me'. I held his hand. That's when I was shot in the back, and he was shot in the head. Most of the people killed in the attack were pupils Hamza died in Muhammad's arms. Muhammad recalled blacking out after that, and the next thing he knew he was in a hospital bed, paralysed from the waist down.
|
||||
Doctors in Peshawar in northern Pakistan, and then Rawalpindi, close to the capital, told his family there was no treatment, and he would never walk again. "Seeing him I felt like my soul had left my body," says Muhammad's father, Sher Khan Those nine months were the hardest in my life. But Mr Khan and his wife, Sherbano, refused to believe that their cricket-mad son would never be able to use his legs again. They campaigned, and appealed for help on Pakistani TV, gaining the support of high profile people such as cricketer turned politician Imran Khan.
|
||||
Finally, they were able to raise the funds to bring Muhammad to the UK and provide him with treatment at London's private Harley Street Clinic. Consultant neurosurgeon Irfan Malik described Muhammad as "terrified" when he first arrived at the hospital. "He'd spent the last [few] months lying on a bed, unable to move side to side," says Mr Malik. He was weak, he had a pressure sore on his back. He wasn't in great shape. A vertebra at the base of Muhammad's spine was destroyed Muhammad was shot in his shoulder, his hip, and his back during the attack, damaging his lower spine - leading to paralysis.
|
||||
But during six hours of surgery, Mr Malik and his team were able to reattach nerve endings and reconstruct the damaged part of the spine. Even Mr Malik was surprised at what happened next. Exactly one week after the surgery Muhammad stood up and started taking steps and walking. We were not expecting to get that sort of excellent result. That was miraculous," he says. Less than two weeks after his operation, Muhammad is ready to leave hospital and start the long road to recovery. Muhammad has defied the odds and started to walk again He says he wants to build his strength and continue his education in the UK. But he says he is determined to return to Pakistan, join the army and help fight terrorism.
|
||||
"I feel like I have a second chance at life," he says as he shows off pictures he's drawn of guns scribbled out next to school books and pens Muhammad grows physically stronger every day but the psychological trauma he continues to endure is unimaginable. "My anger is not diminishing" he says. In my school little kids were killed. What was their crime? His mother, wiping a tear from her eye, caressed his head and said: "I can see my son walking again." He'll be able to get on with his normal life. 'Super Voice' 4G service from Three offers better signal Three is making use of a lower frequency 4G spectrum that can travel more widely
|
||||
Mobile phone provider Three has launched a UK service it says will improve reception inside buildings and in rural black spots. Its 4G Super Voice enables customers to make calls and send texts using a lower frequency spectrum. Other networks are looking into introducing the technology, known as Voice Over Long-Term Evolution (VoLTE). It currently works on only the Samsung Galaxy S5, but recent iPhone handsets will be added in the coming months. Three said up to 5.5 million customers would have access to the service by 2017.
|
||||
Chief technology officer Bryn Jones said: "By the end of the year, one million of our customers will have access to better indoor coverage and be able to use their phones in more places than ever before." Stars prepare for panto season Pantomime season is big business for theatres up and down the UK, with many getting ready for this year's season now. Some of the biggest names in showbusiness now take part in the yuletide theatre. Matthew Kelly and Hayley Mills will be appearing in Cinderella - one as an ugly sister, the other as fairy godmother. They reveal their panto secrets to BBC Breakfast. Steven Wilson: 'If I don't do anything, I feel this creeping guilt'
|
||||
Steven Wilson was recently the big winner at the Progressive Music Awards Steven Wilson is often dubbed the hardest working musician in the world of progressive rock. The multi-talented musician won three prizes at this month's Progressive Music Awards in London, including album of the year for Hand. The Guardian's five-star review called it "a smart, soulful and immersive work of art." Since the 1980s, Wilson has been the driving force in a number of musical projects, the best known of which is the rock band Porcupine Tree. Now, ahead of two sell-out shows at the Royal Albert Hall, Wilson is releasing a vinyl-only double LP, Transience, to showcase the "more accessible" side of his solo output.
|
||||
He tells the BBC about his love of vinyl, his busy schedule and explains how comic actor Matt Berry came to be his support act. What does vinyl mean to you? I grew up at the very tail end of the vinyl era, and at the time, I remember, we couldn't wait for CD to come along because vinyl was so frustrating. You would buy the record, take it home, and it would have a scratch, and you would have to take it back again. I love CDs, and for some kinds of music - classical for example - it is better than vinyl. But the problem with the CD and digital downloads is that there's nothing you can really cherish or treasure. Owning vinyl is like having a beautiful painting hanging in your living room.
|
||||
It's something you can hold, pore over the lyrics and immerse yourself in the art work. I thought it was just a nostalgic thing, but it can't be if kids too young to remember vinyl are enjoying that kind of experience. Do you have a piece of vinyl that you treasure? The truth is I got rid of 100% of my vinyl in the 90s. All the vinyl I have is re-bought. I started off from the perspective that I wanted to recreate the collection I had when I was 15, but it's gone beyond that. The first record which I persuaded my parents to buy for me was Electric Light Orchestra's Out of the Blue.
|
||||
If I still had my original copy, it would have sentimental value, but, alas, it's in a charity shop somewhere. Steven Wilson hopes the album will be a doorway for potential new fans Why release your new compilation Transience on vinyl? It was originally conceived as an idea for Record Store Day, but we missed the boat on that. My record company had suggested I put together some of my shorter, more accessible songs. I got a bit obsessed by the idea to make something like "an introduction to Steven Wilson," and I was committed to it being a vinyl-only release. Anyone who buys the vinyl does also get a high-resolution download.
|
||||
Do you have a concern that the album won't show your work in a true light?
|
||||
@@ -1,16 +0,0 @@
|
||||
Fostul șef al cabinetului prezidențial brazilian este adus în fața instanței Marți, un judecător federal a acceptat acuzațiile aduse împotriva fostului șef al cabinetului prezidențial brazilian pentru presupusa implicare a acestuia într-o schemă masivă de corupție privind compania petrolieră de stat Petrobras. Biroul procurorului federal a declarat că Jose Dirceu va fi trimis în judecată pentru acuzațiile de corupție, înșelătorie și spălare de bani aduse în această lună. Alte paisprezece persoane vor fi judecate, printre acestea numărându-se Joao Vaccari Neto, fostul trezorier al Partidului Muncitorilor, aflat la putere în Brazilia, și Renato de Souza Duque, fostul președinte al serviciilor pentru întreprinderi ale Petrobras.
|
||||
Dirceu este cel mai vechi membru al Partidului Muncitorilor aflat la guvernare luat în custodie pentru legăturile cu această schemă. Dirceu a servit ca șef de cabinet al fostului președinte Luiz Inacio Lula da Silva între 2003 și 2005. A fost arestat la începutul lui august de acasă, unde deja se afla sub arest la domiciliu, cu o pedeapsă de 11 ani pentru implicarea într-o schemă de cumpărare a voturilor în Congres cu peste 10 ani în urmă. Procurorii au declarat că Dirceu a dezvoltat schema de luare de mită de la Petrobras, a acceptat mită în timp ce se afla în funcție și a continuat să primească plăți de la antreprenori după ce a fost închis la sfârșitul lui 2013 pentru scandalul voturilor cumpărate.
|
||||
Conform procurorilor, schema de la Petrobras a implicat aproximativ 2 miliarde de dolari sub formă de mită și alte fonduri ilegale. O parte din acei bani s-ar fi întors în fondul de campanie al partidului aflat la guvernare și al aliaților acestora. De asemenea, ar fi inclus mită către directorii Petrobras în schimbul unor contracte umflate. Recuperarea „miraculoasă” a unui elev supraviețuitor al masacrului de la Peshawar Un adolescent paralizat după ce fusese împușcat de patru ori în cel mai cumplit atac terorist din Pakistan a reușit o recuperare „miraculoasă” după ce a urmat un tratament în Regatul Unit. Lui Mohamed Ibrahim Khan, în vârstă de 13 ani, doctorii din Pakistan îi spuseseră că nu va mai putea să meargă niciodată.
|
||||
Cel puțin 140 de persoane, majoritatea copii, au fost ucise când bărbați înarmați au atacat școala publică a armatei din Peshawar în luna decembrie a anului trecut. Mohamed, care a sosit la Londra luna trecută pentru operație, va fi externat mai târziu din spital. Exact cu nouă luni în urmă, într-o dimineață obișnuită de marți, Mohamed stătea la ora de primul ajutor și își asculta atent profesorii. Chiar atunci, șapte bărbați înarmați deghizați în uniformele agenților de pază intrau în școala publică a armatei. Purtau centuri cu explozivi și aveau de îndeplinit o misiune simplă: să îi ucidă pe toți bărbații, femeile și copiii care le ieșeau în cale. „Nu pot uita ce s-a întâmplat în acea zi”, spune Mohamed cu o privire aspră.
|
||||
Stăteam în amfiteatru, puneam întrebări... apoi am auzit focuri de armă afară. Teroriștii au intrat înăuntru și au început să ucidă. Profesorul nostru a fost ars de viu. Mohamed descrie cum a scos patru elevi din amfiteatru în timp ce se desfășura carnagiul. Apoi spune că și-a auzit prietenul, pe Hamza, strigându-l. Spunea „oh, frate, salvează-mă”. L-am ținut de mână. Atunci eu am fost împușcat în spate, iar el în cap. Cei mai mulți dintre cei uciși în atac erau elevi Hamza a murit în brațele lui Mohamed. Mohamed își amintește că imediat după asta a leșinat și că următorul lucru pe care l-a știut a fost că se afla pe un pat de spital, paralizat de la brâu în jos.
|
||||
Doctorii din Peshawar din nordul Pakistanului, apoi cei din Rawalpindi, aproape de capitală, i-au spus familiei sale că nu exista tratament și că nu va mai putea merge niciodată. „Când l-am văzut, am simțit cum îmi iese sufletul”, spune Sher Khan, tatăl lui Mohamed. Acele nouă luni au fost cele mai grele din viața mea. Însă Khan și soția lui, Sherbano, au refuzat să creadă că fiul lor atât de pasionat de crichet nu-și va mai putea folosi vreodată picioarele. Au făcut o campanie și au cerut ajutor de la televiziunea pakistaneză, atrăgând sprijinul unor oameni faimoși precum Imran Khan, jucător de crichet devenit politician.
|
||||
Într-un final, au reușit să strângă fonduri pentru a-l duce pe Mohamed în Regatul Unit și a-i oferi tratament la clinica privată Harley Street din Londra. Neurochirurgul consultant Irfan Malik l-a descris pe Mohamed drept „înspăimântat” când acesta a ajuns la spital. „Își petrecuse ultimele [câteva] luni zăcând în pat, fără să se poată mișca de pe o parte pe alta, spune Malik. Era slăbit, se pusese multă presiune pe spatele lui. Nu era într-o formă prea bună. O vertebră de la baza coloanei vertebrale a lui Mohamed fusese distrusă Mohamed fusese împușcat în umăr, în șold și în spate în timpul atacului, iar coloana vertebrală inferioară îi fusese distrusă, ducând la paralizie.
|
||||
Însă, în timpul unei operații care a durat șase ore, Malik și echipa lui au reușit să lege din nou terminațiile nervoase și să reconstruiască partea distrusă a coloanei. Chiar și Malik a fost surprins de ceea ce s-a întâmplat în continuare. Exact la o săptămână după operație, Mohamed s-a ridicat și a început să facă pași și să meargă. Nu ne așteptam la un rezultat atât de bun. A fost un miracol”, spune acesta. În mai puțin de două săptămâni de la operație, Mohamed este gata să părăsească spitalul și să înceapă procesul lung de recuperare. Mohamed a sfidat soarta și a început să meargă din nou Vrea să devină puternic și să își continue studiile în Regatul Unit. Însă este hotărât să revină în Pakistan, să se înroleze în armată și să lupte împotriva terorismului.
|
||||
„Simt că am încă o șansă la viață” spune el, arătând imaginile cu arme desenate de el lângă manuale școlare și stilouri Fizic, Mohamed devine tot mai puternic în fiecare zi, însă trauma psihologică prin care trece și acum este de neimaginat. „Furia mea nu a scăzut”, mărturisește el. În școala mea au fost uciși copii mici. Ce crimă au comis ei? Mama lui își șterge o lacrimă, îl mângâie pe creștet și spune: „Îmi văd fiul mergând din nou”. Va putea să-și continue firesc viața. Serviciul 4G „Super Voice” de la Three oferă semnal mai bun Three folosește un spectru 4G cu o frecvență mai joasă, care poate acoperi o zonă mai extinsă
|
||||
Furnizorul de telefonie mobilă Three a lansat în Regatul Unit un serviciu despre care spune că va îmbunătăți recepția în interiorul clădirilor și în zonele rurale fără semnal. Serviciul 4G Super Voice le permite clienților să efectueze apeluri și să trimită mesaje text folosind un spectru cu o frecvență mai joasă. Și alte rețele intenționează să introducă aceeași tehnologie, cunoscută ca „Voice Over Long-Term Evolution (VoLTE)”. Aceasta funcționează momentan doar cu Samsung Galaxy S5, însă telefoanele iPhone recente vor beneficia de ea în lunile următoare. Three menționează că până la 5,5 milioane de clienți vor avea acces la serviciu până în 2017.
|
||||
Responsabilul șef pentru tehnologie, Bryn Jones a declarat: „Până la sfârșitul anului, un milion dintre clienții noștri vor avea acces la o acoperire mai bună în interior și își vor putea folosi telefoanele în mai multe locuri ca până acum”. Vedetele se pregătesc pentru stagiunea de pantomimă Stagiunea de pantomimă este foarte importantă pentru teatrele din tot Regatul Unit, multe dintre ele pregătindu-se acum pentru stagiunea din acest an. Acum, la teatrul de Crăciun participă unele dintre numele cele mai mari din showbusiness. Matthew Kelly și Hayley Mills vor apărea în Cenușăreasa - primul în rolul uneia dintre surorile rele, iar a doua în rolul zânei. Aceștia dezvăluie secretele pantomimei lor la BBC Breakfast. Steven Wilson: „Dacă nu fac nimic, mă simt vinovat”
|
||||
Steven Wilson a fost desemnat recent drept marele câștigător al Progressive Music Awards Steven Wilson a fost numit de multe ori drept cel mai muncitor muzician din lumea rockului progresiv. Talentatul muzician a câștigat trei premii la Progressive Music Awards, care a avut loc luna aceasta la Londra, printre care și premiul pentru cel mai bun album al anului pentru Hand. În recenzia sa de cinci stele, The Guardian a numit albumul „o operă de artă inteligentă, expresivă și captivantă”. Încă din anii 1980, Wilson este motorul mai multor proiecte muzicale, cel mai cunoscut dintre acestea fiind trupa de rock Porcupine Tree. Acum, înainte de două spectacole cu casa închisă la Royal Albert Hall, Wilson lansează un dublu LP doar în format vinil, Transience, pentru a arăta latura „mai accesibilă” a activității sale solo.
|
||||
A povestit pentru BBC despre dragostea lui pentru viniluri și despre programul său încărcat și a explicat cum a ajuns actorul de comedie Matt Berry să îi deschidă spectacolele. Ce înseamnă vinil pentru tine? Am crescut chiar în perioada de sfârșit a erei vinilurilor și îmi amintesc că atunci abia așteptam apariția CD-ului, căci vinilul era atât de enervant. Cumpărai un disc, mergeai cu el acasă, avea o zgârietură și trebuia să îl aduci înapoi. Iubesc CD-urile, iar pentru anumite tipuri de muzică, de exemplu cea clasică, sunt mai bune decât vinilurile. Însă problema cu CD-urile și cu descărcările digitale este aceea că nu mai există nimic pe care să îl prețuiești cu adevărat. Să ai un vinil e ca și cum ai avea un tablou frumos agățat în sufragerie.
|
||||
E ceva ce poți ține în mână, în timp ce te lași absorbit de versuri și copleșit de actul artistic. Am crezut că e doar o chestie nostalgică, însă nu are cum să fie așa dacă unor puști prea tineri să-și amintească de viniluri le place acest gen de experiență. Ai vreun vinil la care ții în mod special? Recunosc că am scăpat de toate vinilurile în anii '90. Toate vinilurile pe care le am sunt cumpărate din nou. Am pornit de la ideea de a reface colecția pe care o aveam la 15 ani, însă am trecut de limita aceea. Primul disc pe care mi-am convins părinții să mi-l cumpere a fost Out of the Blue de la Electric Light Orchestra.
|
||||
Dacă aș mai fi avut încă exemplarul inițial, acesta ar fi avut valoare sentimentală, însă, din păcate, se află pe undeva printr-un magazin de caritate. Steven Wilson speră că albumul va fi o poartă către posibili fani noi De ce ți-ai lansat noua compilație Transience pe vinil? Aceasta a fost concepută inițial ca idee pentru Ziua magazinelor de discuri, însă am ratat ocazia. Casa mea de discuri sugerase să adun câteva dintre melodiile mele mai scurte și mai accesibile. Am ajuns să fiu ușor obsedat de ideea de a face ceva gen „introducere în muzica lui Steven Wilson” și am ținut neapărat ca proiectul să fie lansat doar pe vinil. Cine cumpără vinilul primește, de asemenea, și o variantă descărcată la rezoluție înaltă.
|
||||
Ești îngrijorat că albumul nu va arăta muzica ta în adevărata ei lumină?
|
||||
@@ -1,38 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export WANDB_PROJECT=distil-marian
export BS=64
export GAS=1
export m=sshleifer/student_marian_en_ro_6_3
export MAX_LEN=128
python finetune_trainer.py \
    --tokenizer_name $m --model_name_or_path $m \
    --data_dir $ENRO_DIR \
    --output_dir marian_en_ro_6_3 \
    --learning_rate=3e-4 \
    --warmup_steps 500 --sortish_sampler \
    --fp16 \
    --gradient_accumulation_steps=$GAS \
    --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \
    --freeze_encoder --freeze_embeds \
    --num_train_epochs=6 \
    --save_steps 3000 --eval_steps 3000 \
    --max_source_length $MAX_LEN --max_target_length $MAX_LEN \
    --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
    --do_train --do_eval --do_predict \
    --eval_strategy steps \
    --predict_with_generate --logging_first_step \
    --task translation --label_smoothing_factor 0.1 \
    "$@"
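Note: the trailing "$@" in the deleted script above forwards any extra command-line arguments straight to finetune_trainer.py, so run-specific Trainer flags can be appended at call time. A minimal invocation sketch, assuming the file was saved as train_distil_marian_enro.sh (the filename is not shown in this diff) and that ENRO_DIR and MAX_TGT_LEN are exported beforehand, since the script references both but only sets MAX_LEN itself:

    # hypothetical paths and filename, for illustration only
    export ENRO_DIR=/path/to/wmt_en_ro   # directory with train/val/test .source and .target files
    export MAX_TGT_LEN=128               # referenced by the script but not defined in it
    bash train_distil_marian_enro.sh --overwrite_output_dir --logging_steps 100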
@@ -1,37 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export WANDB_PROJECT=distil-marian
export BS=64
export m=sshleifer/student_marian_en_ro_6_3
export MAX_LEN=128

python xla_spawn.py finetune_trainer.py \
    --tokenizer_name $m --model_name_or_path $m \
    --data_dir $ENRO_DIR \
    --output_dir marian_en_ro_6_3 \
    --learning_rate=3e-4 \
    --warmup_steps 500 \
    --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \
    --freeze_encoder --freeze_embeds \
    --num_train_epochs=6 \
    --save_steps 500 --eval_steps 500 \
    --logging_first_step --logging_steps 200 \
    --max_source_length $MAX_LEN --max_target_length $MAX_LEN \
    --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
    --do_train --do_eval \
    --eval_strategy steps \
    --prediction_loss_only \
    --task translation --label_smoothing_factor 0.1 \
    "$@"
@@ -1,39 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

export WANDB_PROJECT=distilbart-trainer
export BS=32
export m=sshleifer/student_cnn_12_6
export tok=facebook/bart-large
export MAX_TGT_LEN=142

python finetune_trainer.py \
    --model_name_or_path $m --tokenizer_name $tok \
    --data_dir cnn_dm \
    --output_dir distilbart-cnn-12-6 \
    --learning_rate=3e-5 \
    --warmup_steps 500 --sortish_sampler \
    --fp16 \
    --n_val 500 \
    --gradient_accumulation_steps=1 \
    --per_device_train_batch_size=$BS --per_device_eval_batch_size=$BS \
    --freeze_encoder --freeze_embeds \
    --num_train_epochs=2 \
    --save_steps 3000 --eval_steps 3000 \
    --logging_first_step \
    --max_target_length 56 --val_max_target_length $MAX_TGT_LEN --test_max_target_length $MAX_TGT_LEN \
    --do_train --do_eval --do_predict \
    --eval_strategy steps \
    --predict_with_generate --sortish_sampler \
    "$@"
@@ -1,35 +0,0 @@
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

python finetune_trainer.py \
    --model_name_or_path=facebook/mbart-large-cc25 \
    --data_dir $ENRO_DIR \
    --output_dir mbart_cc25_enro \
    --learning_rate=3e-5 \
    --warmup_steps 500 \
    --fp16 \
    --label_smoothing 0.1 \
    --adam_eps 1e-06 \
    --src_lang en_XX --tgt_lang ro_RO \
    --freeze_embeds \
    --per_device_train_batch_size=4 --per_device_eval_batch_size=4 \
    --max_source_length 128 --max_target_length 128 --val_max_target_length 128 --test_max_target_length 128 \
    --sortish_sampler \
    --num_train_epochs 6 \
    --save_steps 25000 --eval_steps 25000 --logging_steps 1000 \
    --do_train --do_eval --do_predict \
    --eval_strategy steps \
    --predict_with_generate --logging_first_step \
    --task translation \
    "$@"
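For the mBART variant just above, the same "$@" pattern applies; because argparse keeps the last value it sees for a repeated option, flags appended at call time override the defaults hard-coded in the script. A sketch under the assumption that the file is named train_mbart_cc25_enro.sh (the name is not visible in this diff) and that the WMT en-ro data lives in ENRO_DIR:

    # illustrative values only
    export ENRO_DIR=/path/to/wmt_en_ro
    bash train_mbart_cc25_enro.sh --per_device_train_batch_size=8 --gradient_accumulation_steps 4 --overwrite_output_dir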
@ -1,640 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import itertools
|
||||
import json
|
||||
import linecache
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
import socket
|
||||
from collections.abc import Iterable
|
||||
from logging import getLogger
|
||||
from pathlib import Path
|
||||
from typing import Callable, Union
|
||||
|
||||
import git
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from rouge_score import rouge_scorer, scoring
|
||||
from sacrebleu import corpus_bleu
|
||||
from sentence_splitter import add_newline_to_end_of_each_sentence
|
||||
from torch import nn
|
||||
from torch.utils.data import Dataset, Sampler
|
||||
|
||||
from transformers import BartTokenizer, EvalPrediction, PreTrainedTokenizer, T5Tokenizer
|
||||
from transformers.models.bart.modeling_bart import shift_tokens_right
|
||||
from transformers.utils import cached_property
|
||||
|
||||
|
||||
try:
|
||||
from fairseq.data.data_utils import batch_by_size
|
||||
|
||||
FAIRSEQ_AVAILABLE = True
|
||||
except (ImportError, ModuleNotFoundError):
|
||||
FAIRSEQ_AVAILABLE = False
|
||||
|
||||
|
||||
def label_smoothed_nll_loss(lprobs, target, epsilon, ignore_index=-100):
|
||||
"""From fairseq"""
|
||||
if target.dim() == lprobs.dim() - 1:
|
||||
target = target.unsqueeze(-1)
|
||||
nll_loss = -lprobs.gather(dim=-1, index=target)
|
||||
smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
|
||||
if ignore_index is not None:
|
||||
pad_mask = target.eq(ignore_index)
|
||||
nll_loss.masked_fill_(pad_mask, 0.0)
|
||||
smooth_loss.masked_fill_(pad_mask, 0.0)
|
||||
else:
|
||||
nll_loss = nll_loss.squeeze(-1)
|
||||
smooth_loss = smooth_loss.squeeze(-1)
|
||||
|
||||
nll_loss = nll_loss.sum() # mean()? Scared to break other math.
|
||||
smooth_loss = smooth_loss.sum()
|
||||
eps_i = epsilon / lprobs.size(-1)
|
||||
loss = (1.0 - epsilon) * nll_loss + eps_i * smooth_loss
|
||||
return loss, nll_loss
|
||||
|
||||
|
||||
def lmap(f: Callable, x: Iterable) -> list:
|
||||
"""list(map(f, x))"""
|
||||
return list(map(f, x))
|
||||
|
||||
|
||||
def calculate_bleu(output_lns, refs_lns, **kwargs) -> dict:
|
||||
"""Uses sacrebleu's corpus_bleu implementation."""
|
||||
return {"bleu": round(corpus_bleu(output_lns, [refs_lns], **kwargs).score, 4)}
|
||||
|
||||
|
||||
def build_compute_metrics_fn(task_name: str, tokenizer: PreTrainedTokenizer) -> Callable[[EvalPrediction], dict]:
|
||||
def non_pad_len(tokens: np.ndarray) -> int:
|
||||
return np.count_nonzero(tokens != tokenizer.pad_token_id)
|
||||
|
||||
def decode_pred(pred: EvalPrediction) -> tuple[list[str], list[str]]:
|
||||
pred_ids = pred.predictions
|
||||
label_ids = pred.label_ids
|
||||
pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
|
||||
label_ids[label_ids == -100] = tokenizer.pad_token_id
|
||||
label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)
|
||||
pred_str = lmap(str.strip, pred_str)
|
||||
label_str = lmap(str.strip, label_str)
|
||||
return pred_str, label_str
|
||||
|
||||
def summarization_metrics(pred: EvalPrediction) -> dict:
|
||||
pred_str, label_str = decode_pred(pred)
|
||||
rouge: dict = calculate_rouge(pred_str, label_str)
|
||||
summ_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
|
||||
rouge.update({"gen_len": summ_len})
|
||||
return rouge
|
||||
|
||||
def translation_metrics(pred: EvalPrediction) -> dict:
|
||||
pred_str, label_str = decode_pred(pred)
|
||||
bleu: dict = calculate_bleu(pred_str, label_str)
|
||||
gen_len = np.round(np.mean(lmap(non_pad_len, pred.predictions)), 1)
|
||||
bleu.update({"gen_len": gen_len})
|
||||
return bleu
|
||||
|
||||
compute_metrics_fn = summarization_metrics if "summarization" in task_name else translation_metrics
|
||||
return compute_metrics_fn
|
||||
|
||||
|
||||
def trim_batch(
|
||||
input_ids,
|
||||
pad_token_id,
|
||||
attention_mask=None,
|
||||
):
|
||||
"""Remove columns that are populated exclusively by pad_token_id"""
|
||||
keep_column_mask = input_ids.ne(pad_token_id).any(dim=0)
|
||||
if attention_mask is None:
|
||||
return input_ids[:, keep_column_mask]
|
||||
else:
|
||||
return (input_ids[:, keep_column_mask], attention_mask[:, keep_column_mask])
|
||||
|
||||
|
||||
class AbstractSeq2SeqDataset(Dataset):
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer,
|
||||
data_dir,
|
||||
max_source_length,
|
||||
max_target_length,
|
||||
type_path="train",
|
||||
n_obs=None,
|
||||
prefix="",
|
||||
**dataset_kwargs,
|
||||
):
|
||||
super().__init__()
|
||||
self.src_file = Path(data_dir).joinpath(type_path + ".source")
|
||||
self.tgt_file = Path(data_dir).joinpath(type_path + ".target")
|
||||
self.len_file = Path(data_dir).joinpath(type_path + ".len")
|
||||
if os.path.exists(self.len_file):
|
||||
self.src_lens = pickle_load(self.len_file)
|
||||
self.used_char_len = False
|
||||
else:
|
||||
self.src_lens = self.get_char_lens(self.src_file)
|
||||
self.used_char_len = True
|
||||
self.max_source_length = max_source_length
|
||||
self.max_target_length = max_target_length
|
||||
assert min(self.src_lens) > 0, f"found empty line in {self.src_file}"
|
||||
self.tokenizer = tokenizer
|
||||
self.prefix = prefix if prefix is not None else ""
|
||||
|
||||
if n_obs is not None:
|
||||
self.src_lens = self.src_lens[:n_obs]
|
||||
self.pad_token_id = self.tokenizer.pad_token_id
|
||||
self.dataset_kwargs = dataset_kwargs
|
||||
dataset_kwargs.update({"add_prefix_space": True} if isinstance(self.tokenizer, BartTokenizer) else {})
|
||||
|
||||
def __len__(self):
|
||||
return len(self.src_lens)
|
||||
|
||||
@staticmethod
|
||||
def get_char_lens(data_file):
|
||||
return [len(x) for x in Path(data_file).open()]
|
||||
|
||||
@cached_property
|
||||
def tgt_lens(self):
|
||||
"""Length in characters of target documents"""
|
||||
return self.get_char_lens(self.tgt_file)
|
||||
|
||||
def make_sortish_sampler(self, batch_size, distributed=False, shuffle=True, **kwargs):
|
||||
if distributed:
|
||||
return DistributedSortishSampler(self, batch_size, shuffle=shuffle, **kwargs)
|
||||
else:
|
||||
return SortishSampler(self.src_lens, batch_size, shuffle=shuffle)
|
||||
|
||||
def make_dynamic_sampler(self, max_tokens_per_batch=1024, **kwargs):
|
||||
assert FAIRSEQ_AVAILABLE, "Dynamic batch size requires `pip install fairseq`"
|
||||
assert not self.used_char_len, "You must call python make_len_file.py before calling make_dynamic_sampler"
|
||||
sorted_indices = list(self.make_sortish_sampler(1024, shuffle=False))
|
||||
|
||||
def num_tokens_in_example(i):
|
||||
return min(self.src_lens[i], self.max_target_length)
|
||||
|
||||
# call fairseq cython function
|
||||
batch_sampler: list[list[int]] = batch_by_size(
|
||||
sorted_indices,
|
||||
num_tokens_fn=num_tokens_in_example,
|
||||
max_tokens=max_tokens_per_batch,
|
||||
required_batch_size_multiple=64,
|
||||
)
|
||||
shuffled_batches = [batch_sampler[i] for i in np.random.permutation(range(len(batch_sampler)))]
|
||||
# move the largest batch to the front to OOM quickly (uses an approximation for padding)
|
||||
approximate_toks_per_batch = [max(self.src_lens[i] for i in batch) * len(batch) for batch in shuffled_batches]
|
||||
largest_batch_idx = np.argmax(approximate_toks_per_batch)
|
||||
shuffled_batches[0], shuffled_batches[largest_batch_idx] = (
|
||||
shuffled_batches[largest_batch_idx],
|
||||
shuffled_batches[0],
|
||||
)
|
||||
return shuffled_batches
|
||||
|
||||
def __getitem__(self, item):
|
||||
raise NotImplementedError("You must implement this")
|
||||
|
||||
def collate_fn(self, batch):
|
||||
raise NotImplementedError("You must implement this")
|
||||
|
||||
|
||||
class LegacySeq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
def __getitem__(self, index) -> dict[str, torch.Tensor]:
|
||||
"""Call tokenizer on src and tgt_lines"""
|
||||
index = index + 1 # linecache starts at 1
|
||||
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
|
||||
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
|
||||
assert source_line, f"empty source line for index {index}"
|
||||
assert tgt_line, f"empty tgt line for index {index}"
|
||||
source_inputs = self.encode_line(self.tokenizer, source_line, self.max_source_length)
|
||||
target_inputs = self.encode_line(self.tokenizer, tgt_line, self.max_target_length)
|
||||
|
||||
source_ids = source_inputs["input_ids"].squeeze()
|
||||
target_ids = target_inputs["input_ids"].squeeze()
|
||||
src_mask = source_inputs["attention_mask"].squeeze()
|
||||
return {
|
||||
"input_ids": source_ids,
|
||||
"attention_mask": src_mask,
|
||||
"labels": target_ids,
|
||||
}
|
||||
|
||||
def encode_line(self, tokenizer, line, max_length, pad_to_max_length=True, return_tensors="pt"):
|
||||
"""Only used by LegacyDataset"""
|
||||
return tokenizer(
|
||||
[line],
|
||||
max_length=max_length,
|
||||
padding="max_length" if pad_to_max_length else None,
|
||||
truncation=True,
|
||||
return_tensors=return_tensors,
|
||||
**self.dataset_kwargs,
|
||||
)
|
||||
|
||||
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
|
||||
input_ids = torch.stack([x["input_ids"] for x in batch])
|
||||
masks = torch.stack([x["attention_mask"] for x in batch])
|
||||
target_ids = torch.stack([x["labels"] for x in batch])
|
||||
pad_token_id = self.pad_token_id
|
||||
y = trim_batch(target_ids, pad_token_id)
|
||||
source_ids, source_mask = trim_batch(input_ids, pad_token_id, attention_mask=masks)
|
||||
batch = {
|
||||
"input_ids": source_ids,
|
||||
"attention_mask": source_mask,
|
||||
"labels": y,
|
||||
}
|
||||
return batch
|
||||
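A hedged usage sketch for `LegacySeq2SeqDataset` (the checkpoint and data directory are placeholders; the directory is expected to contain `train.source`/`train.target` line files):

```python
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")  # illustrative checkpoint
dataset = LegacySeq2SeqDataset(
    tokenizer,
    data_dir="path/to/data",   # expects train.source / train.target
    max_source_length=1024,
    max_target_length=56,
    type_path="train",
)
loader = DataLoader(dataset, batch_size=8, collate_fn=dataset.collate_fn)
batch = next(iter(loader))     # dict with input_ids, attention_mask, labels
```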
|
||||
|
||||
class Seq2SeqDataset(AbstractSeq2SeqDataset):
|
||||
"""A dataset that calls prepare_seq2seq_batch."""
|
||||
|
||||
def __getitem__(self, index) -> dict[str, str]:
|
||||
index = index + 1 # linecache starts at 1
|
||||
source_line = self.prefix + linecache.getline(str(self.src_file), index).rstrip("\n")
|
||||
tgt_line = linecache.getline(str(self.tgt_file), index).rstrip("\n")
|
||||
assert source_line, f"empty source line for index {index}"
|
||||
assert tgt_line, f"empty tgt line for index {index}"
|
||||
return {"tgt_texts": tgt_line, "src_texts": source_line, "id": index - 1}
|
||||
|
||||
def collate_fn(self, batch) -> dict[str, torch.Tensor]:
|
||||
"""Call prepare_seq2seq_batch."""
|
||||
batch_encoding: dict[str, torch.Tensor] = self.tokenizer.prepare_seq2seq_batch(
|
||||
[x["src_texts"] for x in batch],
|
||||
tgt_texts=[x["tgt_texts"] for x in batch],
|
||||
max_length=self.max_source_length,
|
||||
max_target_length=self.max_target_length,
|
||||
return_tensors="pt",
|
||||
**self.dataset_kwargs,
|
||||
).data
|
||||
batch_encoding["ids"] = torch.tensor([x["id"] for x in batch])
|
||||
return batch_encoding
|
||||
|
||||
|
||||
class Seq2SeqDataCollator:
|
||||
def __init__(self, tokenizer, data_args, decoder_start_token_id):
|
||||
self.tokenizer = tokenizer
|
||||
self.pad_token_id = tokenizer.pad_token_id
|
||||
self.decoder_start_token_id = decoder_start_token_id
|
||||
assert self.pad_token_id is not None, (
|
||||
f"pad_token_id is not defined for ({self.tokenizer.__class__.__name__}), it must be defined."
|
||||
)
|
||||
self.data_args = data_args
|
||||
self.dataset_kwargs = {"add_prefix_space": True} if isinstance(tokenizer, BartTokenizer) else {}
|
||||
if data_args.src_lang is not None:
|
||||
self.dataset_kwargs["src_lang"] = data_args.src_lang
|
||||
if data_args.tgt_lang is not None:
|
||||
self.dataset_kwargs["tgt_lang"] = data_args.tgt_lang
|
||||
|
||||
def __call__(self, batch) -> dict[str, torch.Tensor]:
|
||||
if hasattr(self.tokenizer, "prepare_seq2seq_batch"):
|
||||
batch = self._encode(batch)
|
||||
input_ids, attention_mask, labels = (
|
||||
batch["input_ids"],
|
||||
batch["attention_mask"],
|
||||
batch["labels"],
|
||||
)
|
||||
else:
|
||||
input_ids = torch.stack([x["input_ids"] for x in batch])
|
||||
attention_mask = torch.stack([x["attention_mask"] for x in batch])
|
||||
labels = torch.stack([x["labels"] for x in batch])
|
||||
|
||||
labels = trim_batch(labels, self.pad_token_id)
|
||||
input_ids, attention_mask = trim_batch(input_ids, self.pad_token_id, attention_mask=attention_mask)
|
||||
|
||||
if isinstance(self.tokenizer, T5Tokenizer):
|
||||
decoder_input_ids = self._shift_right_t5(labels)
|
||||
else:
|
||||
decoder_input_ids = shift_tokens_right(labels, self.pad_token_id, self.decoder_start_token_id)
|
||||
|
||||
batch = {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"decoder_input_ids": decoder_input_ids,
|
||||
"labels": labels,
|
||||
}
|
||||
return batch
|
||||
|
||||
def _shift_right_t5(self, input_ids):
|
||||
# shift inputs to the right
|
||||
shifted_input_ids = input_ids.new_zeros(input_ids.shape)
|
||||
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
|
||||
shifted_input_ids[..., 0] = self.pad_token_id
|
||||
return shifted_input_ids
|
||||
|
||||
def _encode(self, batch) -> dict[str, torch.Tensor]:
|
||||
batch_encoding = self.tokenizer.prepare_seq2seq_batch(
|
||||
[x["src_texts"] for x in batch],
|
||||
tgt_texts=[x["tgt_texts"] for x in batch],
|
||||
max_length=self.data_args.max_source_length,
|
||||
max_target_length=self.data_args.max_target_length,
|
||||
padding="longest",
|
||||
return_tensors="pt",
|
||||
**self.dataset_kwargs,
|
||||
)
|
||||
return batch_encoding.data
|
||||
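A sketch of wiring the collator by hand; the `SimpleNamespace` below merely stands in for the script's real `data_args` object, and the checkpoint is illustrative:

```python
from types import SimpleNamespace

from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")  # illustrative checkpoint
data_args = SimpleNamespace(max_source_length=1024, max_target_length=56,
                            src_lang=None, tgt_lang=None)
collator = Seq2SeqDataCollator(
    tokenizer, data_args,
    decoder_start_token_id=tokenizer.eos_token_id,  # or model.config.decoder_start_token_id
)

# loader = DataLoader(Seq2SeqDataset(tokenizer, "path/to/data", 1024, 56),
#                     batch_size=8, collate_fn=collator)
```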
|
||||
|
||||
class SortishSampler(Sampler):
|
||||
"Go through the text data by order of src length with a bit of randomness. From fastai repo."
|
||||
|
||||
def __init__(self, data, batch_size, shuffle=True):
|
||||
self.data, self.bs, self.shuffle = data, batch_size, shuffle
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.data)
|
||||
|
||||
def __iter__(self):
|
||||
return iter(sortish_sampler_indices(self.data, self.bs, shuffle=self.shuffle))
|
||||
|
||||
|
||||
def sortish_sampler_indices(data: list, bs: int, shuffle=True) -> np.array:
|
||||
"Go through the text data by order of src length with a bit of randomness. From fastai repo."
|
||||
if not shuffle:
|
||||
return np.argsort(np.array(data) * -1)
|
||||
|
||||
def key_fn(i):
|
||||
return data[i]
|
||||
|
||||
idxs = np.random.permutation(len(data))
|
||||
sz = bs * 50
|
||||
ck_idx = [idxs[i : i + sz] for i in range(0, len(idxs), sz)]
|
||||
sort_idx = np.concatenate([sorted(s, key=key_fn, reverse=True) for s in ck_idx])
|
||||
sz = bs
|
||||
ck_idx = [sort_idx[i : i + sz] for i in range(0, len(sort_idx), sz)]
|
||||
max_ck = np.argmax([key_fn(ck[0]) for ck in ck_idx]) # find the chunk with the largest key,
|
||||
ck_idx[0], ck_idx[max_ck] = ck_idx[max_ck], ck_idx[0] # then make sure it goes first.
|
||||
sort_idx = np.concatenate(np.random.permutation(ck_idx[1:])) if len(ck_idx) > 1 else np.array([], dtype=int)
|
||||
sort_idx = np.concatenate((ck_idx[0], sort_idx))
|
||||
return sort_idx
|
||||
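A brief sketch of plugging the sortish sampler into a `DataLoader`, assuming `dataset` is one of the `Seq2SeqDataset` variants defined above:

```python
from torch.utils.data import DataLoader

# Single-process case: indices come out in length-sorted chunks with some local shuffling.
sampler = dataset.make_sortish_sampler(batch_size=8, shuffle=True)
loader = DataLoader(dataset, batch_size=8, sampler=sampler, collate_fn=dataset.collate_fn)

# Multi-GPU case (requires torch.distributed to be initialized):
# sampler = dataset.make_sortish_sampler(batch_size=8, distributed=True)
```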
|
||||
|
||||
class DistributedSortishSampler(Sampler):
|
||||
"""Copied from torch DistributedSampler"""
|
||||
|
||||
def __init__(self, dataset, batch_size, num_replicas=None, rank=None, add_extra_examples=True, shuffle=True):
|
||||
if num_replicas is None:
|
||||
if not dist.is_available():
|
||||
raise RuntimeError("Requires distributed package to be available")
|
||||
num_replicas = dist.get_world_size()
|
||||
if rank is None:
|
||||
if not dist.is_available():
|
||||
raise RuntimeError("Requires distributed package to be available")
|
||||
rank = dist.get_rank()
|
||||
self.dataset = dataset
|
||||
self.num_replicas = num_replicas
|
||||
self.rank = rank
|
||||
self.epoch = 0
|
||||
if add_extra_examples:
|
||||
self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas))
|
||||
self.total_size = self.num_samples * self.num_replicas
|
||||
else:
|
||||
self.total_size = len(dataset)
|
||||
self.num_samples = len(self.available_indices)
|
||||
self.batch_size = batch_size
|
||||
self.add_extra_examples = add_extra_examples
|
||||
self.shuffle = shuffle
|
||||
|
||||
def __iter__(self) -> Iterable:
|
||||
g = torch.Generator()
|
||||
g.manual_seed(self.epoch)
|
||||
|
||||
sortish_data = [self.dataset.src_lens[i] for i in self.available_indices]
|
||||
sortish_indices = sortish_sampler_indices(sortish_data, self.batch_size, shuffle=self.shuffle)
|
||||
indices = [self.available_indices[i] for i in sortish_indices]
|
||||
assert len(indices) == self.num_samples
|
||||
return iter(indices)
|
||||
|
||||
@cached_property
|
||||
def available_indices(self) -> np.array:
|
||||
indices = list(range(len(self.dataset)))
|
||||
# add extra samples to make it evenly divisible
|
||||
indices += indices[: (self.total_size - len(indices))]
|
||||
assert len(indices) == self.total_size
|
||||
# subsample
|
||||
available_indices = indices[self.rank : self.total_size : self.num_replicas]
|
||||
return available_indices
|
||||
|
||||
def __len__(self):
|
||||
return self.num_samples
|
||||
|
||||
def set_epoch(self, epoch):
|
||||
self.epoch = epoch
|
||||
|
||||
|
||||
logger = getLogger(__name__)
|
||||
|
||||
|
||||
def use_task_specific_params(model, task):
|
||||
"""Update config with summarization specific params."""
|
||||
task_specific_params = model.config.task_specific_params
|
||||
|
||||
if task_specific_params is not None:
|
||||
pars = task_specific_params.get(task, {})
|
||||
logger.info(f"setting model.config to task specific params for {task}:\n {pars}")
|
||||
logger.info("note: command line args may override some of these")
|
||||
model.config.update(pars)
|
||||
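An illustrative call; `facebook/bart-large-cnn` is used here only because its config ships a `summarization` entry in `task_specific_params`:

```python
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-large-cnn")
# Copies e.g. num_beams / max_length / min_length from
# config.task_specific_params["summarization"] onto the top-level config.
use_task_specific_params(model, "summarization")
```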
|
||||
|
||||
def pickle_load(path):
|
||||
"""pickle.load(path)"""
|
||||
with open(path, "rb") as f:
|
||||
return pickle.load(f)
|
||||
|
||||
|
||||
def pickle_save(obj, path):
|
||||
"""pickle.dump(obj, path)"""
|
||||
with open(path, "wb") as f:
|
||||
return pickle.dump(obj, f)
|
||||
|
||||
|
||||
def flatten_list(summary_ids: list[list]):
|
||||
return list(itertools.chain.from_iterable(summary_ids))
|
||||
|
||||
|
||||
def save_git_info(folder_path: str) -> None:
|
||||
"""Save git information to output_dir/git_log.json"""
|
||||
repo_infos = get_git_info()
|
||||
save_json(repo_infos, os.path.join(folder_path, "git_log.json"))
|
||||
|
||||
|
||||
def save_json(content, path, indent=4, **json_dump_kwargs):
|
||||
with open(path, "w") as f:
|
||||
json.dump(content, f, indent=indent, sort_keys=True, **json_dump_kwargs)
|
||||
|
||||
|
||||
def load_json(path):
|
||||
with open(path) as f:
|
||||
return json.load(f)
|
||||
|
||||
|
||||
def get_git_info():
|
||||
try:
|
||||
repo = git.Repo(search_parent_directories=True)
|
||||
repo_infos = {
|
||||
"repo_id": str(repo),
|
||||
"repo_sha": str(repo.head.object.hexsha),
|
||||
"repo_branch": str(repo.active_branch),
|
||||
"hostname": str(socket.gethostname()),
|
||||
}
|
||||
return repo_infos
|
||||
except TypeError:
|
||||
return {
|
||||
"repo_id": None,
|
||||
"repo_sha": None,
|
||||
"repo_branch": None,
|
||||
"hostname": None,
|
||||
}
|
||||
|
||||
|
||||
ROUGE_KEYS = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
|
||||
|
||||
|
||||
def extract_rouge_mid_statistics(dct):
|
||||
new_dict = {}
|
||||
for k1, v1 in dct.items():
|
||||
mid = v1.mid
|
||||
new_dict[k1] = {stat: round(getattr(mid, stat), 4) for stat in ["precision", "recall", "fmeasure"]}
|
||||
return new_dict
|
||||
|
||||
|
||||
def calculate_rouge(
|
||||
pred_lns: list[str],
|
||||
tgt_lns: list[str],
|
||||
use_stemmer=True,
|
||||
rouge_keys=ROUGE_KEYS,
|
||||
return_precision_and_recall=False,
|
||||
bootstrap_aggregation=True,
|
||||
newline_sep=True,
|
||||
) -> dict:
|
||||
"""Calculate rouge using rouge_scorer package.
|
||||
|
||||
Args:
|
||||
pred_lns: list of summaries generated by model
|
||||
tgt_lns: list of groundtruth summaries (e.g. contents of val.target)
|
||||
use_stemmer: Bool indicating whether Porter stemmer should be used to
|
||||
strip word suffixes to improve matching.
|
||||
rouge_keys: which metrics to compute, defaults to rouge1, rouge2, rougeL, rougeLsum
|
||||
return_precision_and_recall: (False) whether to also return precision and recall.
|
||||
bootstrap_aggregation: whether to do the typical bootstrap resampling of scores. Defaults to True; if False,
|
||||
this function returns a ``collections.defaultdict`` mapping each metric to the list of per-observation scores.
|
||||
newline_sep: (default=True) whether to add a newline between sentences. This is essential for calculating rougeL
|
||||
on multi-sentence summaries (CNN/DM dataset).
|
||||
|
||||
Returns:
|
||||
dict[score: value] if aggregate else defaultdict(list) keyed by rouge_keys
|
||||
|
||||
"""
|
||||
scorer = rouge_scorer.RougeScorer(rouge_keys, use_stemmer=use_stemmer)
|
||||
aggregator = scoring.BootstrapAggregator()
|
||||
for pred, tgt in zip(tgt_lns, pred_lns):
|
||||
# rougeLsum expects "\n" separated sentences within a summary
|
||||
if newline_sep:
|
||||
pred = add_newline_to_end_of_each_sentence(pred)
|
||||
tgt = add_newline_to_end_of_each_sentence(tgt)
|
||||
scores = scorer.score(pred, tgt)
|
||||
aggregator.add_scores(scores)
|
||||
|
||||
if bootstrap_aggregation:
|
||||
result = aggregator.aggregate()
|
||||
if return_precision_and_recall:
|
||||
return extract_rouge_mid_statistics(result) # here we return dict
|
||||
else:
|
||||
return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}
|
||||
|
||||
else:
|
||||
return aggregator._scores # here we return defaultdict(list)
|
||||
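A minimal illustrative call with made-up sentences:

```python
preds = ["The cat sat on the mat."]
refs = ["A cat was sitting on the mat."]

scores = calculate_rouge(preds, refs)
# By default: bootstrap-aggregated F-measures scaled to 0-100, e.g.
# {"rouge1": ..., "rouge2": ..., "rougeL": ..., "rougeLsum": ...}
print(scores)
```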
|
||||
|
||||
# Utilities for freezing parameters and checking whether they are frozen
|
||||
|
||||
|
||||
def freeze_params(model: nn.Module):
|
||||
"""Set requires_grad=False for each of model.parameters()"""
|
||||
for par in model.parameters():
|
||||
par.requires_grad = False
|
||||
|
||||
|
||||
def freeze_embeds(model):
|
||||
"""Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
|
||||
model_type = model.config.model_type
|
||||
|
||||
if model_type in ["t5", "mt5"]:
|
||||
freeze_params(model.shared)
|
||||
for d in [model.encoder, model.decoder]:
|
||||
freeze_params(d.embed_tokens)
|
||||
elif model_type == "fsmt":
|
||||
for d in [model.model.encoder, model.model.decoder]:
|
||||
freeze_params(d.embed_positions)
|
||||
freeze_params(d.embed_tokens)
|
||||
else:
|
||||
freeze_params(model.model.shared)
|
||||
for d in [model.model.encoder, model.model.decoder]:
|
||||
freeze_params(d.embed_positions)
|
||||
freeze_params(d.embed_tokens)
|
||||
|
||||
|
||||
def grad_status(model: nn.Module) -> Iterable:
|
||||
return (par.requires_grad for par in model.parameters())
|
||||
|
||||
|
||||
def any_requires_grad(model: nn.Module) -> bool:
|
||||
return any(grad_status(model))
|
||||
|
||||
|
||||
def assert_all_frozen(model):
|
||||
model_grads: list[bool] = list(grad_status(model))
|
||||
n_require_grad = sum(lmap(int, model_grads))
|
||||
npars = len(model_grads)
|
||||
assert not any(model_grads), f"{n_require_grad / npars:.1%} of {npars} weights require grad"
|
||||
|
||||
|
||||
def assert_not_all_frozen(model):
|
||||
model_grads: list[bool] = list(grad_status(model))
|
||||
npars = len(model_grads)
|
||||
assert any(model_grads), f"none of {npars} weights require grad"
|
||||
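A short sketch of the intended freezing workflow; the tiny checkpoint is illustrative only:

```python
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/bart-tiny-random")  # illustrative tiny checkpoint

freeze_embeds(model)          # token + positional embeddings stop receiving gradients
assert_not_all_frozen(model)  # sanity check: the rest of the network still trains

freeze_params(model.model.encoder)      # optionally freeze the whole encoder as well
assert_all_frozen(model.model.encoder)
```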
|
||||
|
||||
def parse_numeric_n_bool_cl_kwargs(unparsed_args: list[str]) -> dict[str, Union[int, float, bool]]:
|
||||
"""
|
||||
Parse an argv list of unspecified command line args to a dict.
|
||||
Assumes all values are either numeric or boolean in the form of true/false.
|
||||
"""
|
||||
result = {}
|
||||
assert len(unparsed_args) % 2 == 0, f"got odd number of unparsed args: {unparsed_args}"
|
||||
num_pairs = len(unparsed_args) // 2
|
||||
for pair_num in range(num_pairs):
|
||||
i = 2 * pair_num
|
||||
assert unparsed_args[i].startswith("--")
|
||||
if unparsed_args[i + 1].lower() == "true":
|
||||
value = True
|
||||
elif unparsed_args[i + 1].lower() == "false":
|
||||
value = False
|
||||
else:
|
||||
try:
|
||||
value = int(unparsed_args[i + 1])
|
||||
except ValueError:
|
||||
value = float(unparsed_args[i + 1]) # this can raise another informative ValueError
|
||||
|
||||
result[unparsed_args[i][2:]] = value
|
||||
return result
|
||||
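For example, leftover generation flags from `argparse.parse_known_args()` can be turned into kwargs (values made up):

```python
leftover = ["--num_beams", "4", "--length_penalty", "0.8", "--early_stopping", "true"]
print(parse_numeric_n_bool_cl_kwargs(leftover))
# {'num_beams': 4, 'length_penalty': 0.8, 'early_stopping': True}
```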
|
||||
|
||||
def write_txt_file(ordered_tgt, path):
|
||||
f = Path(path).open("w")
|
||||
for ln in ordered_tgt:
|
||||
f.write(ln + "\n")
|
||||
f.flush()
|
||||
|
||||
|
||||
def chunks(lst, n):
|
||||
"""Yield successive n-sized chunks from lst."""
|
||||
for i in range(0, len(lst), n):
|
||||
yield lst[i : i + n]
|
||||
@@ -1,82 +0,0 @@
|
||||
# Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
A simple launcher script for TPU training
|
||||
|
||||
Inspired by https://github.com/pytorch/pytorch/blob/master/torch/distributed/launch.py
|
||||
|
||||
::
|
||||
>>> python xla_spawn.py --num_cores=NUM_CORES_YOU_HAVE
|
||||
YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
|
||||
arguments of your training script)
|
||||
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import sys
|
||||
from argparse import REMAINDER, ArgumentParser
|
||||
from pathlib import Path
|
||||
|
||||
import torch_xla.distributed.xla_multiprocessing as xmp
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""
|
||||
Helper function parsing the command line options
|
||||
@retval ArgumentParser
|
||||
"""
|
||||
parser = ArgumentParser(
|
||||
description=(
|
||||
"PyTorch TPU distributed training launch helper utility that will spawn up multiple distributed processes"
|
||||
)
|
||||
)
|
||||
|
||||
# Optional arguments for the launch helper
|
||||
parser.add_argument("--num_cores", type=int, default=1, help="Number of TPU cores to use (1 or 8).")
|
||||
|
||||
# positional
|
||||
parser.add_argument(
|
||||
"training_script",
|
||||
type=str,
|
||||
help=(
|
||||
"The full path to the single TPU training "
|
||||
"program/script to be launched in parallel, "
|
||||
"followed by all the arguments for the "
|
||||
"training script"
|
||||
),
|
||||
)
|
||||
|
||||
# rest from the training program
|
||||
parser.add_argument("training_script_args", nargs=REMAINDER)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# Import training_script as a module.
|
||||
script_fpath = Path(args.training_script)
|
||||
sys.path.append(str(script_fpath.parent.resolve()))
|
||||
mod_name = script_fpath.stem
|
||||
mod = importlib.import_module(mod_name)
|
||||
|
||||
# Patch sys.argv
|
||||
sys.argv = [args.training_script] + args.training_script_args
|
||||
|
||||
xmp.spawn(mod._mp_fn, args=())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,294 +0,0 @@
|
||||
## Token classification
|
||||
|
||||
Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/main/examples/legacy/token-classification/run_ner.py).
|
||||
|
||||
The following examples are covered in this section:
|
||||
|
||||
* NER on the GermEval 2014 (German NER) dataset
|
||||
* Emerging and Rare Entities task: WNUT’17 (English NER) dataset
|
||||
|
||||
Details and results for the fine-tuning were provided by @stefan-it.
|
||||
|
||||
### GermEval 2014 (German NER) dataset
|
||||
|
||||
#### Data (Download and pre-processing steps)
|
||||
|
||||
Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
|
||||
|
||||
Here are the commands for downloading and pre-processing the train, dev and test datasets. The original data format has four (tab-separated) columns; in a pre-processing step, only the two relevant columns (token and outer-span NER annotation) are extracted:
|
||||
|
||||
```bash
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
```
|
||||
|
||||
The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`.
|
||||
One problem with these tokens is that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s.
|
||||
The `preprocess.py` script located in the `scripts` folder a) filters these tokens and b) splits longer sentences into smaller ones (once the maximum subtoken length is reached).
|
||||
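The tokenizer behaviour described above can be verified with a few lines of Python (a quick sanity check, not part of the original pipeline; exact behaviour may vary slightly across tokenizer versions):

```python
from transformers import BertTokenizer

tok = BertTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
print(tok.tokenize("\u200e"))  # -> []  (the token vanishes, misaligning tokens and labels)
```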
|
||||
Let's define some variables that we need for further pre-processing steps and training the model:
|
||||
|
||||
```bash
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=google-bert/bert-base-multilingual-cased
|
||||
```
|
||||
|
||||
Run the pre-processing script on training, dev and test datasets:
|
||||
|
||||
```bash
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
```
|
||||
|
||||
The GermEval 2014 dataset has many more labels than the CoNLL-2002/2003 datasets, so a custom set of labels must be used:
|
||||
|
||||
```bash
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
```
|
||||
|
||||
#### Prepare the run
|
||||
|
||||
Additional environment variables must be set:
|
||||
|
||||
```bash
|
||||
export OUTPUT_DIR=germeval-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
```
|
||||
|
||||
#### Run the PyTorch version
|
||||
|
||||
To start training, just run:
|
||||
|
||||
```bash
|
||||
python3 run_ner.py --data_dir ./ \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_device_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
```
|
||||
|
||||
If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
|
||||
|
||||
#### JSON-based configuration file
|
||||
|
||||
Instead of passing all parameters via command-line arguments, the `run_ner.py` script also supports reading parameters from a JSON-based configuration file:
|
||||
|
||||
```json
|
||||
{
|
||||
"data_dir": ".",
|
||||
"labels": "./labels.txt",
|
||||
"model_name_or_path": "google-bert/bert-base-multilingual-cased",
|
||||
"output_dir": "germeval-model",
|
||||
"max_seq_length": 128,
|
||||
"num_train_epochs": 3,
|
||||
"per_device_train_batch_size": 32,
|
||||
"save_steps": 750,
|
||||
"seed": 1,
|
||||
"do_train": true,
|
||||
"do_eval": true,
|
||||
"do_predict": true
|
||||
}
|
||||
```
|
||||
|
||||
It must be saved with a `.json` extension and can be used by running `python3 run_ner.py config.json`.
|
||||
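Internally, such a file is consumed via `HfArgumentParser` roughly as in the sketch below; `ModelArguments` and `DataTrainingArguments` stand for the dataclasses defined in `run_ner.py` and are shown only for illustration:

```python
import sys

from transformers import HfArgumentParser, TrainingArguments

parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
model_args, data_args, training_args = parser.parse_json_file(json_file=sys.argv[1])
```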
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on the development dataset outputs the following for our example:
|
||||
|
||||
```bash
|
||||
10/04/2019 00:42:06 - INFO - __main__ - ***** Eval results *****
|
||||
10/04/2019 00:42:06 - INFO - __main__ - f1 = 0.8623348017621146
|
||||
10/04/2019 00:42:06 - INFO - __main__ - loss = 0.07183869666975543
|
||||
10/04/2019 00:42:06 - INFO - __main__ - precision = 0.8467916366258111
|
||||
10/04/2019 00:42:06 - INFO - __main__ - recall = 0.8784592370979806
|
||||
```
|
||||
|
||||
On the test dataset, the following results were achieved:
|
||||
|
||||
```bash
|
||||
10/04/2019 00:42:42 - INFO - __main__ - ***** Eval results *****
|
||||
10/04/2019 00:42:42 - INFO - __main__ - f1 = 0.8614389652384803
|
||||
10/04/2019 00:42:42 - INFO - __main__ - loss = 0.07064602487454782
|
||||
10/04/2019 00:42:42 - INFO - __main__ - precision = 0.8604651162790697
|
||||
10/04/2019 00:42:42 - INFO - __main__ - recall = 0.8624150210424085
|
||||
```
|
||||
|
||||
#### Run the TensorFlow 2 version
|
||||
|
||||
To start training, just run:
|
||||
|
||||
```bash
|
||||
python3 run_tf_ner.py --data_dir ./ \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_device_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
```
|
||||
|
||||
As with the PyTorch version, if your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
|
||||
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on the development dataset outputs the following for our example:
|
||||
```bash
|
||||
precision recall f1-score support
|
||||
|
||||
LOCderiv 0.7619 0.6154 0.6809 52
|
||||
PERpart 0.8724 0.8997 0.8858 4057
|
||||
OTHpart 0.9360 0.9466 0.9413 711
|
||||
ORGpart 0.7015 0.6989 0.7002 269
|
||||
LOCpart 0.7668 0.8488 0.8057 496
|
||||
LOC 0.8745 0.9191 0.8963 235
|
||||
ORGderiv 0.7723 0.8571 0.8125 91
|
||||
OTHderiv 0.4800 0.6667 0.5581 18
|
||||
OTH 0.5789 0.6875 0.6286 16
|
||||
PERderiv 0.5385 0.3889 0.4516 18
|
||||
PER 0.5000 0.5000 0.5000 2
|
||||
ORG 0.0000 0.0000 0.0000 3
|
||||
|
||||
micro avg 0.8574 0.8862 0.8715 5968
|
||||
macro avg 0.8575 0.8862 0.8713 5968
|
||||
```
|
||||
|
||||
On the test dataset, the following results were achieved:
|
||||
```bash
|
||||
precision recall f1-score support
|
||||
|
||||
PERpart 0.8847 0.8944 0.8896 9397
|
||||
OTHpart 0.9376 0.9353 0.9365 1639
|
||||
ORGpart 0.7307 0.7044 0.7173 697
|
||||
LOC 0.9133 0.9394 0.9262 561
|
||||
LOCpart 0.8058 0.8157 0.8107 1150
|
||||
ORG 0.0000 0.0000 0.0000 8
|
||||
OTHderiv 0.5882 0.4762 0.5263 42
|
||||
PERderiv 0.6571 0.5227 0.5823 44
|
||||
OTH 0.4906 0.6667 0.5652 39
|
||||
ORGderiv 0.7016 0.7791 0.7383 172
|
||||
LOCderiv 0.8256 0.6514 0.7282 109
|
||||
PER 0.0000 0.0000 0.0000 11
|
||||
|
||||
micro avg 0.8722 0.8774 0.8748 13869
|
||||
macro avg 0.8712 0.8774 0.8740 13869
|
||||
```
|
||||
|
||||
### Emerging and Rare Entities task: WNUT’17 (English NER) dataset
|
||||
|
||||
Description of the WNUT’17 task from the [shared task website](http://noisy-text.github.io/2017/index.html):
|
||||
|
||||
> The WNUT’17 shared task focuses on identifying unusual, previously-unseen entities in the context of emerging discussions.
|
||||
> Named entities form the basis of many modern approaches to other tasks (like event clustering and summarization), but recall on
|
||||
> them is a real problem in noisy text - even among annotators. This drop tends to be due to novel entities and surface forms.
|
||||
|
||||
Six labels are available in the dataset. An overview can be found on this [page](http://noisy-text.github.io/2017/files/).
|
||||
|
||||
#### Data (Download and pre-processing steps)
|
||||
|
||||
The dataset can be downloaded from the [official GitHub](https://github.com/leondz/emerging_entities_17) repository.
|
||||
|
||||
The following commands show how to prepare the dataset for fine-tuning:
|
||||
|
||||
```bash
|
||||
mkdir -p data_wnut_17
|
||||
|
||||
curl -L 'https://github.com/leondz/emerging_entities_17/raw/master/wnut17train.conll' | tr '\t' ' ' > data_wnut_17/train.txt.tmp
|
||||
curl -L 'https://github.com/leondz/emerging_entities_17/raw/master/emerging.dev.conll' | tr '\t' ' ' > data_wnut_17/dev.txt.tmp
|
||||
curl -L 'https://raw.githubusercontent.com/leondz/emerging_entities_17/master/emerging.test.annotated' | tr '\t' ' ' > data_wnut_17/test.txt.tmp
|
||||
```
|
||||
|
||||
Let's define some variables that we need for further pre-processing steps:
|
||||
|
||||
```bash
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=google-bert/bert-large-cased
|
||||
```
|
||||
|
||||
Here we use the English BERT large model for fine-tuning.
|
||||
The `preprocess.py` script splits longer sentences into smaller ones (once the maximum subtoken length is reached):
|
||||
|
||||
```bash
|
||||
python3 scripts/preprocess.py data_wnut_17/train.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/train.txt
|
||||
python3 scripts/preprocess.py data_wnut_17/dev.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/dev.txt
|
||||
python3 scripts/preprocess.py data_wnut_17/test.txt.tmp $BERT_MODEL $MAX_LENGTH > data_wnut_17/test.txt
|
||||
```
|
||||
|
||||
In the last pre-processing step, the `labels.txt` file needs to be generated. This file contains all available labels:
|
||||
|
||||
```bash
|
||||
cat data_wnut_17/train.txt data_wnut_17/dev.txt data_wnut_17/test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > data_wnut_17/labels.txt
|
||||
```
|
||||
|
||||
#### Run the PyTorch version
|
||||
|
||||
Fine-tuning with the PyTorch version can be started using the `run_ner.py` script. In this example we use a JSON-based configuration file.
|
||||
|
||||
This configuration file looks like:
|
||||
|
||||
```json
|
||||
{
|
||||
"data_dir": "./data_wnut_17",
|
||||
"labels": "./data_wnut_17/labels.txt",
|
||||
"model_name_or_path": "google-bert/bert-large-cased",
|
||||
"output_dir": "wnut-17-model-1",
|
||||
"max_seq_length": 128,
|
||||
"num_train_epochs": 3,
|
||||
"per_device_train_batch_size": 32,
|
||||
"save_steps": 425,
|
||||
"seed": 1,
|
||||
"do_train": true,
|
||||
"do_eval": true,
|
||||
"do_predict": true,
|
||||
"fp16": false
|
||||
}
|
||||
```
|
||||
|
||||
If your GPU supports half-precision training, please set `fp16` to `true`.
|
||||
|
||||
Save this JSON-based configuration under `wnut_17.json`. The fine-tuning can be started with `python3 run_ner_old.py wnut_17.json`.
|
||||
|
||||
#### Evaluation
|
||||
|
||||
Evaluation on the development dataset outputs the following:
|
||||
|
||||
```bash
|
||||
05/29/2020 23:33:44 - INFO - __main__ - ***** Eval results *****
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_loss = 0.26505235286212275
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_precision = 0.7008264462809918
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_recall = 0.507177033492823
|
||||
05/29/2020 23:33:44 - INFO - __main__ - eval_f1 = 0.5884802220680084
|
||||
05/29/2020 23:33:44 - INFO - __main__ - epoch = 3.0
|
||||
```
|
||||
|
||||
On the test dataset, the following results were achieved:
|
||||
|
||||
```bash
|
||||
05/29/2020 23:33:44 - INFO - transformers.trainer - ***** Running Prediction *****
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_loss = 0.30948806500973547
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_precision = 0.5840108401084011
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_recall = 0.3994439295644115
|
||||
05/29/2020 23:34:02 - INFO - __main__ - eval_f1 = 0.47440836543753434
|
||||
```
|
||||
|
||||
WNUT’17 is a very difficult task. Current state-of-the-art results on this dataset can be found [here](https://nlpprogress.com/english/named_entity_recognition.html).
|
||||
@@ -1,36 +0,0 @@
|
||||
## The relevant files are currently on a shared Google
|
||||
## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
|
||||
## Monitor for changes and eventually migrate to use the `datasets` library
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
|
||||
curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
|
||||
| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
|
||||
|
||||
export MAX_LENGTH=128
|
||||
export BERT_MODEL=bert-base-multilingual-cased
|
||||
python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
|
||||
python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
|
||||
python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
|
||||
cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
|
||||
export OUTPUT_DIR=germeval-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
python3 run_ner.py \
|
||||
--task_type NER \
|
||||
--data_dir . \
|
||||
--labels ./labels.txt \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
@@ -1,37 +0,0 @@
|
||||
if ! [ -f ./dev.txt ]; then
|
||||
echo "Downloading CONLL2003 dev dataset...."
|
||||
curl -L -o ./dev.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/valid.txt'
|
||||
fi
|
||||
|
||||
if ! [ -f ./test.txt ]; then
|
||||
echo "Downloading CONLL2003 test dataset...."
|
||||
curl -L -o ./test.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/test.txt'
|
||||
fi
|
||||
|
||||
if ! [ -f ./train.txt ]; then
|
||||
echo "Downloading CONLL2003 train dataset...."
|
||||
curl -L -o ./train.txt 'https://github.com/davidsbatista/NER-datasets/raw/master/CONLL2003/train.txt'
|
||||
fi
|
||||
|
||||
export MAX_LENGTH=200
|
||||
export BERT_MODEL=bert-base-uncased
|
||||
export OUTPUT_DIR=chunker-model
|
||||
export BATCH_SIZE=32
|
||||
export NUM_EPOCHS=3
|
||||
export SAVE_STEPS=750
|
||||
export SEED=1
|
||||
|
||||
python3 run_ner.py \
|
||||
--task_type Chunk \
|
||||
--data_dir . \
|
||||
--model_name_or_path $BERT_MODEL \
|
||||
--output_dir $OUTPUT_DIR \
|
||||
--max_seq_length $MAX_LENGTH \
|
||||
--num_train_epochs $NUM_EPOCHS \
|
||||
--per_gpu_train_batch_size $BATCH_SIZE \
|
||||
--save_steps $SAVE_STEPS \
|
||||
--seed $SEED \
|
||||
--do_train \
|
||||
--do_eval \
|
||||
--do_predict
|
||||
|
||||