update test function source

remove additional tiying after rebase
fix
2025-11-19 17:54:41 +08:00 · 2025-11-18 22:04:28 +01:00 · 2025-11-18 18:55:53 +01:00 · 2025-11-18 18:52:39 +01:00 · 2025-11-18 18:52:39 +01:00 · 2025-11-18 18:52:39 +01:00
1397 changed files with 25969 additions and 74749 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -46,8 +46,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt
-            - run: python utils/tests_fetcher.py --filter_tests
+            - run: python utils/tests_fetcher.py | tee tests_fetched_summary.txt || true
+            - run: python utils/tests_fetcher.py --filter_tests || true
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
@ -98,8 +98,8 @@ jobs:
            - run: uv pip install -U -e .
            - run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
            - run: mkdir -p test_preparation
-            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt
-            - run: python utils/tests_fetcher.py --filter_tests
+            - run: python utils/tests_fetcher.py --fetch_all | tee tests_fetched_summary.txt || true
+            - run: python utils/tests_fetcher.py --filter_tests || true
            - run: export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)" && echo $GIT_COMMIT_MESSAGE && python .circleci/create_circleci_config.py --fetcher_folder test_preparation
            - run: |
                if [ ! -s test_preparation/generated_config.yml ]; then
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -32,16 +32,15 @@ jobs:
      options: --gpus all --privileged --ipc host
    steps:
      - name: Get repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
        with:
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          fetch-depth: 1

      - name: Install benchmark script dependencies
        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"

      - name: Run benchmark
        run: |
@ -52,7 +51,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
+          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --level 2 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -78,7 +78,7 @@ jobs:
        with:
          context: ./docker/transformers-all-latest-gpu
          build-args: |
-            REF=update_dockerfile
+            REF=main
            PYTORCH=2.8.0
            TORCHCODEC=0.7.0
            FLASH_ATTN=yes
@ -97,7 +97,7 @@ jobs:
  latest-torch-deepspeed-docker:
    name: "Latest PyTorch + DeepSpeed"
    runs-on:
-      group: aws-g4dn-2xlarge-cache
+      group: aws-general-8-plus
    steps:
      -
        name: Set up Docker Buildx
@ -200,6 +200,37 @@ jobs:
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

+  cache-latest-pytorch-amd:
+    name: "Cache Latest Pytorch (AMD) Image"
+    needs: latest-pytorch-amd
+    runs-on:
+      group: amd-mi325-1gpu
+    steps:
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+        
+      - 
+        name: Pull and save docker image to cache
+        run: |
+          image="huggingface/transformers-pytorch-amd-gpu"
+          final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar"
+          tmp_path="${final_path}.tmp"
+
+          echo "Pulling image: ${image}"
+          docker pull "${image}"
+
+          echo "Saving to temp file: ${tmp_path}"
+          docker save "${image}" -o "${tmp_path}"
+
+          echo "Moving to final path: ${final_path}"
+          mv -f "${tmp_path}" "${final_path}"
+
+          echo "Cache populated successfully at ${final_path}"
+
  latest-pytorch-deepspeed-amd:
    name: "PyTorch + DeepSpeed (AMD) [dev]"
    runs-on:
--- a/.github/workflows/check-workflow-permissions.yml
+++ b/.github/workflows/check-workflow-permissions.yml
@ -0,0 +1,23 @@
+---
+name: Check Permissions Advisor
+
+on:
+  workflow_dispatch:
+    inputs:
+      workflow_name:
+        description: 'Workflow file name'
+        type: string
+      run_count:
+        description: 'Number of runs to analyze'
+        type: string
+        default: "10"
+
+jobs:
+  advisor:
+    uses: huggingface/security-workflows/.github/workflows/permissions-advisor-reusable.yml@main
+    permissions:
+      actions: read
+      contents: read
+    with:
+      workflow_name: ${{ inputs.workflow_name }}
+      run_count: ${{ fromJSON(inputs.run_count) }}
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -6,9 +6,6 @@ on:
      docker:
        required: true
        type: string
-      start_sha:
-        required: true
-        type: string
      job:
        required: true
        type: string
@ -24,7 +21,13 @@ on:
      commit_sha:
        required: false
        type: string
-
+      pr_number:
+        required: false
+        type: string
+    outputs:
+      report:
+        description: "Content of the report of new failures"
+        value: ${{ jobs.process_new_failures_with_commit_info.outputs.report }}

 env:
  HF_HOME: /mnt/cache
@ -61,13 +64,15 @@ jobs:
      - name: Check file
        id: check_file
        working-directory: /transformers
+        env:
+          job: ${{ inputs.job }}
        run: |
-          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
-            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
+          if [ -f "ci_results_${job}/new_failures.json" ]; then
+            echo "\`ci_results_${job}/new_failures.json\` exists, continue ..."
            echo "process=true" >> $GITHUB_ENV
            echo "process=true" >> $GITHUB_OUTPUT
          else
-            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
+            echo "\`ci_results_${job}/new_failures.json\` doesn't exist, abort."
            echo "process=false" >> $GITHUB_ENV
            echo "process=false" >> $GITHUB_OUTPUT
          fi
@ -88,27 +93,62 @@ jobs:
            echo "PREV_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
          fi

-          if [ -f setup_values/other_workflow_run_id.txt ]; then
-            echo "OTHER_WORKFLOW_RUN_ID=$(cat setup_values/other_workflow_run_id.txt)" >> $GITHUB_ENV
-          else
-            echo "OTHER_WORKFLOW_RUN_ID=" >> $GITHUB_ENV
-          fi
-
      - name: Update clone
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: |
+          git fetch origin "$commit_sha" && git checkout "$commit_sha"

-      - name: Get target commit
+      - name: Get `START_SHA`
        working-directory: /transformers/utils
        if: ${{ env.process == 'true' }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
-          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
+          echo "START_SHA=$commit_sha" >> $GITHUB_ENV

-      - name: Checkout to `start_sha`
-        working-directory: /transformers
-        if: ${{ env.process == 'true' }}
-        run: git fetch && git checkout ${{ inputs.start_sha }}
+      # This is used if the CI is triggered from a pull request `self-comment-ci.yml` (after security check is verified)
+      - name: Extract the base commit on `main` (of the merge commit created by Github) if it is a PR
+        id: pr_info
+        if: ${{ env.process == 'true' && inputs.pr_number != '' }}
+        uses: actions/github-script@v6
+        with:
+          script: |            
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: ${{ inputs.pr_number }}
+            });
+
+            const { data: merge_commit }  = await github.rest.repos.getCommit({
+              owner: pr.base.repo.owner.login,
+              repo: pr.base.repo.name,
+              ref: '${{ inputs.commit_sha }}',
+            });
+
+            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
+
+      # Usually, `END_SHA` should be the commit of the last previous workflow run of the **SAME** (scheduled) workflow.
+      # (This is why we don't need to specify `workflow_id` which would be fetched automatically in the python script.)
+      - name: Get `END_SHA` from previous CI runs of the same workflow
+        working-directory: /transformers/utils
+        if: ${{ env.process == 'true' && inputs.pr_number == '' }}
+        env:
+          ACCESS_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+        run: |
+          echo "END_SHA=$(TOKEN="$ACCESS_TOKEN" python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"], workflow_run_id=os.environ["PREV_WORKFLOW_RUN_ID"]); print(commit)')" >> $GITHUB_ENV
+
+      # However, for workflow runs triggered by `issue_comment` (for pull requests), we want to check against the
+      # parent commit (on `main`) of the `merge_commit` (dynamically created by GitHub). In this case, the goal is to
+      # see if a reported failing test is actually ONLY failing on the `merge_commit`.
+      - name: Set `END_SHA`
+        if: ${{ env.process == 'true' && inputs.pr_number != '' }}
+        env:
+          merge_commit_base_sha: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
+        run: |
+          echo "END_SHA=$merge_commit_base_sha" >> $GITHUB_ENV

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -138,14 +178,20 @@ jobs:
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+        env:
+          job: ${{ inputs.job }}
+          run_idx: ${{ matrix.run_idx }}
+        run: python3 utils/check_bad_commit.py --start_commit "$START_SHA" --end_commit "$END_SHA" --file "ci_results_${job}/new_failures.json" --output_file "new_failures_with_bad_commit_${job}_${run_idx}.json"

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
+        env:
+          job: ${{ inputs.job }}
+          run_idx: ${{ matrix.run_idx }}
        run: |
-          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
-          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+          ls -l "new_failures_with_bad_commit_${job}_${run_idx}.json"
+          cat "new_failures_with_bad_commit_${job}_${run_idx}.json"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@ -159,6 +205,8 @@ jobs:
    if: needs.check_new_failures.outputs.process == 'true'
    runs-on:
      group: aws-g5-4xlarge-cache
+    outputs:
+      report: ${{ steps.set_output.outputs.report }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -176,32 +224,28 @@ jobs:

      - name: Check files
        working-directory: /transformers
+        env:
+          job: ${{ inputs.job }}
        run: |
          ls -la /transformers
-          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
+          ls -la "/transformers/new_failures_with_bad_commit_${job}"

      # Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
      # to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
      - name: Merge files
        shell: bash
        working-directory: /transformers
+        env:
+          job: ${{ inputs.job }}
        run: |
-          cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
+          cp "/transformers/new_failures_with_bad_commit_${job}/new_failures_with_bad_commit_${job}_1.json" new_failures_with_bad_commit.json

      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
-
-      - name: Process report
-        shell: bash
        working-directory: /transformers
        env:
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
-          JOB_NAME: ${{ inputs.job }}
-          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
-          python3 utils/process_bad_commit_report.py
+          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      - name: Process report
        shell: bash
@ -218,11 +262,37 @@ jobs:
            echo EOF
          } >> "$GITHUB_ENV"

-      - name: Prepare Slack report title
+      # The output is useful if a caller needs more processing, for example, we have a chain
+      # self-comment-ci.yml -> self-scheduled.yml -> this one (check_failed_tests.yml),
+      # and `self-comment-ci.yml` needs further processing before sending a GitHub comment to the pull request page.
+      - name: Show results & Set outputs
+        id: set_output
        working-directory: /transformers
+        run: |
+          ls -l new_failures_with_bad_commit.json
+          cat new_failures_with_bad_commit.json
+
+          {
+            echo 'report<<EOF'
+            cat new_failures_with_bad_commit.json
+            echo ''  # Force a newline
+            echo EOF
+          } >> "$GITHUB_OUTPUT"
+
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v4
+        with:
+          name: new_failures_with_bad_commit_${{ inputs.job }}
+          path: /transformers/new_failures_with_bad_commit.json
+
+      - name: Prepare Slack report title
+        working-directory: /transformers
+        env:
+          ci_event: ${{ inputs.ci_event }}
+          job: ${{ inputs.job }}
        run: |
          pip install slack_sdk
-          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV
+          echo "title=$(python3 -c 'import sys; import os; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = os.environ["ci_event"]; job = os.environ["job"]; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

      - name: Send processed report
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@ -0,0 +1,22 @@
+---
+name: CodeQL Security Analysis
+
+on:
+  push:
+    branches: ["main", "fix_security_issue_*"]
+  # pull_request:
+  #   branches: ["main"]
+  workflow_dispatch:
+
+jobs:
+  codeql:
+    name: CodeQL Analysis
+    uses: huggingface/security-workflows/.github/workflows/codeql-reusable.yml@main
+    permissions:
+      security-events: write
+      packages: read
+      actions: read
+      contents: read
+    with:
+      languages: '["actions"]'
+      queries: 'security-extended,security-and-quality'
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -39,6 +39,9 @@ on:
      PR_MERGE_COMMIT_SHA:
        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
+      PR_MERGE_COMMIT_BASE_SHA:
+        description: "The sha of the parent commit of the the merge commit on the target branch in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_BASE_SHA }}
      PR_HEAD_COMMIT_DATE:
        description: "The date of the head sha of the pull request branch in the head repository"
        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
@ -74,6 +77,7 @@ jobs:
      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
+      PR_MERGE_COMMIT_BASE_SHA: ${{ steps.pr_info.outputs.merge_commit_base_sha }}
      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
@ -122,6 +126,7 @@ jobs:
            core.setOutput('base_ref', pr.base.ref);
            core.setOutput('head_sha', pr.head.sha);
            core.setOutput('base_sha', pr.base.sha);
+            core.setOutput('merge_commit_base_sha', merge_commit.parents[0].sha);
            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
            core.setOutput('pr', pr);

@ -142,16 +147,21 @@ jobs:
              date: merge_commit.commit.committer.date
            });

+            console.log('PR Info:', {
+              pr_info: pr
+            });
+
      - name: Convert dates to timestamps
        id: get_timestamps
+        env:
+          head_commit_date: ${{ steps.pr_info.outputs.head_commit_date }}
+          merge_commit_date: ${{ steps.pr_info.outputs.merge_commit_date }}
        run: |
-          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
-          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
-          echo $head_commit_date
-          echo $merge_commit_date
+          echo "$head_commit_date"
+          echo "$merge_commit_date"
          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
-          echo $head_commit_timestamp
-          echo $merge_commit_timestamp
+          echo "$head_commit_timestamp"
+          echo "$merge_commit_timestamp"
          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
-          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
+          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -15,13 +15,19 @@ jobs:
    steps:
      - name: Get PR number
        shell: bash
+        env:
+          issue_number: ${{ github.event.issue.number }}
+          is_pull_request_issue: ${{ github.event.issue.pull_request != null }}
+          pr_number: ${{ github.event.pull_request.number }}
+          is_pull_request: ${{ github.event.pull_request != null }}
+          event_number: ${{ github.event.number }}
        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
+          if [[ "$issue_number" != "" && "$is_pull_request_issue" == "true" ]]; then
+            echo "PR_NUMBER=$issue_number" >> $GITHUB_ENV
+          elif [[ "$pr_number" != "" ]]; then
+            echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
+          elif [[ "$is_pull_request" == "true" ]]; then
+            echo "PR_NUMBER=$event_number" >> $GITHUB_ENV
          else
            echo "PR_NUMBER=" >> $GITHUB_ENV
          fi
@ -29,8 +35,8 @@ jobs:
      - name: Check PR number
        shell: bash
        run: |
-          echo "${{ env.PR_NUMBER }}"
+          echo "$PR_NUMBER"

      - name: Set PR number
        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
+        run: echo "PR_NUMBER=$PR_NUMBER" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -62,25 +62,33 @@ jobs:
    steps:
      - name: Echo input and matrix info
        shell: bash
+        env:
+          folder_slices: ${{ inputs.folder_slices }}
+          matrix_folders: ${{ matrix.folders }}
+          slice_data: ${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}
        run: |
-          echo "${{ inputs.folder_slices }}"
-          echo "${{ matrix.folders }}"
-          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+          echo "$folder_slices"
+          echo "$matrix_folders"
+          echo "$slice_data"

      - name: Echo folder ${{ matrix.folders }}
        shell: bash
        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
        # set the artifact folder names (because the character `/` is not allowed).
+        env:
+          matrix_folders_raw: ${{ matrix.folders }}
        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders_raw"
+          matrix_folders="${matrix_folders_raw/'models/'/'models_'}"
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: |
+          git fetch origin "$commit_sha" && git checkout "$commit_sha"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -115,15 +123,17 @@ jobs:
        id: set_machine_type
        working-directory: /transformers
        shell: bash
+        env:
+          input_machine_type: ${{ inputs.machine_type }}
        run: |
-          echo "${{ inputs.machine_type }}"
+          echo "$input_machine_type"

-          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "$input_machine_type" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "$input_machine_type" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type=${{ inputs.machine_type }}
+            machine_type="$input_machine_type"
          fi

          echo "$machine_type"
@ -132,15 +142,21 @@ jobs:

      - name: Create report directory if it doesn't exist
        shell: bash
+        env:
+          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
-          echo "dummy" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/dummy.txt
-          ls -la /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          mkdir -p "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"
+          echo "dummy" > "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/dummy.txt"
+          ls -la "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"

      - name: Run all tests on GPU
        working-directory: /transformers
+        env:
+          report_name_prefix: ${{ inputs.report_name_prefix }}
+          pytest_marker: ${{ inputs.pytest_marker }}
+          model: ${{ matrix.folders }}
        run: |
-          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
+          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports python3 -m pytest -rsfE -v -m '${pytest_marker}' --make-reports=${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports tests/${model}" test_outputs.txt
          ls -la
          # Extract the exit code from the output file
          EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
@ -151,19 +167,25 @@ jobs:
        # This step is only to show information on Github Actions log.
        # Always mark this step as successful, even if the report directory or the file `failures_short.txt` in it doesn't exist
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/failures_short.txt
+        env:
+          report_name_prefix: ${{ inputs.report_name_prefix }}
+        run: cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/failures_short.txt"

      - name: Captured information
        if: ${{ failure() }}
        continue-on-error: true
+        env:
+          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports/captured_info.txt
+          cat "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports/captured_info.txt"

      - name: Copy test_outputs.txt
        if: ${{ always() }}
        continue-on-error: true
+        env:
+          report_name_prefix: ${{ inputs.report_name_prefix }}
        run: |
-          cp /transformers/test_outputs.txt /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
+          cp /transformers/test_outputs.txt "/transformers/reports/${machine_type}_${report_name_prefix}_${matrix_folders}_test_reports"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
@ -174,7 +196,7 @@ jobs:

  collated_reports:
    name: Collated Reports
-    if: ${{ always() }}
+    if: ${{ always() && inputs.runner_type != '' }}
    needs: run_models_gpu
    uses: huggingface/transformers/.github/workflows/collated-reports.yml@main
    with:
--- a/.github/workflows/pr_slow_ci_suggestion.yml
+++ b/.github/workflows/pr_slow_ci_suggestion.yml
@ -1,4 +1,4 @@
-name: PR slow CI
+name: PR slow CI - Suggestion
 on:
  pull_request_target:
    types: [opened, synchronize, reopened]
@ -23,11 +23,28 @@ jobs:
    outputs:
      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
    steps:
+      # This checkout to the main branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      # We need to use `${{ ... }}` here to avoid `Argument list too long` error when a PR changes a lot of files.
+      # (We could also try to use artifact approach, but it's more involved).
+      # `CodeQL` doesn't identify any security issue here. Also `PR_FILES` is from `get-pr-info.yml` by using an api
+      # `github.rest.pulls.listFiles`, which is fine.
+      - name: Write pr_files file
+        run: |
+          cat > pr_files.txt << 'EOF'
+          ${{ needs.get-pr-info.outputs.PR_FILES }}
+          EOF
+
      - name: Get repository content
        id: repo_content
        uses: actions/github-script@v6
        with:
          script: |
+            const fs = require('node:fs');
+
            const { data: tests_dir } = await github.rest.repos.getContent({
              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
@ -49,38 +66,10 @@ jobs:
              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
            });

-            core.setOutput('tests_dir', tests_dir);
-            core.setOutput('tests_models_dir', tests_models_dir);
-            core.setOutput('tests_quantization_dir', tests_quantization_dir);
-
-      # This checkout to the main branch
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-
-      - name: Write pr_files file
-        run: |
-          cat > pr_files.txt << 'EOF'
-          ${{ needs.get-pr-info.outputs.PR_FILES }}
-          EOF
-
-      - name: Write tests_dir file
-        run: |
-          cat > tests_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_dir }}
-          EOF
-
-      - name: Write tests_models_dir file
-        run: |
-          cat > tests_models_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_models_dir }}
-          EOF
-
-      - name: Write tests_quantization_dir file
-        run: |
-          cat > tests_quantization_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_quantization_dir }}
-          EOF
+            // Write to files instead of outputs
+            fs.writeFileSync('tests_dir.txt', JSON.stringify(tests_dir, null, 2));
+            fs.writeFileSync('tests_models_dir.txt', JSON.stringify(tests_models_dir, null, 2));
+            fs.writeFileSync('tests_quantization_dir.txt', JSON.stringify(tests_quantization_dir, null, 2));

      - name: Run script to get jobs to run
        id: get_jobs
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -153,5 +153,5 @@ jobs:
      ci_event: push
      report_repo_id: hf-internal-testing/transformers_ci_push
      commit_sha: ${{ github.sha }}
-      models: ${{ needs.get_modified_models.outputs.matrix }}
+      subdirs: ${{ needs.get_modified_models.outputs.matrix }}
    secrets: inherit
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -23,62 +23,34 @@ env:
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1

+
 jobs:
  get-pr-number:
-    runs-on: ubuntu-22.04
    name: Get PR number
-    # For security: only allow team members to run
    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "eustlb", "MekkCyber", "vasqu", "ivarflakstad", "stevhliu", "ebezzam", "remi-or", "itazap"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
-    outputs:
-      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
-    steps:
-      - name: Get PR number
-        shell: bash
-        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          else
-            echo "PR_NUMBER=" >> $GITHUB_ENV
-          fi
+    uses: ./.github/workflows/get-pr-number.yml

-      - name: Check PR number
-        shell: bash
-        run: |
-          echo "${{ env.PR_NUMBER }}"
-
-      - name: Set PR number
-        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
-
-  get-sha:
-    runs-on: ubuntu-22.04
+  get-pr-info:
+    name: Get PR commit SHA
    needs: get-pr-number
    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    outputs:
-      PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }}
-      PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }}
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
+    uses: ./.github/workflows/get-pr-info.yml
+    with:
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}

-      - name: Get SHA (and verify timestamps against the issue comment date)
-        id: get_sha
+  check-timestamps:
+    name: Check timestamps (security check)
+    runs-on: ubuntu-22.04
+    needs: get-pr-info
+    outputs:
+      PR_HEAD_SHA: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
+      PR_MERGE_SHA: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
+    steps:
+      - name: Verify `merge_commit` timestamp is older than the issue comment timestamp
        env:
-          PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
          COMMENT_DATE: ${{ github.event.comment.created_at }}
+          PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
        run: |
-            git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head
-            git checkout refs/remotes/pull/$PR_NUMBER/head
-            echo "PR_HEAD_SHA: $(git log -1 --format=%H)"
-            echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
-            git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge
-            git checkout refs/remotes/pull/$PR_NUMBER/merge
-            echo "PR_MERGE_SHA: $(git log -1 --format=%H)"
-            echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT"
-            PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd)
-            echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
            COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
            echo "COMMENT_DATE: $COMMENT_DATE"
            echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
@ -87,13 +59,10 @@ jobs:
              exit -1;
            fi

-  # use a python script to handle this complex logic
-  # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model)
-  # case 2: `run-slow model_1, model_2`
+  # use a python script to handle this complex logic.
  get-tests:
    runs-on: ubuntu-22.04
-    needs: [get-pr-number, get-sha]
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
+    needs: [get-pr-number, check-timestamps]
    outputs:
      models: ${{ steps.models_to_run.outputs.models }}
      quantizations: ${{ steps.models_to_run.outputs.quantizations }}
@ -101,11 +70,11 @@ jobs:
      - uses: actions/checkout@v4
        with:
          fetch-depth: "0"
-          ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge"
+          ref: "refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge"

      - name: Verify merge commit SHA
        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
+          VERIFIED_PR_MERGE_SHA: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
        run: |
            PR_MERGE_SHA=$(git log -1 --format=%H)
            if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
@ -126,11 +95,33 @@ jobs:
      - name: Show models to test
        id: models_to_run
        run: |
-          echo "${{ env.models }}"
-          echo "models=${{ env.models }}" >> $GITHUB_ENV
-          echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
-          echo "${{ env.quantizations }}"
-          echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
+          echo "$models"
+          echo "models=$models" >> $GITHUB_OUTPUT
+          echo "$quantizations"
+          echo "quantizations=$quantizations" >> $GITHUB_OUTPUT
+
+  # Report back if we are not able to get the tests (for example, security check is failing)
+  report_error_earlier:
+    name: Report error earlier
+    if: ${{ always() && needs.get-pr-info.result == 'success' && needs.get-tests.result != 'success' }}
+    needs: [get-pr-number, get-pr-info, get-tests]
+    permissions:
+      pull-requests: write
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Reply to the comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          github_repository: ${{ github.repository }}
+          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "repos/${github_repository}/issues/${pr_number}/comments" \
+            -f body="💔 This comment contains \`run-slow\`, but unknown error occurred and [the workflow run]($GITHUB_RUN_URL) aborted!"

  reply_to_comment:
    name: Reply to the comment
@ -143,20 +134,20 @@ jobs:
      - name: Reply to the comment
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: '\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}'
+          github_repository: ${{ github.repository }}
+          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-            -f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
+            "repos/${github_repository}/issues/${pr_number}/comments" \
+            -f body="This comment contains \`run-slow\`, running the specified jobs: $(echo -e "$BODY")"

  create_run:
    name: Create run
-    if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
-    needs: [get-sha, get-tests, reply_to_comment]
+    needs: [check-timestamps, reply_to_comment]
    permissions:
      statuses: write
    runs-on: ubuntu-22.04
@ -168,248 +159,196 @@ jobs:
          # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`.
          # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status
          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          github_repository: ${{ github.repository }}
+          pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
+            "repos/${github_repository}/statuses/${pr_head_sha}" \
            -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"

-  run_models_gpu:
-    name: Run all tests for the model
+  model-ci:
+    name: Model CI
    if: ${{ needs.get-tests.outputs.models != '[]' }}
-    needs: [get-pr-number, get-sha, get-tests, create_run]
-    strategy:
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
-    runs-on:
-       group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo input and matrix info
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
+    uses: ./.github/workflows/self-scheduled.yml
+    needs: [get-pr-number, check-timestamps, get-tests, create_run]
+    with:
+      job: run_models_gpu
+      slack_report_channel: "#transformers-ci-pr"
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: PR Comment CI
+      report_repo_id: hf-internal-testing/transformers_pr_ci
+      commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
+      subdirs: ${{ needs.get-tests.outputs.models }}
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+    secrets: inherit

-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
-      - name: Checkout to PR merge commit
-        working-directory: /transformers
-        run: |
-          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git log -1 --format=%H
-
-      - name: Verify merge commit SHA
-        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
-        working-directory: /transformers
-        run: |
-          PR_MERGE_SHA=$(git log -1 --format=%H)
-          if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
-            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
-            exit -1;
-          fi
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: |
-          export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
-          echo $CUDA_VISIBLE_DEVICES
-          python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Make sure report directory exists
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-
-  run_quantization_torch_gpu:
-    name: Run all tests for a quantization
+  quantization-ci:
+    name: Quantization CI
    if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
-    needs: [get-pr-number, get-sha, get-tests, create_run]
-    strategy:
-      fail-fast: false
-      matrix:
-        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
-    runs-on:
-      group: '${{ matrix.machine_type }}'
-    container:
-      image: huggingface/transformers-quantization-latest-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+    uses: ./.github/workflows/self-scheduled.yml
+    needs: [get-pr-number, check-timestamps, get-tests, create_run]
+    with:
+      job: run_quantization_torch_gpu
+      slack_report_channel: "#transformers-ci-pr"
+      docker: huggingface/transformers-quantization-latest-gpu
+      ci_event: PR Comment CI
+      report_repo_id: hf-internal-testing/transformers_pr_ci
+      commit_sha: ${{ needs.check-timestamps.outputs.PR_MERGE_SHA }}
+      subdirs: ${{ needs.get-tests.outputs.quantizations }}
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+    secrets: inherit

-      - name: Checkout to PR merge commit
-        working-directory: /transformers
-        run: |
-          git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
-          git log -1 --format=%H
-
-      - name: Verify merge commit SHA
-        env:
-          VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
-        working-directory: /transformers
-        run: |
-          PR_MERGE_SHA=$(git log -1 --format=%H)
-          if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
-            echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
-            exit -1;
-          fi
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Set `machine_type` for report and artifact names
-        working-directory: /transformers
-        shell: bash
-        run: |
-          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
-            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
-            machine_type=multi-gpu
-          else
-            machine_type=${{ matrix.machine_type }}
-          fi
-          echo "$machine_type"
-          echo "machine_type=$machine_type" >> $GITHUB_ENV
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run quantization tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: Make sure report directory exists
-        shell: bash
-        run: |
-          mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
-          echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
-          echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"
-
-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
-
-  update_run_status:
-    name: Update Check Run Status
-    needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
+  report:
+    name: Check & Report
+    needs: [get-pr-number, check-timestamps, create_run, model-ci, quantization-ci]
    permissions:
+      pull-requests: write
      statuses: write
    if: ${{ always() && needs.create_run.result == 'success' }}
    runs-on: ubuntu-22.04
-    env:
-      GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-      GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
-      STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
    steps:
-      - name: Get `run_models_gpu` job status
+      - name: Show reports from jobs
+        env:
+          MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
+          QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
        run: |
-          echo "${{ needs.run_models_gpu.result }}"
-          echo "${{ needs.run_quantization_torch_gpu.result }}"
-          echo $STATUS_OK
-          if [ "$STATUS_OK" = "true" ]; then
-            echo "STATUS=success" >> $GITHUB_ENV
-          else
-            echo "STATUS=failure" >> $GITHUB_ENV
-          fi
+          echo "$MODEL_REPORT"
+          echo "$QUANT_REPORT"

-      - name: Update PR commit statuses
+      - name: Process and filter reports
+        env:
+          MODEL_REPORT: ${{ needs.model-ci.outputs.report }}
+          QUANT_REPORT: ${{ needs.quantization-ci.outputs.report }}
        run: |
-          echo "${{ needs.run_models_gpu.result }}"
-          echo "${{ env.STATUS }}"
+          # Preprocess with Python
+          python3 << 'PYTHON_SCRIPT'
+          import json
+          import os
+          
+          def filter_and_format_report(data):
+            """
+            Filter out entries where commit is `None` (failing tests who status is not certain) and format as text
+            """
+            lines = []
+            
+            for model, model_result in data.items():
+                model_lines = []
+                for device, failures in model_result.items():
+                    
+                    # Filter out None commits and extract just the test names
+                    test_names = [
+                        failure['test'] 
+                        for failure in failures 
+                        if isinstance(failure, dict) and failure.get('commit') is not None
+                    ]
+
+                    # Add tests to model lines
+                    for idx, test_name in enumerate(test_names):
+                        if idx == 0:
+                            job_link = failures[idx]['job_link']
+                            model_lines.append(f"- [{model}]({job_link}):")
+          
+                        model_lines.append(f"    {test_name}")
+
+                # Only add model section if it has tests
+                if len(model_lines) > 0:
+                    lines.extend(model_lines)
+                    lines.append("")  # Empty line between models
+            
+            return "\n".join(lines).strip()
+          
+          # Load and filter reports
+          model_report_str = os.environ.get('MODEL_REPORT', '{}')
+          quant_report_str = os.environ.get('QUANT_REPORT', '{}')
+          
+          model_report = json.loads(model_report_str) if model_report_str else {}
+          quant_report = json.loads(quant_report_str) if quant_report_str else {}
+          
+          formatted_model = filter_and_format_report(model_report)
+          formatted_quant = filter_and_format_report(quant_report)
+          
+          # Write to files
+          with open('model_ci.txt', 'w') as f:
+              f.write(formatted_model)
+              if formatted_model:
+                  f.write('\n')
+          
+          with open('quantization_ci.txt', 'w') as f:
+              f.write(formatted_quant)
+              if formatted_quant:
+                  f.write('\n')
+          PYTHON_SCRIPT
+
+      - name: Post results as PR comment
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          github_repository: ${{ github.repository }}
+          pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+          model_ci_result: ${{ needs.model-ci.result }}
+          quantization_ci_result: ${{ needs.quantization-ci.result }}
+        run: |
+          {
+            echo '## CI Results'
+            echo "[Workflow Run ⚙️]($GITHUB_RUN_URL)"
+            echo ''
+
+            # Check if both jobs were skipped or cancelled
+            if [[ "$model_ci_result" == "skipped" || "$model_ci_result" == "cancelled" ]] && \
+               [[ "$quantization_ci_result" == "skipped" || "$quantization_ci_result" == "cancelled" ]]; then
+              echo '⚠️ No test being reported (jobs are skipped or cancelled)!'
+              echo "STATUS=error" >> $GITHUB_ENV
+
+            # Check if either file has content
+            elif [ -s model_ci.txt ] || [ -s quantization_ci.txt ]; then
+              echo "STATUS=failure" >> $GITHUB_ENV
+
+              # Check if model_ci.txt has content
+              if [ -s model_ci.txt ]; then
+                echo '### Model CI Report'
+                echo ''
+                echo '#### ❌ Failed tests'
+                echo ''
+                cat model_ci.txt
+                echo ''
+              fi
+              
+              # Check if quantization_ci.txt has content
+              if [ -s quantization_ci.txt ]; then
+                echo '### Quantization CI Report'
+                echo ''
+                echo '#### ❌ Failed tests'
+                echo ''
+                cat quantization_ci.txt
+                echo ''
+              fi
+            else
+              echo "STATUS=success" >> $GITHUB_ENV
+              echo '✅ No failing test specific to this PR 🎉 !'
+            fi
+          } > comment_body.txt
+
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
-            repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \
-            -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests"
+            "repos/${github_repository}/issues/${pr_number}/comments" \
+            -F body=@comment_body.txt
+
+      - name: Update PR commit statuses
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
+          github_repository: ${{ github.repository }}
+          pr_head_sha: ${{ needs.check-timestamps.outputs.PR_HEAD_SHA }}
+        # The env. variable `STATUS` used here is set in the previous step
+        run: |
+          gh api \
+            --method POST \
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "repos/${github_repository}/statuses/${pr_head_sha}" \
+            -f "target_url=$GITHUB_RUN_URL" -f "state=$STATUS" -f "description=Slow CI job" -f "context=pytest/custom-tests"
--- a/.github/workflows/self-nightly-caller.yml
+++ b/.github/workflows/self-nightly-caller.yml
@ -51,6 +51,7 @@ jobs:
      slack_report_channel: "#transformers-ci-past-future"
      docker: huggingface/transformers-all-latest-torch-nightly-gpu
      ci_event: Nightly CI
+      runner_type: "a10"
      report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
      commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-caller.yml
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@ -2,7 +2,7 @@ name: Self-hosted runner (AMD scheduled CI caller)

 on:
  schedule:
-    - cron: "17 2 * * *"
+    - cron: "17 5 * * *"

 jobs:
  run_scheduled_amd_ci:
--- a/.github/workflows/self-scheduled-amd-mi355-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi355-caller.yml
@ -21,7 +21,7 @@ jobs:
      job: run_models_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
+      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
@ -33,7 +33,7 @@ jobs:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
+      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
@ -45,7 +45,7 @@ jobs:
      job: run_examples_gpu
      slack_report_channel: "#amd-hf-ci"
      runner_group: hfc-amd-mi355
-      docker: huggingface/testing-rocm7.0-preview
+      docker: huggingface/transformers-pytorch-amd-gpu
      ci_event: Scheduled CI (AMD) - mi355
      report_repo_id: hf-transformers-bot/transformers-ci-dummy
    secrets: inherit
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -33,10 +33,13 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Setup
+        env:
+          prev_workflow_run_id: ${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}
+          other_workflow_run_id: ${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}
        run: |
          mkdir "setup_values"
-          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
-          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
+          echo "$prev_workflow_run_id" > "setup_values/prev_workflow_run_id.txt"
+          echo "$other_workflow_run_id" > "setup_values/other_workflow_run_id.txt"

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
@ -118,3 +121,15 @@ jobs:
      report_repo_id: hf-internal-testing/transformers_daily_ci
      commit_sha: ${{ github.sha }}
    secrets: inherit
+
+  kernels-ci:
+    name: Kernels CI
+    uses: ./.github/workflows/self-scheduled.yml
+    with:
+      job: run_kernels_gpu
+      slack_report_channel: "#transformers-ci-daily-kernels"
+      docker: huggingface/transformers-all-latest-gpu
+      ci_event: Daily CI
+      report_repo_id: hf-internal-testing/transformers_daily_ci
+      commit_sha: ${{ github.sha }}
+    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -34,14 +34,20 @@ on:
      runner_type:
        required: false
        type: string
-      models:
+      subdirs:
        default: ""
        required: false
        type: string
      pytest_marker:
        required: false
        type: string
-
+      pr_number:
+        required: false
+        type: string
+    outputs:
+      report:
+        description: "Content of the report of new failures"
+        value: ${{ jobs.check_new_failures.outputs.report }}

 env:
  HF_HOME: /mnt/cache
@ -54,7 +60,6 @@ env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  CUDA_VISIBLE_DEVICES: 0,1
-  NUM_SLICES: 2

 jobs:
  setup:
@ -75,8 +80,11 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
        run: |
-          git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+          git fetch origin $commit_sha
+          git fetch && git checkout $commit_sha

      - name: Cleanup
        working-directory: /transformers
@ -93,11 +101,17 @@ jobs:
        if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job)
        name: Identify models to test
        working-directory: /transformers/tests
+        env:
+          job: ${{ inputs.job }}
+          subdirs: ${{ inputs.subdirs }}
+          NUM_SLICES: 2
        run: |
-          if [ "${{ inputs.job }}" = "run_models_gpu" ]; then
-            echo "folder_slices=$(python3 ../utils/split_model_tests.py --models '${{ inputs.models }}' --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
-            echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
-          elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then
+          if [ "$job" = "run_models_gpu" ]; then
+            python3 ../utils/split_model_tests.py --subdirs "$subdirs" --num_splits "$NUM_SLICES" > folder_slices.txt
+            echo "folder_slices=$(cat folder_slices.txt)" >> $GITHUB_OUTPUT
+            python3 -c "import ast; folder_slices = ast.literal_eval(open('folder_slices.txt').read()); open('slice_ids.txt', 'w').write(str(list(range(len(folder_slices)))))"
+            echo "slice_ids=$(cat slice_ids.txt)" >> $GITHUB_OUTPUT
+          elif [ "$job" = "run_trainer_and_fsdp_gpu" ]; then
            echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT
            echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT
          fi
@ -106,8 +120,10 @@ jobs:
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
        working-directory: /transformers/tests
+        env:
+          subdirs: ${{ inputs.subdirs || 'None' }}
        run: |
-          echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ;  print(d)')" >> $GITHUB_OUTPUT
+          echo "quantization_matrix=$(python3 -c 'import ast; import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); subdirs = ast.literal_eval(os.environ["subdirs"]); quantization_tests = [x.removeprefix("quantization/") for x in subdirs] if subdirs is not None else quantization_tests; d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))); print(d)')" >> $GITHUB_OUTPUT

      - name: NVIDIA-SMI
        run: |
@ -170,7 +186,9 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout "$commit_sha"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -192,15 +210,17 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
+        env:
+          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "${{ matrix.machine_type }}"
+          echo "$matrix_machine_type"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type=${{ matrix.machine_type }}
+            machine_type="$matrix_machine_type"
          fi

          echo "$machine_type"
@ -209,12 +229,12 @@ jobs:
      - name: Run all pipeline tests on GPU
        working-directory: /transformers
        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports="${machine_type}_run_pipelines_torch_gpu_test_reports" tests/pipelines

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
+        run: cat "/transformers/reports/${machine_type}_run_pipelines_torch_gpu_test_reports/failures_short.txt"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
@ -238,7 +258,9 @@ jobs:
    steps:
      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout "$commit_sha"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -260,15 +282,17 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
+        env:
+          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "${{ matrix.machine_type }}"
+          echo "$matrix_machine_type"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type=${{ matrix.machine_type }}
+            machine_type="$matrix_machine_type"
          fi

          echo "$machine_type"
@ -278,12 +302,12 @@ jobs:
        working-directory: /transformers
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch
+          python3 -m pytest -v --make-reports="${machine_type}_run_examples_gpu_test_reports" examples/pytorch

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt
+        run: cat "/transformers/reports/${machine_type}_run_examples_gpu_test_reports/failures_short.txt"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports"
        if: ${{ always() }}
@ -307,7 +331,9 @@ jobs:
    steps:
      - name: Update clone
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout "$commit_sha"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
@ -329,7 +355,7 @@ jobs:
        working-directory: ${{ inputs.working-directory-prefix }}/
        run: |
          python3 -m pip uninstall -y deepspeed
-          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check

      # To avoid unknown test failures
      - name: Pre build DeepSpeed *again* (for nightly & Past CI)
@ -339,7 +365,7 @@ jobs:
          python3 -m pip uninstall -y deepspeed
          rm -rf DeepSpeed
          git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check

      - name: NVIDIA-SMI
        run: |
@ -357,15 +383,17 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        shell: bash
+        env:
+          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "${{ matrix.machine_type }}"
+          echo "$matrix_machine_type"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type=${{ matrix.machine_type }}
+            machine_type="$matrix_machine_type"
          fi

          echo "$machine_type"
@ -374,12 +402,14 @@ jobs:
      - name: Run all tests on GPU
        working-directory: ${{ inputs.working-directory-prefix }}/transformers
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+          python3 -m pytest -v --make-reports="${machine_type}_run_torch_cuda_extensions_gpu_test_reports" tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+        env:
+          working_directory_prefix: ${{ inputs.working-directory-prefix }}
+        run: cat "${working_directory_prefix}/transformers/reports/${machine_type}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
@ -406,16 +436,19 @@ jobs:
    steps:
      - name: Echo folder ${{ matrix.folders }}
        shell: bash
+        env:
+          matrix_folders_raw: ${{ matrix.folders }}
        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
+          echo "$matrix_folders_raw"
+          matrix_folders="${matrix_folders_raw/'quantization/'/'quantization_'}"
          echo "$matrix_folders"
          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

      - name: Update clone
        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout "$commit_sha"

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
@ -437,15 +470,17 @@ jobs:
      - name: Set `machine_type` for report and artifact names
        working-directory: /transformers
        shell: bash
+        env:
+          matrix_machine_type: ${{ matrix.machine_type }}
        run: |
-          echo "${{ matrix.machine_type }}"
+          echo "$matrix_machine_type"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
-            machine_type=${{ matrix.machine_type }}
+            machine_type="$matrix_machine_type"
          fi

          echo "$machine_type"
@ -453,20 +488,96 @@ jobs:

      - name: Run quantization tests on GPU
        working-directory: /transformers
+        env:
+          folders: ${{ matrix.folders }}
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
+          python3 -m pytest -v --make-reports="${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports" tests/${folders}

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+        run: cat "/transformers/reports/${machine_type}_run_quantization_torch_gpu_${matrix_folders}_test_reports/failures_short.txt"

      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
          name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
+
+  run_kernels_gpu:
+    if: ${{ inputs.job == 'run_kernels_gpu' }}
+    name: Kernel tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: ${{ inputs.docker }}
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        env:
+          commit_sha: ${{ inputs.commit_sha || github.sha }}
+        run: git fetch && git checkout "$commit_sha"
+
+      - name: Reinstall transformers in edit mode
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[testing]
+  
+      - name: Install kernels
+        working-directory: /transformers
+        run: python3 -m pip install -U kernels
+  
+      - name: NVIDIA-SMI
+        run: nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        env:
+          matrix_machine_type: ${{ matrix.machine_type }}
+        run: |
+          echo "$matrix_machine_type"
+
+          if [ "$matrix_machine_type" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "$matrix_machine_type" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type="$matrix_machine_type"
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+    
+      - name: Run kernel tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -v --make-reports="${machine_type}_run_kernels_gpu_test_reports" tests/kernels/test_kernels.py
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat "/transformers/reports/${machine_type}_run_kernels_gpu_test_reports/failures_short.txt"
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_kernels_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_kernels_gpu_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_run_kernels_gpu_test_reports

  run_extract_warnings:
    # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
@ -475,11 +586,10 @@ jobs:
    runs-on: ubuntu-22.04
    needs: [setup, run_models_gpu]
    steps:
+      # Checkout in order to run `utils/extract_warnings.py`. Avoid **explicit** checkout (i.e. don't specify `ref`) for
+      # security reason.
      - name: Checkout transformers
        uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
-          ref: ${{ inputs.commit_sha || github.sha }}

      - name: Install transformers
        run: pip install transformers
@ -499,9 +609,12 @@ jobs:
        working-directory: warnings_in_ci

      - name: Extract warnings in CI artifacts
+        env:
+          github_run_id: ${{ github.run_id }}
+          access_token: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
        run: |
-          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
-          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
+          python3 utils/extract_warnings.py --workflow_run_id "$github_run_id" --output_dir warnings_in_ci --token "$access_token" --from_gh
+          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d); print(d)')"

      - name: Upload artifact
        if: ${{ always() }}
@ -520,6 +633,7 @@ jobs:
      run_examples_gpu,
      run_torch_cuda_extensions_gpu,
      run_quantization_torch_gpu,
+      run_kernels_gpu,
      run_extract_warnings
    ]
    if: always() && !cancelled()
@ -539,16 +653,17 @@ jobs:
    secrets: inherit

  check_new_failures:
-    if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
+    if: ${{ always() && needs.send_results.result == 'success' }}
    name: Check new failures
    needs: send_results
    uses: ./.github/workflows/check_failed_tests.yml
    with:
      docker: ${{ inputs.docker }}
-      start_sha: ${{ inputs.commit_sha || github.sha }}
+      commit_sha: ${{ inputs.commit_sha || github.sha }}
      job: ${{ inputs.job }}
      slack_report_channel: ${{ inputs.slack_report_channel }}
      ci_event: ${{ inputs.ci_event }}
      report_repo_id: ${{ inputs.report_repo_id }}
+      pr_number: ${{ inputs.pr_number }}

    secrets: inherit
--- a/.github/workflows/slack-report.yml
+++ b/.github/workflows/slack-report.yml
@ -41,13 +41,16 @@ jobs:
      - name: Preliminary job status
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
+        env:
+          setup_status: ${{ inputs.setup_status }}
        run: |
-          echo "Setup status: ${{ inputs.setup_status }}"
+          echo "Setup status: $setup_status"

      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
-          ref: ${{ inputs.commit_sha || github.sha }}
+          # Security: checkout to the `main` branch for untrusted triggers (issue_comment, pull_request_target), otherwise use the specified ref
+          ref: ${{ (github.event_name == 'issue_comment' || github.event_name == 'pull_request_target') && 'main' || (inputs.commit_sha || github.sha) }}

      - uses: actions/download-artifact@v4

@ -81,6 +84,8 @@ jobs:
          CI_TEST_JOB: ${{ inputs.job }}
          SETUP_STATUS: ${{ inputs.setup_status }}
          REPORT_REPO_ID: ${{ inputs.report_repo_id }}
+          quantization_matrix: ${{ inputs.quantization_matrix }}
+          folder_slices: ${{ inputs.folder_slices }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
        # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
@ -89,10 +94,10 @@ jobs:
          pip install huggingface_hub
          pip install slack_sdk
          pip show slack_sdk
-          if [ "${{ inputs.quantization_matrix }}" != "" ]; then
-            python utils/notification_service.py "${{ inputs.quantization_matrix }}"
+          if [ "$quantization_matrix" != "" ]; then
+            python utils/notification_service.py "$quantization_matrix"
          else
-            python utils/notification_service.py "${{ inputs.folder_slices }}"
+            python utils/notification_service.py "$folder_slices"
          fi

      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -4,7 +4,7 @@ on:
  workflow_dispatch:
    inputs:
      runner_type:
-        description: 'Type of runner to test (a10 or t4)'
+        description: 'Type of runner to test (a10)'
        required: true
      docker_image:
        description: 'Name of the Docker image'
@ -36,14 +36,10 @@ jobs:
          NUM_GPUS: ${{ github.event.inputs.num_gpus }}
          RUNNER_TYPE: ${{ github.event.inputs.runner_type }}
        run: |
-          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
-          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
-          elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
-            echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV
+          if [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then
+            echo "RUNNER=aws-g5-4xlarge-cache-ssh" >> $GITHUB_ENV
          elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then
-            echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV
          else
            echo "RUNNER=" >> $GITHUB_ENV
          fi
@ -51,8 +47,8 @@ jobs:
      - name: Set runner to use
        id: set_runner
        run: |
-          echo ${{ env.RUNNER }}
-          echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT
+          echo "$RUNNER"
+          echo "RUNNER=$RUNNER" >> $GITHUB_OUTPUT

  ssh_runner:
    name: "SSH"
@ -61,13 +57,13 @@ jobs:
      group: ${{ needs.get_runner.outputs.RUNNER }}
    container:
      image: ${{ github.event.inputs.docker_image }}
-      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-
    steps:
      - name: Update clone
        working-directory: /transformers
+        env:
+          commit_sha: ${{ github.sha }}
        run: |
-          git fetch && git checkout ${{ github.sha }}
+          git fetch && git checkout "$commit_sha"

      - name: Cleanup
        working-directory: /transformers
@ -99,14 +95,17 @@ jobs:
      - name: Store Slack infos
        #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
        shell: bash
+        env:
+          user_slack_id: ${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}
+          default_slack_channel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
        run: |
-          echo "${{ env.github_actor }}"
-          if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then
-            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
+          echo "$github_actor"
+          if [ "$user_slack_id" != "" ]; then
+            echo "SLACKCHANNEL=$user_slack_id" >> $GITHUB_ENV
          else
-            echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
+            echo "SLACKCHANNEL=$default_slack_channel" >> $GITHUB_ENV
          fi
-
+        
      - name: Tailscale # In order to be able to SSH when a test fails
        uses: huggingface/tailscale-action@main
        with:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -125,8 +125,9 @@ If you're contributing a **vision-language model** (or any multimodal model that
 All new models should use the modular architecture pattern. Create a `modular_<model_name>.py` file using the modular model converter:

 - Use the CLI, [`transformers add-new-model-like`](https://github.com/huggingface/transformers/blob/main/src/transformers/cli/add_new_model_like.py) to generate a modular skeleton and get started
- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. 
+- All code should be in the modular file if possible. Modeling must be in it, it's better if configuration is in it as well. [Modular guide](./modular_transformers#implementing-a-modular-file) shows a quick way to set up a modular file.
 - Reuse existing patterns from similar models as much as possible
+- You can make the model compatible with inference engines such as vLLM or SGLang, and enable zero-effort integration. See specific requirements for model implementation in ["Transformers modeling backend"](./transformers_as_backend#multimodal-models)

 To verify your modular file is correct, run:

--- a/1
+++ b/1
@ -45,6 +45,7 @@ repo-consistency:
 	python utils/check_modular_conversion.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
+	python utils/check_init_weights_data.py
 	python utils/check_inits.py
 	python utils/check_pipeline_typing.py
 	python utils/check_config_docstrings.py
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@ -1,6 +1,5 @@
 gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
-torch>=2.4.0
 hf_xet
-pandas>=1.5.0
+pandas>=1.5.0
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@ -1,8 +1,11 @@
 import hashlib
+import itertools
 import json
 import logging
 from typing import Any

+from transformers.utils.import_utils import is_flash_attn_2_available
+

 KERNELIZATION_AVAILABLE = False
 try:
@ -18,11 +21,22 @@ logger = logging.getLogger(__name__)
 class BenchmarkConfig:
    """Configuration for a single benchmark scenario."""

+    all_attn_implementations = [
+        ("flash_attention_2", None),
+        ("eager", None),
+        ("sdpa", "math"),
+        ("sdpa", "flash_attention"),
+        ("flex_attention", None),
+    ]
+
+    all_compiled_modes = [None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"]
+
    def __init__(
        self,
        warmup_iterations: int = 5,
        measurement_iterations: int = 20,
        gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
+        continuous_batching: bool = False,
        batch_size: int = 1,
        sequence_length: int = 128,
        num_tokens_to_generate: int = 128,
@ -38,6 +52,7 @@ class BenchmarkConfig:
        self.warmup_iterations = warmup_iterations
        self.measurement_iterations = measurement_iterations
        self.gpu_monitoring = gpu_monitoring
+        self.continuous_batching = continuous_batching
        # Input parameters
        self.batch_size = batch_size
        self.sequence_length = sequence_length
@ -59,12 +74,35 @@ class BenchmarkConfig:
    def check_validity(self, skip_validity_check: bool = False) -> None:
        if skip_validity_check:
            return
+        # Check FA is installed
+        if self.attn_implementation == "flash_attention_2" and not is_flash_attn_2_available():
+            logger.warning(
+                "Flash attention does not support compile mode. Defaulting to SDPA w/ flash attention backend."
+            )
+            self.attn_implementation = "sdpa"
+            self.sdpa_backend = "flash_attention"
        # Flash attention does not support compile mode, so we turn it off # FIXME: it would be better to support it
        is_fa = self.attn_implementation == "flash_attention_2"
        is_fa |= self.attn_implementation == "sdpa" and self.sdpa_backend == "flash_attention"
        if is_fa:
            logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
            self.compile_mode = None
+        # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
+        if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
+            default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
+            logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
+            self.sdpa_backend = default_backend
+        if self.continuous_batching:
+            if self.attn_implementation == "flex_attention":
+                logger.error(
+                    "disabling continuous batching because of invalid configuration: flex attention is not supported"
+                )
+                self.continuous_batching = False
+            elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
+                logger.warning(
+                    "when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
+                )
+                self.sdpa_backend = "math"

    @property
    def hash(self) -> str:
@ -80,6 +118,7 @@ class BenchmarkConfig:
            attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
            compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
            kernelize_str = "kernelized" if self.kernelize else "unkernelized"
+            continuous_batching_str = "cb" if self.continuous_batching else "generate"
            sep = "-"
        else:
            iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
@ -89,8 +128,11 @@ class BenchmarkConfig:
            attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
            compile_str = "compiled" if self.compile_mode is not None else "not compiled"
            kernelize_str = "kernelized" if self.kernelize else "not kernelized"
+            continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
            sep = ", "
-        return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])
+        return sep.join(
+            [iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
+        )

    def to_dict(self) -> dict[str, Any]:
        return {
@ -98,6 +140,7 @@ class BenchmarkConfig:
            "warmup_iterations": self.warmup_iterations,
            "measurement_iterations": self.measurement_iterations,
            "gpu_monitoring": self.gpu_monitoring,
+            "continuous_batching": self.continuous_batching,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "num_tokens_to_generate": self.num_tokens_to_generate,
@ -114,6 +157,7 @@ class BenchmarkConfig:
            warmup_iterations=data.get("warmup_iterations", 5),
            measurement_iterations=data.get("measurement_iterations", 20),
            gpu_monitoring=data.get("gpu_monitoring", False),
+            continuous_batching=data.get("continuous_batching", False),
            batch_size=data.get("batch_size", 1),
            sequence_length=data.get("sequence_length", 128),
            num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
@ -127,88 +171,72 @@ class BenchmarkConfig:
        )


-def cross_generate_configs(
-    attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
-    compiled_mode: list[str | None],
-    kernelized: list[bool],
-    warmup_iterations: int = 5,
-    measurement_iterations: int = 20,
-    batch_size: int = 1,
-    sequence_length: int = 128,
-    num_tokens_to_generate: int = 128,
-    gpu_monitoring: bool = True,
+def adapt_configs(
+    configs: list[BenchmarkConfig],
+    warmup_iterations: int | list[int] = 5,
+    measurement_iterations: int | list[int] = 20,
+    batch_size: int | list[int] = 1,
+    sequence_length: int | list[int] = 128,
+    num_tokens_to_generate: int | list[int] = 128,
+    gpu_monitoring: bool | list[bool] = True,
 ) -> list[BenchmarkConfig]:
-    # Create kwargs common to all configs
-    kwargs = {
-        "warmup_iterations": warmup_iterations,
-        "measurement_iterations": measurement_iterations,
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_tokens_to_generate": num_tokens_to_generate,
-        "gpu_monitoring": gpu_monitoring,
-    }
-    # Cross-generate all combinations of attn_implementation, compiled_mode, and kernelized
-    configs = []
-    for attn_implementation, sdpa_backend in list(dict.fromkeys(attn_impl_and_sdpa_backend)):
-        for cm in list(dict.fromkeys(compiled_mode)):
-            for kernelize_on in list(dict.fromkeys(kernelized)):
-                config = BenchmarkConfig(
-                    attn_implementation=attn_implementation,
-                    sdpa_backend=sdpa_backend,
-                    compile_mode=cm,
-                    kernelize=kernelize_on,
-                    **kwargs,
-                )
-                configs.append(config)
-    return configs
-
-
-def generate_all_configs(
-    warmup_iterations: int = 5,
-    measurement_iterations: int = 20,
-    batch_size: int = 1,
-    sequence_length: int = 128,
-    num_tokens_to_generate: int = 128,
-    gpu_monitoring: bool = True,
-) -> list[BenchmarkConfig]:
-    all_attn_implementations = [
-        ("flash_attention_2", None),
-        ("eager", None),
-        ("sdpa", "math"),
-        ("sdpa", "flash_attention"),
-        ("flex_attention", None),
-    ]
-    return cross_generate_configs(
-        attn_impl_and_sdpa_backend=all_attn_implementations,
-        compiled_mode=[None, "default", "reduce-overhead", "max-autotune", "max-autotune-no-cudagraphs"],
-        kernelized=[False, KERNELIZATION_AVAILABLE],
-        warmup_iterations=warmup_iterations,
-        measurement_iterations=measurement_iterations,
-        batch_size=batch_size,
-        sequence_length=sequence_length,
-        num_tokens_to_generate=num_tokens_to_generate,
-        gpu_monitoring=gpu_monitoring,
+    parameters = (
+        x if isinstance(x, list) else [x]
+        for x in [
+            warmup_iterations,
+            measurement_iterations,
+            batch_size,
+            sequence_length,
+            num_tokens_to_generate,
+            gpu_monitoring,
+        ]
    )
+    iterator = itertools.product(*parameters)
+
+    adapted_configs = []
+    for warmup_iters, measurement_iters, bs, seqlen, ntok, monitor in iterator:
+        for config in configs:
+            config = config.to_dict()
+            config["warmup_iterations"] = warmup_iters
+            config["measurement_iterations"] = measurement_iters
+            config["batch_size"] = bs
+            config["sequence_length"] = seqlen
+            config["num_tokens_to_generate"] = ntok
+            config["gpu_monitoring"] = monitor
+            adapted_configs.append(BenchmarkConfig.from_dict(config))
+    return adapted_configs


-def generate_main_configs(
-    warmup_iterations: int = 5,
-    measurement_iterations: int = 20,
-    batch_size: int = 1,
-    sequence_length: int = 128,
-    num_tokens_to_generate: int = 128,
-) -> list[BenchmarkConfig]:
-    # Create kwargs common to all configs
-    kwargs = {
-        "warmup_iterations": warmup_iterations,
-        "measurement_iterations": measurement_iterations,
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_tokens_to_generate": num_tokens_to_generate,
-    }
-    return [  # TODO: test max-autotune instead of default
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
-    ]
+def get_config_by_level(level: int) -> list[BenchmarkConfig]:
+    configs = []
+    # Early return if level is greater than 3: we generate all combinations of configs, maybe even w/ all compile modes
+    if level >= 3:
+        for attn_implementation, sdpa_backend in BenchmarkConfig.all_attn_implementations:
+            # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
+            compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
+            for cm in compile_modes:
+                for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
+                    for cb_on in [False, True]:
+                        configs.append(
+                            BenchmarkConfig(
+                                attn_implementation=attn_implementation,
+                                sdpa_backend=sdpa_backend,
+                                compile_mode=cm,
+                                kernelize=kernelize_on,
+                                continuous_batching=cb_on,
+                            )
+                        )
+        return configs
+    # Otherwise, we add the configs for the given level
+    if level >= 0:
+        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
+    if level >= 1:
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
+        configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", continuous_batching=True))
+    if level >= 2:
+        configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
+        configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
+        configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
+    return configs
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@ -117,8 +117,6 @@ def flush_memory():
    # Clear CUDA cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
-        torch.cuda.reset_max_memory_allocated()
-        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
    gc.collect()

@ -234,8 +232,9 @@ class BenchmarkRunner:
            self.logger.info(f"Running benchmark scenario: {config.name}")

            # Quick validation: try one measurement first to see if this scenario works
+            generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
            flush_memory()
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
                max_new_tokens=1, gpu_monitor=None
            )
            if e2e_latency < 0:
@ -245,14 +244,14 @@ class BenchmarkRunner:
            # Warmup runs
            self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
            for _ in trange(config.warmup_iterations):
-                _ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
+                _ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
            self.logger.info("Warmup over.")

            # Measurement runs
            result = BenchmarkResult()
            self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
            for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
                    max_new_tokens=config.num_tokens_to_generate,
                    gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                )
@ -274,6 +273,58 @@ class BenchmarkRunner:
                "config": config,
            }

+    # TODO: refactor `generate_batch` to handle streaming so we can use it here
+    def time_generate_batch(
+        self,
+        max_new_tokens: int,
+        gpu_monitor: GPUMonitor | None = None,
+    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
+        if gpu_monitor is not None:
+            gpu_monitor.start()
+        config = GenerationConfig(
+            max_new_tokens=max_new_tokens,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            do_sample=True,
+        )
+        manager = self.model.init_continuous_batching(config)
+        manager.start()
+        try:
+            first_req_results = []
+            timestamps = []
+            wall_time_0 = time.perf_counter()
+            inputs = self.inputs["input_ids"].tolist()
+            manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
+            first_req_id = None
+            num_requests = len(inputs)
+            finished_requests = 0
+            while finished_requests < num_requests:
+                # NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
+                result = manager.get_result()
+                if result:
+                    timestamps.append(time.perf_counter() - wall_time_0)
+                    if result.is_finished():
+                        finished_requests += 1
+                    if first_req_id is None:
+                        first_req_id = result.request_id
+                    if result.request_id == first_req_id:
+                        first_req_results.append(result)
+                else:
+                    if not manager.is_running():
+                        raise RuntimeError("Generation thread exited unexpectedly")
+            wall_time_1 = time.perf_counter()
+            gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
+            decoded_output = self.tokenizer.decode(
+                [res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
+            )
+            shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
+            e2e_latency = wall_time_1 - wall_time_0
+            return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
+        except Exception as e:
+            raise e
+        finally:
+            manager.stop()
+
    def time_generate(
        self,
        max_new_tokens: int,
@ -339,12 +390,6 @@ class BenchmarkRunner:

        n_configs = len(benchmark_configs)
        for i, config in enumerate(benchmark_configs):
-            # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
-            if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
-                default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
-                self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
-                config.sdpa_backend = default_backend
-
            # Skip if already run
            if config.hash in all_results:
                self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
@ -368,21 +413,27 @@ class BenchmarkRunner:
            self.cleanup()
            self.save_results(model_id, all_results, timestamp=timestamp)

+        if len(all_results) < 1:
+            raise RuntimeError("No benchmark was run succesfully")
+
        if pretty_print_summary:
            print()
            print("=" * 100)
            print(f"Finished benchmarks in {time.perf_counter() - start_time:.2f} seconds")
            print(f"Total number of benchmarks: {len(all_results)}")
-            if len(all_results) > 0:
-                print("First run metadata:")
-                first_key = list(all_results.keys())[0]
-                first_metadata = all_results[first_key]["metadata"].to_dict()
-                hardware_info = first_metadata.pop("hardware_info")
-                pretty_print_dict(first_metadata | hardware_info, tabs=1)
+            print("First run metadata:")
+            first_key = list(all_results.keys())[0]
+            first_metadata = all_results[first_key]["metadata"].to_dict()
+            hardware_info = first_metadata.pop("hardware_info")
+            pretty_print_dict(first_metadata | hardware_info, tabs=1)
            for result in all_results.values():
                print("=" * 100)
                print(f"Config: {result['config'].infer_name(compact=False)}\n")
-                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
+                result["measurements"].pprint(
+                    batch_size=result["config"].batch_size,
+                    num_generated_tokens=result["config"].num_tokens_to_generate,
+                    tabs=1,
+                )
            print("=" * 100)

        return (timestamp, all_results)
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@ -36,16 +36,17 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
    return stats


-def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
+def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
+    """Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
    keys = ["avg", "std", "min", "med", "max", "p95"]
    for key in keys:
-        max_length = max(len(stat[key]) for stat in stats)
-        for stat in stats:
+        max_length = max(len(stat[key]) for stat in stats.values())
+        for stat in stats.values():
            stat[key] = stat[key].ljust(max_length, " ")
-    return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]
+    return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}


-def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
+def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
    max_key_length = max([len(key) for key in data.keys()])
    for key, value in data.items():
        tabs_str = "  " * tabs
@ -141,27 +142,19 @@ class BenchmarkResult:
    def get_measured_itl(self) -> list[float]:
        return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def get_throughput(self, batch_size: int) -> float:
-        return [
-            batch_size * len(dt) / e2e_latency
-            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
-        ]
+    def get_throughput(self, total_generated_tokens: int) -> list[float]:
+        return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]

-    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
-        stats_to_collate = [
-            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-        ]
-        if batch_size > 0:
-            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
-            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
-        collated_stats = equalize_lengths_and_collate(stats_to_collate)
-        dict_to_pprint = {
-            "E2E Latency": collated_stats[0],
-            "Time to First Token": collated_stats[1],
-            "Inter-Token Latency": collated_stats[2],
+    def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
+        measurements = {
+            "E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            "Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
        }
+        itl_values = self.get_measured_itl()
+        if len(itl_values) > 0:
+            measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
        if batch_size > 0:
-            dict_to_pprint["Throughput"] = collated_stats[3]
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
+            measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
+        dict_to_pprint = equalize_lengths_and_collate(measurements)
        pretty_print_dict(dict_to_pprint, tabs=tabs)
--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@ -2,6 +2,5 @@ numpy>=1.21.0
 psutil>=5.8.0
 gpustat>=1.0.0
 torch>=2.0.0
-transformers>=4.30.0
 datasets>=2.10.0
 huggingface_hub>=0.16.0
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@ -23,7 +23,7 @@ import logging
 import sys
 import uuid

-from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
+from framework.benchmark_config import adapt_configs, get_config_by_level
 from framework.benchmark_runner import BenchmarkRunner


@ -40,7 +40,14 @@ if __name__ == "__main__":
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

-    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
+    parser.add_argument(
+        "--level",
+        type=int,
+        default=1,
+        help="Level of coverage for the benchmark. 0: only the main config, 1: a few important configs, 2: a config for"
+        " each attn implementation an option, 3: cross-generate all combinations of configs, 4: cross-generate all"
+        " combinations of configs w/ all compile modes",
+    )
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

    parser.add_argument("--branch-name", type=str, help="Git branch name")
@ -73,70 +80,34 @@ if __name__ == "__main__":
    logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
    logger.info(f"Output directory: {args.output_dir}")

+    # We cannot compute ITL if we don't have at least two measurements
+    if any(n <= 1 for n in args.num_tokens_to_generate):
+        raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
+
    # Error out if one of the arguments is not provided
    if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
        raise ValueError(
            "At least one of the arguments --batch-size, --sequence-length, or --num-tokens-to-generate is required"
        )

-    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
-    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        if args.cross_generate:
-            benchmark_configs = generate_all_configs(
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-                gpu_monitoring=not args.no_gpu_monitoring,
-            )
-        else:
-            benchmark_configs = generate_main_configs(
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-            )
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        main_config = generate_main_configs(
-            warmup_iterations=args.warmup,
-            measurement_iterations=args.iterations,
-            batch_size=args.batch_size[0],
-            sequence_length=args.sequence_length[0],
-            num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )[0]
-        benchmark_configs = []
-        for num_tokens_to_generate in args.num_tokens_to_generate:
-            for sequence_length in args.sequence_length:
-                for batch_size in args.batch_size:
-                    cfg_dict = main_config.to_dict()
-                    cfg_dict["batch_size"] = batch_size
-                    cfg_dict["sequence_length"] = sequence_length
-                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
-                    cfg_dict.pop("name")
-                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
-
-    runner = BenchmarkRunner(
-        logger,
-        args.output_dir,
-        args.branch_name,
-        args.commit_id,
-        args.commit_message,
+    # Get the configs for the given coverage level
+    configs = get_config_by_level(args.level)
+    # Adapt the configs to the given arguments
+    configs = adapt_configs(
+        configs,
+        args.warmup,
+        args.iterations,
+        args.batch_size,
+        args.sequence_length,
+        args.num_tokens_to_generate,
+        not args.no_gpu_monitoring,
    )
+
+    runner = BenchmarkRunner(logger, args.output_dir, args.branch_name, args.commit_id, args.commit_message)
    timestamp, results = runner.run_benchmarks(
-        args.model_id,
-        benchmark_configs,
-        args.num_tokens_to_profile,
-        pretty_print_summary=True,
+        args.model_id, configs, args.num_tokens_to_profile, pretty_print_summary=True
    )

    dataset_id = args.push_result_to_dataset
    if dataset_id is not None and len(results) > 0:
-        runner.push_results_to_hub(
-            dataset_id,
-            results,
-            timestamp,
-        )
+        runner.push_results_to_hub(dataset_id, results, timestamp)
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -67,7 +67,7 @@ RUN set -e; \

 RUN python3 -m pip install --no-cache-dir -U timm

-RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
+RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"

 RUN python3 -m pip install --no-cache-dir pytesseract

--- a/docker/transformers-doc-builder/Dockerfile
+++ b/docker/transformers-doc-builder/Dockerfile
@ -10,7 +10,7 @@ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y te
 # Torch needs to be installed before deepspeed
 RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]

-RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
+RUN python3 -m pip install --no-cache-dir --no-build-isolation torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

 # Test if the image could successfully build the doc. before publishing the image
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm7.1_ubuntu22.04_py3.10_pytorch_release_2.8.0
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -39,7 +39,7 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
 # Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
 RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
    cd flash-attention && \
-    GPU_ARCHS="gfx942;gfx950" python setup.py install  
-# GPU_ARCHS builds for MI300, MI325 and MI355
+    GPU_ARCHS="gfx942" python setup.py install  
+# GPU_ARCHS builds for MI300, MI325 but not MI355: we would need to add `;gfx950` but it takes too long to build.

 RUN python3 -m pip install --no-cache-dir einops
--- a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@ -29,7 +29,7 @@ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
 RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir

 # Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache-dir -v --disable-pip-version-check 2>&1

 ARG REF=main
 WORKDIR /
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip uninstall -y torch torchvision torchaudio torchcodec && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -43,7 +43,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --no-build-isolation --config-settings="--build-option=build_ext" --config-settings="--build-option=-j8" --no-cache -v --disable-pip-version-check 2>&1

 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -24,7 +24,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch';
 RUN echo torch=$VERSION
 # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
-RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -50,7 +50,7 @@ RUN python3 -m pip install --no-cache-dir hqq
 RUN python3 -m pip install --no-cache-dir gguf

 # Add autoawq for quantization testing
-RUN python3 -m pip install --no-cache-dir autoawq[kernels]
+RUN python3 -m pip install --no-cache-dir --no-build-isolation autoawq[kernels]

 # Add quanto for quantization testing
 RUN python3 -m pip install --no-cache-dir optimum-quanto
@ -81,7 +81,7 @@ RUN python3 -m pip uninstall -y flash-attn
 RUN cd transformers && python3 setup.py develop

 # Add fp-quant for quantization testing
-RUN python3 -m pip install --no-cache-dir "fp-quant>=0.2.0"
+RUN python3 -m pip install --no-cache-dir "fp-quant>=0.3.2"

 # Low usage or incompatible lib, will enable later on

--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -508,16 +508,16 @@ BERT `_init_weights` Methode:
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```

 Sie können weitere benutzerdefinierte Schemata verwenden, wenn Sie eine spezielle Initialisierung für einige Module benötigen. Zum Beispiel in
@ -533,9 +533,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```

 Das Flag `_is_hf_initialized` wird intern verwendet, um sicherzustellen, dass wir ein Submodul nur einmal initialisieren. Wenn Sie es auf
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -118,7 +118,9 @@
  - local: tools
    title: Tools
  - local: transformers_as_backend
-    title: Inference server backends
+    title: Transformers as modeling backend
+  - local: continuous_batching
+    title: Continuous Batching
  title: Inference
 - isExpanded: false
  sections:
@ -418,8 +420,6 @@
        title: BLOOM
      - local: model_doc/blt
        title: BLT
-      - local: model_doc/bort
-        title: BORT
      - local: model_doc/byt5
        title: ByT5
      - local: model_doc/camembert
@ -474,8 +474,6 @@
        title: Ernie4_5
      - local: model_doc/ernie4_5_moe
        title: Ernie4_5_MoE
-      - local: model_doc/ernie_m
-        title: ErnieM
      - local: model_doc/esm
        title: ESM
      - local: model_doc/exaone4
@ -530,8 +528,6 @@
        title: GPTBigCode
      - local: model_doc/gpt_oss
        title: GptOss
-      - local: model_doc/gptsan-japanese
-        title: GPTSAN Japanese
      - local: model_doc/gpt-sw3
        title: GPTSw3
      - local: model_doc/granite
@ -556,8 +552,6 @@
        title: Jamba
      - local: model_doc/jetmoe
        title: JetMoe
-      - local: model_doc/jukebox
-        title: Jukebox
      - local: model_doc/led
        title: LED
      - local: model_doc/lfm2
@ -592,8 +586,6 @@
        title: MarkupLM
      - local: model_doc/mbart
        title: MBart and MBart-50
-      - local: model_doc/mega
-        title: MEGA
      - local: model_doc/megatron-bert
        title: MegatronBERT
      - local: model_doc/megatron_gpt2
@ -628,8 +620,6 @@
        title: myt5
      - local: model_doc/nemotron
        title: Nemotron
-      - local: model_doc/nezha
-        title: NEZHA
      - local: model_doc/nllb
        title: NLLB
      - local: model_doc/nllb-moe
@ -644,8 +634,6 @@
        title: Olmo3
      - local: model_doc/olmoe
        title: OLMoE
-      - local: model_doc/open-llama
-        title: Open-Llama
      - local: model_doc/opt
        title: OPT
      - local: model_doc/pegasus
@ -666,8 +654,6 @@
        title: PLBart
      - local: model_doc/prophetnet
        title: ProphetNet
-      - local: model_doc/qdqbert
-        title: QDQBert
      - local: model_doc/qwen2
        title: Qwen2
      - local: model_doc/qwen2_moe
@ -680,16 +666,12 @@
        title: Qwen3Next
      - local: model_doc/rag
        title: RAG
-      - local: model_doc/realm
-        title: REALM
      - local: model_doc/recurrent_gemma
        title: RecurrentGemma
      - local: model_doc/reformer
        title: Reformer
      - local: model_doc/rembert
        title: RemBERT
-      - local: model_doc/retribert
-        title: RetriBERT
      - local: model_doc/roberta
        title: RoBERTa
      - local: model_doc/roberta-prelayernorm
@ -718,10 +700,6 @@
        title: T5Gemma
      - local: model_doc/t5v1.1
        title: T5v1.1
-      - local: model_doc/tapex
-        title: TAPEX
-      - local: model_doc/transfo-xl
-        title: Transformer XL
      - local: model_doc/ul2
        title: UL2
      - local: model_doc/umt5
@ -734,8 +712,6 @@
        title: XGLM
      - local: model_doc/xlm
        title: XLM
-      - local: model_doc/xlm-prophetnet
-        title: XLM-ProphetNet
      - local: model_doc/xlm-roberta
        title: XLM-RoBERTa
      - local: model_doc/xlm-roberta-xl
@ -782,8 +758,6 @@
        title: Depth Anything V2
      - local: model_doc/depth_pro
        title: DepthPro
-      - local: model_doc/deta
-        title: DETA
      - local: model_doc/detr
        title: DETR
      - local: model_doc/dinat
@ -798,8 +772,6 @@
        title: DiT
      - local: model_doc/dpt
        title: DPT
-      - local: model_doc/efficientformer
-        title: EfficientFormer
      - local: model_doc/efficientloftr
        title: EfficientLoFTR
      - local: model_doc/efficientnet
@ -836,8 +808,6 @@
        title: MobileViT
      - local: model_doc/mobilevitv2
        title: MobileViTV2
-      - local: model_doc/nat
-        title: NAT
      - local: model_doc/poolformer
        title: PoolFormer
      - local: model_doc/prompt_depth_anything
@ -884,12 +854,8 @@
        title: Timm Wrapper
      - local: model_doc/upernet
        title: UperNet
-      - local: model_doc/van
-        title: VAN
      - local: model_doc/vit
        title: Vision Transformer (ViT)
-      - local: model_doc/vit_hybrid
-        title: ViT Hybrid
      - local: model_doc/vitdet
        title: ViTDet
      - local: model_doc/vit_mae
@ -928,8 +894,6 @@
        title: Hubert
      - local: model_doc/kyutai_speech_to_text
        title: Kyutai Speech-To-Text
-      - local: model_doc/mctct
-        title: MCTCT
      - local: model_doc/mimi
        title: Mimi
      - local: model_doc/mms
@ -956,8 +920,6 @@
        title: SEW-D
      - local: model_doc/speech_to_text
        title: Speech2Text
-      - local: model_doc/speech_to_text_2
-        title: Speech2Text2
      - local: model_doc/speecht5
        title: SpeechT5
      - local: model_doc/unispeech
@ -1006,6 +968,8 @@
        title: AltCLIP
      - local: model_doc/aria
        title: Aria
+      - local: model_doc/audioflamingo3
+        title: AudioFlamingo3
      - local: model_doc/aya_vision
        title: AyaVision
      - local: model_doc/blip
@ -1062,6 +1026,8 @@
        title: Gemma3n
      - local: model_doc/git
        title: GIT
+      - local: model_doc/glm46v
+        title: Glm46V
      - local: model_doc/glm4v
        title: glm4v
      - local: model_doc/glm4v_moe
@ -1182,8 +1148,6 @@
        title: TAPAS
      - local: model_doc/trocr
        title: TrOCR
-      - local: model_doc/tvlt
-        title: TVLT
      - local: model_doc/tvp
        title: TVP
      - local: model_doc/udop
@ -1210,8 +1174,6 @@
    - sections:
      - local: model_doc/decision_transformer
        title: Decision Transformer
-      - local: model_doc/trajectory_transformer
-        title: Trajectory Transformer
      title: Reinforcement learning models
    - sections:
      - local: model_doc/autoformer
@ -1227,10 +1189,6 @@
      - local: model_doc/timesfm
        title: TimesFM
      title: Time series models
-    - sections:
-      - local: model_doc/graphormer
-        title: Graphormer
-      title: Graph models
    title: Models
  - sections:
    - local: internal/modeling_utils
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -314,16 +314,16 @@ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreT
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```

 The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
@ -339,9 +339,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```

 ### Convert checkpoints to Transformers
--- a/docs/source/en/continuous_batching.md
+++ b/docs/source/en/continuous_batching.md
@ -0,0 +1,194 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Continuous Batching
+
+Continuous Batching (CB) is an advanced technique to optimize the inference of transformer models by dynamically grouping multiple requests into batches. This approach maximizes GPU utilization and throughput, specifically for workloads with many variable-length inputs.
+
+We are particularly interested in having Continuous Batching in transformers for the following use cases:
+- Evaluation of models on large datasets with variable-length inputs
+- Generating outputs for multiple sequences for GRPO policies
+
+CB is what makes inference engines like vLLM or SGLang efficient. That being said, transformers does not aim to be a production-ready inference engine, but a complete framework for model development. For this reason, CB is available in `transformers serve`.
+
+If you are not familiar with some of the core concepts CB is built upon, we invite you to read the associated blog post: [Continuous Batching: Efficient Inference for Large Language Models](https://huggingface.co/blog/continuous-batching). _broken link for now_
+
+## API Reference
+
+## Usage Examples
+
+The main way to use CB in transformers is via the `generate_batch` method.
+
+Unlike `generate`, CB takes already tokenized inputs, known as input IDs. Each sequence of input IDs is represented as a list of integers, in python: `list[int]`. Since 
+
+For a more detailed example, please refer to: [examples/continuous_batching](./path/to/example)
+
+### `generate_batch` example
+
+We have created a `ContinuousMixin` that is inherited by the `GenerationMixin` so that all auto regressive text models support CB.
+
+This adds the `generate_batch` method to all models that inherit from `GenerationMixin`.
+
+You can use it as follows:
+
+```py
+import datasets
+import torch
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen3-4B-Instruct-2507",
+    attn_implementation="spda_paged",
+    device_map="cuda",  # if you need cuda
+    dtype=torch.bfloat16,
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
+
+# prepare a batch of inputs
+dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
+dataset = dataset.select(range(args.samples))
+tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
+simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
+
+generation_config = GenerationConfig(
+    max_new_tokens=32,
+    use_cuda_graph=False,  # Not supported for simple version
+    eos_token_id=tokenizer.eos_token_id,
+    pad_token_id=tokenizer.pad_token_id,
+    do_sample=False,
+    max_batch_tokens=512,  # max number of tokens in a batch, this is just a default value you should tune based on your hardware
+)
+
+batch_outputs = model.generate_batch(
+    inputs=simple_batch_inputs,
+    generation_config=generation_config,
+)
+
+for request_id, output in batch_outputs.items():
+    generated_text = tokenizer.decode(output.generated_tokens, skip_special_tokens=True)
+    print(f"Request {request_id} output: {generated_text}")
+```
+
+### `ContinuousBatchingManager` example
+
+If you want more control w.r.t. how you want to schedule requests using CB, you can use the `ContinuousBatchingManager` class directly.
+
+This is what we use in `transformers serve` because requests arrive asynchronously and we can leverage the asynchronous nature of the CB process to make things more efficient.
+
+Under the hood, the `ContinuousBatchingManager` creates a background thread that receives inputs from a python `queue.Queue` which it uses to get requests to batch in each forward pass.
+
+Note that the manager is thread safe!
+
+```py
+import datasets
+import torch
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers.generation import GenerationConfig
+from transformers.generation.continuous_batching import RequestStatus
+
+model = AutoModelForCausalLM.from_pretrained(
+    "Qwen/Qwen3-4B-Instruct-2507",
+    attn_implementation="spda_paged",
+    device_map="cuda",  # if you need cuda
+    dtype=torch.bfloat16,
+)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, padding_side="left")
+
+# prepare a batch of inputs
+dataset = datasets.load_dataset("openai/gsm8k", "socratic", split="test")
+dataset = dataset.select(range(args.samples))
+tokenized_datasets = dataset.map(lambda x: tokenizer(x["question"]), batched=True)
+simple_batch_inputs = [item["input_ids"] for item in tokenized_datasets]
+
+# initialize the manager, available method thanks to the `ContinuousMixin`
+manager = model.init_continuous_batching(generation_config=generation_config)
+
+# start the background thread
+manager.start()
+
+# this is for demonstration purposes only, in practice this is most useful to do concurrently
+for i, input in enumerate(simple_batch_inputs):
+    request_id = manager.add_request(input_ids=input, request_id=f"request_{i}")  # if you do not specify a request_id, one will be generated for you
+
+# Can be done in an other thread
+for id, request in manager.get_result():
+    generated_text = tokenizer.decode(request.generated_tokens, skip_special_tokens=True)
+    print(f"Request {id} output: {generated_text}")
+
+# you can also get results for a specific request id
+result = manager.get_result(request_id="request_5")  # this is blocking and will wait for the result to be ready
+
+# or get results for a request that is streaming
+manager.add_request(
+    input_ids=input,
+    request_id="streaming_request",
+    stream=True,
+)
+for chunk in manager.request_id_iter(request_id="streaming_request"):
+    generated_text = tokenizer.decode(chunk.generated_tokens, skip_special_tokens=True)
+    print(generated_text)
+    # FIXME: stop iteration in `request_id_iter` when finished instead of doing it externally
+    if chunk.status == RequestStatus.FINISHED:
+        break
+
+# stop the background thread before exiting the process
+manager.stop()
+```
+
+## Supported & Unsupported Features
+
+### Supported Features
+
+- Dynamic scheduling of variable-length requests
+- Chunked prefill
+- Paged Attention Cache
+- Sliding window attention
+- Chat templates
+
+### Unsupported Features
+
+At the moment, the following features are not supported with CB. We plan to add support to the following:
+
+- Prefix caching
+- Beam search
+- tool calling
+
+The others are unplanned, but depending on community requests we might consider adding them:
+
+- MTP (multi token prediction)
+- Medusa
+
+## Performance Considerations
+
+
+## Integration with Serving
+
+You can use CB in `transformers serve` by passing the `--continuous-batching` flag when starting the server.
+
+## Monitoring
+
+We have added `opentelemetry` support to Continuous Batching to help you monitor its performance in production. To enable it, you need to install the `opentelemetry` extra when installing `transformers`:
+
+```sh
+# this installs `opentelemetry-api`, `opentelemetry-sdk` and `opentelemetry-exporter-otlp`
+pip install transformers[open-telemetry]
+```
+
+This will enable traces and metrics collection in CB. You will then have to setup the backend to collect and visualize the traces and metrics.
+
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -393,3 +393,9 @@ model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1", quantization_config=quant_config, device_map="auto"
 )
 ```
+
+## Continuous Batching
+
+When serving LLMs for inference, you may have multiple requests arriving at different times. Continuous Batching (CB) is a technique that groups incoming requests into batches to maximize GPU utilization and throughput.
+
+See the [Continuous Batching](./continuous_batching) guide for more details on how to use CB in transformers.
--- a/docs/source/en/model_doc/audioflamingo3.md
+++ b/docs/source/en/model_doc/audioflamingo3.md
@ -0,0 +1,402 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+*This model was released on 2025-07-10 and added to Hugging Face Transformers on 2025-11-11.*
+
+# Audio Flamingo 3
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+Audio Flamingo 3 (AF3) is a fully open large audio–language model designed for robust understanding and reasoning over speech, environmental sounds, and music. AF3 pairs a Whisper-style audio encoder with a causal language model and performs replace-in-place audio–text fusion: the processor aligns post-pool audio frames to a dedicated placeholder token and the model replaces those token slots with projected audio embeddings during the forward pass.
+
+The model checkpoint is available at: [nvidia/audio-flamingo-3-hf](https://huggingface.co/nvidia/audio-flamingo-3-hf)
+
+Highlights:
+
+- Unified audio encoder across speech, sound, and music.
+- **Long-audio support via windowing and post-pool alignment (up to 10 minutes maximum).** The model processes audio in 30-second windows with a hard limit of 20 windows (10 minutes total). Audio longer than 10 minutes will be truncated.
+- Deterministic fusion that preserves sequence length by replacing audio placeholder tokens with audio embeddings.
+
+This model was contributed by [Lasha Koroshinadze](https://huggingface.co/lashahub) and [Eric Bezzam](https://huggingface.co/bezzam).
+
+### Paper
+
+[Audio Flamingo 3](https://huggingface.co/papers/2507.08128): Advancing Audio Intelligence with Fully Open Large Audio Language Models  
+A. Goel, S. Ghosh, J. Kim, S. Kumar, Z. Kong, S. Lee, C.-H. H. Yang, R. Duraiswami, D. Manocha, R. Valle, B. Catanzaro  
+NVIDIA and University of Maryland  
+Project: https://research.nvidia.com/labs/adlr/AF3/
+
+## Usage
+
+### Audio Instruct Mode
+
+The model supports audio-text instructions, including multi-turn interactions, all processed in batches.
+
+➡️ audio + text instruction
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Transcribe the input speech."},
+            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+
+➡️ multi-turn:
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "Instruction: How does the tone of female speech change throughout the audio? Choose the correct option among the options below: (A) Sad to happy (B) Happy to sad (C) Neutral to happy (D) Happy to neutral.",
+            },
+            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/000000786159.31.wav"},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": [{"type": "text", "text": "(A) Sad to happy"}],
+    },
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Why do you think so?"},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+
+➡️ text only:
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "What is the capital of France?"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+
+➡️ audio only:
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+
+➡️ batched inference!
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+conversations = [
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the input speech."},
+                {
+                    "type": "audio",
+                    "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav",
+                },
+            ],
+        }
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
+                },
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
+            ],
+        }
+    ],
+]
+
+inputs = processor.apply_chat_template(
+    conversations,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+).to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+print(decoded_outputs)
+```
+
+➡️ Training:
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+model.train()
+
+conversation = [
+    [
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Transcribe the input speech."},
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/WhDJDIviAOg_120_10.mp3"},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "The transcription of the audio is 'summer follows spring the days grow longer and the nights are warm'."}],
+        }
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "This track feels really peaceful and introspective. What elements make it feel so calming and meditative?",
+                },
+                {"type": "audio", "path": "https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/FPSbCAANfbJLVSwD.mp3"},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "The transcription of the audio is 'some transcription of the audio'."}],
+        }
+
+    ]
+]
+
+inputs = processor.apply_chat_template(
+    conversation,
+    tokenize=True,
+    add_generation_prompt=True,
+    return_dict=True,
+    output_labels=True,
+).to(model.device)
+
+loss = model(**inputs).loss
+loss.backward()
+```
+
+➡️ transcription shortcut
+
+```python
+from transformers import AudioFlamingo3ForConditionalGeneration, AutoProcessor
+
+model_id = "nvidia/audio-flamingo-3-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = AudioFlamingo3ForConditionalGeneration.from_pretrained(model_id, device_map="auto")
+
+inputs = processor.apply_transcription_request(audio="https://huggingface.co/datasets/nvidia/AudioSkills/resolve/main/assets/t_837b89f2-26aa-4ee2-bdf6-f73f0dd59b26.wav").to(model.device)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True, strip_prefix=True)
+
+print(decoded_outputs)
+```
+
+The model is trained to emit transcriptions prefixed with assistant framing such as `The spoken content of the audio is "<text>".`. Use `strip_prefix=True` (as shown above) to remove the fixed assistant sentence and surrounding quotes so that only the transcription remains.
+
+## How the model works
+
+### Architecture
+
+* **AudioFlamingo3Encoder**
+  Whisper-style feature extractor + encoder → average-pool over time (stride 2) → LayerNorm.
+  Produces per-frame hidden states at the post-pool rate.
+
+* **AudioFlamingo3MultiModalProjector**
+  A small MLP that maps encoder features to the language model’s hidden size.
+
+* **AudioFlamingo3ForConditionalGeneration**
+  A causal language model that accepts text embeddings where each audio placeholder token slot is replaced, in place, by an audio frame embedding. No sequence-length change is introduced by fusion.
+
+### Processor-level alignment
+
+1. Each raw waveform is split into fixed-length windows based on the feature extractor’s `chunk_length` (seconds) and `sampling_rate` (Hz).
+2. For each window, the processor computes the number of post-pool frames `post_pool_len` that the encoder will output (matching the conv/pool schedule).
+3. The processor expands the audio placeholder token by the total number of post-pool frames across all windows.
+4. The model later replaces those token positions with the corresponding projected audio embeddings.
+
+## Usage patterns
+
+### Transcription shortcut
+
+For automatic speech recognition you can skip writing the default instruction each time and call
+[`~transformers.AudioFlamingo3Processor.apply_transcription_request`]:
+
+```python
+inputs = processor.apply_transcription_request(audio=audio_array)
+```
+
+Pass `prompt="Transcribe the input speech."` (or a list of prompts for batch audio) to customize the instruction while
+keeping the audio placeholder handling.
+
+`audio` accepts in-memory arrays, local file paths, or URLs. Any processor kwargs (`text_kwargs`, `audio_kwargs`, etc.)
+are forwarded, so you can tweak padding or tensor formats just like when calling `processor(...)`.
+
+## Long audio and windowing
+
+**Important: Maximum audio length is 10 minutes.** Audio longer than this will be truncated.
+
+* The default setup processes 30-second windows at 16 kHz mono.
+* **The processor enforces a hard limit of 20 windows per sample, resulting in a maximum of 10 minutes of audio (20 windows × 30 seconds).**
+* For each window:
+
+  * `mel_len` is the padded mel length.
+  * A conv stack reduces time as `conv_output_len = (mel_len - 1) // 2 + 1`.
+  * Post-pool frames per window: `post_pool_len = (conv_output_len - 2) // 2 + 1`.
+  * An audio placeholder token is expanded to the sum of `post_pool_len` across all windows.
+
+## Padding, attention, and caching
+
+* **Left padding vs right padding**
+  For generation with mixed prompt lengths in a batch, left padding is usually preferable.
+  For training, right padding is common; AF3’s fusion mechanism itself is padding-agnostic because it replaces in place.
+* **Attention masks**
+  The processor returns `attention_mask` (text) and `input_features_mask` (audio). The model builds an internal 4-D mask on the encoder’s pre-pool axis with negative infinity at pad positions.
+* **Caching**
+  During generation, `input_features` and `input_features_mask` are only passed on the first step. Subsequent steps use cached keys/values from the language model.
+
+## Troubleshooting
+
+* Empty or truncated outputs when batching
+  Use left padding for batched generation and decode only the new tokens after the prompt length, as shown in the quickstart.
+
+## AudioFlamingo3Config
+
+[[autodoc]] AudioFlamingo3Config
+
+## AudioFlamingo3EncoderConfig
+
+[[autodoc]] AudioFlamingo3EncoderConfig
+
+## AudioFlamingo3Processor
+
+[[autodoc]] AudioFlamingo3Processor
+
+## AudioFlamingo3Encoder
+
+[[autodoc]] AudioFlamingo3Encoder
+    - forward
+
+## AudioFlamingo3ForConditionalGeneration
+
+[[autodoc]] AudioFlamingo3ForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/bort.md
+++ b/docs/source/en/model_doc/bort.md
@ -1,60 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-10-20 and added to Hugging Face Transformers on 2023-06-20.*
-
-# BORT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we do not accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The BORT model was proposed in [Optimal Subarchitecture Extraction for BERT](https://huggingface.co/papers/2010.10499) by
-Adrian de Wynter and Daniel J. Perry. It is an optimal subset of architectural parameters for the BERT, which the
-authors refer to as "Bort".
-
-The abstract from the paper is the following:
-
-*We extract an optimal subset of architectural parameters for the BERT architecture from Devlin et al. (2018) by
-applying recent breakthroughs in algorithms for neural architecture search. This optimal subset, which we refer to as
-"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the
-original BERT-large architecture, and 16% of the net size. Bort is also able to be pretrained in 288 GPU hours, which
-is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large
-(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same
-hardware. It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the
-architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%,
-absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks.*
-
-This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/alexa/bort/).
-
-## Usage tips
-
- BORT's model architecture is based on BERT, refer to [BERT's documentation page](bert) for the
-  model's API reference as well as usage examples.
- BORT uses the RoBERTa tokenizer instead of the BERT tokenizer, refer to [RoBERTa's documentation page](roberta) for the tokenizer's API reference as well as usage examples.
- BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology) ,
-  that is sadly not open-sourced yet. It would be very useful for the community, if someone tries to implement the
-  algorithm to make BORT fine-tuning work.
--- a/docs/source/en/model_doc/colqwen2.md
+++ b/docs/source/en/model_doc/colqwen2.md
@ -158,6 +158,24 @@ print("Retrieval scores (query x image):")
 print(scores)
 ```

+You can also use checkpoints for `ColQwen2.5` that are **compatible with the ColQwen2 architecture**. This version of the model uses [Qwen2_5_VL](./qwen2_5_vl) as the backbone.
+
+```python
+import torch
+from transformers import ColQwen2ForRetrieval, ColQwen2Processor
+from transformers.utils.import_utils import is_flash_attn_2_available
+
+model_name = "Sahil-Kabir/colqwen2.5-v0.2-hf" # An existing compatible checkpoint
+
+model = ColQwen2ForRetrieval.from_pretrained(
+    model_name,
+    dtype=torch.bfloat16,
+    device_map="auto",
+    attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa"
+)
+processor = ColQwen2Processor.from_pretrained(model_name)
+```
+
 ## Notes

 - [`~ColQwen2Processor.score_retrieval`] returns a 2D tensor where the first dimension is the number of queries and the second dimension is the number of images. A higher score indicates more similarity between the query and image.
--- a/docs/source/en/model_doc/deta.md
+++ b/docs/source/en/model_doc/deta.md
@ -1,78 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-12-12 and added to Hugging Face Transformers on 2023-06-20.*
-
-# DETA
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The DETA model was proposed in [NMS Strikes Back](https://huggingface.co/papers/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
-DETA (short for Detection Transformers with Assignment) improves [Deformable DETR](deformable_detr) by replacing the one-to-one bipartite Hungarian matching loss
-with one-to-many label assignments used in traditional detectors with non-maximum suppression (NMS). This leads to significant gains of up to 2.5 mAP.
-
-The abstract from the paper is the following:
-
-*Detection Transformer (DETR) directly transforms queries to unique objects by using one-to-one bipartite matching during training and enables end-to-end object detection. Recently, these models have surpassed traditional detectors on COCO with undeniable elegance. However, they differ from traditional detectors in multiple designs, including model architecture and training schedules, and thus the effectiveness of one-to-one matching is not fully understood. In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieved 50.2 COCO mAP within 12 epochs (1x schedule) with ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. On multiple datasets, schedules, and architectures, we consistently show bipartite matching is unnecessary for performant detection transformers. Furthermore, we attribute the success of detection transformers to their expressive transformer architecture.*
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/deta_architecture.jpg"
-alt="drawing" width="600"/>
-
-<small> DETA overview. Taken from the <a href="https://huggingface.co/papers/2212.06137">original paper</a>. </small>
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/jozhang97/DETA).
-
-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA.
-
- Demo notebooks for DETA can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DETA).
- Scripts for finetuning [`DetaForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection).
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-## DetaConfig
-
-[[autodoc]] DetaConfig
-
-## DetaImageProcessor
-
-[[autodoc]] DetaImageProcessor
-    - preprocess
-    - post_process_object_detection
-
-## DetaModel
-
-[[autodoc]] DetaModel
-    - forward
-
-## DetaForObjectDetection
-
-[[autodoc]] DetaForObjectDetection
-    - forward
--- a/docs/source/en/model_doc/dinov3.md
+++ b/docs/source/en/model_doc/dinov3.md
@ -169,6 +169,9 @@ print("Pooled output shape:", pooled_output.shape)
 [[autodoc]] DINOv3ViTModel
    - forward

+## DINOv3ViTBackbone    
+[[autodoc]] DINOv3ViTBackbone
+
 ## DINOv3ConvNextModel

 [[autodoc]] DINOv3ConvNextModel
--- a/docs/source/en/model_doc/efficientformer.md
+++ b/docs/source/en/model_doc/efficientformer.md
@ -1,85 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-06-02 and added to Hugging Face Transformers on 2023-06-20.*
-
-# EfficientFormer
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The EfficientFormer model was proposed in [EfficientFormer: Vision Transformers at MobileNet Speed](https://huggingface.co/papers/2206.01191)
-by Yanyu Li, Geng Yuan, Yang Wen, Eric Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.  EfficientFormer proposes a
-dimension-consistent pure transformer that can be run on mobile devices for dense prediction tasks like image classification, object
-detection and semantic segmentation.
-
-The abstract from the paper is the following:
-
-*Vision Transformers (ViT) have shown rapid progress in computer vision tasks, achieving promising results on various benchmarks.
-However, due to the massive number of parameters and model design, e.g., attention mechanism, ViT-based models are generally
-times slower than lightweight convolutional networks. Therefore, the deployment of ViT for real-time applications is particularly
-challenging, especially on resource-constrained hardware such as mobile devices. Recent efforts try to reduce the computation
-complexity of ViT through network architecture search or hybrid design with MobileNet block, yet the inference speed is still
-unsatisfactory. This leads to an important question: can transformers run as fast as MobileNet while obtaining high performance?
-To answer this, we first revisit the network architecture and operators used in ViT-based models and identify inefficient designs.
-Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm.
-Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer.
-Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices.
-Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on
-iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model,
-EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. Our work proves that properly designed transformers can
-reach extremely low latency on mobile devices while maintaining high performance.*
-
-This model was contributed by [novice03](https://huggingface.co/novice03) and [Bearnardd](https://huggingface.co/Bearnardd).
-The original code can be found [here](https://github.com/snap-research/EfficientFormer).
-
-## Documentation resources
-
- [Image classification task guide](../tasks/image_classification)
-
-## EfficientFormerConfig
-
-[[autodoc]] EfficientFormerConfig
-
-## EfficientFormerImageProcessor
-
-[[autodoc]] EfficientFormerImageProcessor
-    - preprocess
-
-## EfficientFormerModel
-
-[[autodoc]] EfficientFormerModel
-    - forward
-
-## EfficientFormerForImageClassification
-
-[[autodoc]] EfficientFormerForImageClassification
-    - forward
-
-## EfficientFormerForImageClassificationWithTeacher
-
-[[autodoc]] EfficientFormerForImageClassificationWithTeacher
-    - forward
--- a/docs/source/en/model_doc/ernie_m.md
+++ b/docs/source/en/model_doc/ernie_m.md
@ -1,97 +0,0 @@
-<!--Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-12-31 and added to Hugging Face Transformers on 2023-06-20.*
-
-# ErnieM
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The ErnieM model was proposed in [ERNIE-M: Enhanced Multilingual Representation by Aligning
-Cross-lingual Semantics with Monolingual Corpora](https://huggingface.co/papers/2012.15674)  by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun,
-Hao Tian, Hua Wu, Haifeng Wang.
-
-The abstract from the paper is the following:
-
-*Recent studies have demonstrated that pre-trained cross-lingual models achieve impressive performance in downstream cross-lingual tasks. This improvement benefits from learning a large amount of monolingual and parallel corpora. Although it is generally acknowledged that parallel corpora are critical for improving the model performance, existing methods are often constrained by the size of parallel corpora, especially for lowresource languages. In this paper, we propose ERNIE-M, a new training method that encourages the model to align the representation of multiple languages with monolingual corpora, to overcome the constraint that the parallel corpus size places on the model performance. Our key insight is to integrate back-translation into the pre-training process. We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks.*
-This model was contributed by [Susnato Dhar](https://huggingface.co/susnato). The original code can be found [here](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/paddlenlp/transformers/ernie_m).
-
-## Usage tips
-
- Ernie-M is a BERT-like model so it is a stacked Transformer Encoder.
- Instead of using MaskedLM for pretraining (like BERT) the authors used two novel techniques: `Cross-attention Masked Language Modeling` and `Back-translation Masked Language Modeling`. For now these two LMHead objectives are not implemented here.
- It is a multilingual language model.
- Next Sentence Prediction was not used in pretraining process.
-
-## Resources
-
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Multiple choice task guide](../tasks/multiple_choice)
-
-## ErnieMConfig
-
-[[autodoc]] ErnieMConfig
-
-## ErnieMTokenizer
-
-[[autodoc]] ErnieMTokenizer
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
-
-## ErnieMModel
-
-[[autodoc]] ErnieMModel
-    - forward
-
-## ErnieMForSequenceClassification
-
-[[autodoc]] ErnieMForSequenceClassification
-    - forward
-
-## ErnieMForMultipleChoice
-
-[[autodoc]] ErnieMForMultipleChoice
-    - forward
-
-## ErnieMForTokenClassification
-
-[[autodoc]] ErnieMForTokenClassification
-    - forward
-
-## ErnieMForQuestionAnswering
-
-[[autodoc]] ErnieMForQuestionAnswering
-    - forward
-
-## ErnieMForInformationExtraction
-
-[[autodoc]] ErnieMForInformationExtraction
-    - forward
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -75,11 +75,11 @@ A processor requires an image_processor and a tokenizer. Hence, inputs can be lo
 from PIL import Image
 from transformers import AutoTokenizer
 from transformers.models.fuyu.processing_fuyu import FuyuProcessor
-from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
+from transformers.models.fuyu.image_processing_fuyu_fast import FuyuImageProcessorFast


 tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
-image_processor = FuyuImageProcessor()
+image_processor = FuyuImageProcessorFast()


 processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
@ -118,6 +118,11 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
 [[autodoc]] FuyuImageProcessor
    - __call__

+## FuyuImageProcessor
+
+[[autodoc]] FuyuImageProcessorFast
+    - __call__
+
 ## FuyuProcessor

 [[autodoc]] FuyuProcessor
--- a/docs/source/en/model_doc/glm46v.md
+++ b/docs/source/en/model_doc/glm46v.md
@ -0,0 +1,34 @@
+# GLM-4.6V
+
+## Glm46VConfig
+
+[[autodoc]] Glm46VConfig
+
+## Glm46VImageProcessor
+
+[[autodoc]] Glm46VImageProcessor
+    - preprocess
+
+## Glm46VVideoProcessor
+
+[[autodoc]] Glm46VVideoProcessor
+    - preprocess
+
+## Glm46VImageProcessorFast
+
+[[autodoc]] Glm46VImageProcessorFast
+    - preprocess
+
+## Glm46VProcessor
+
+[[autodoc]] Glm46VProcessor
+
+## Glm46VModel
+
+[[autodoc]] Glm46VModel
+    - forward
+
+## Glm46VForConditionalGeneration
+
+[[autodoc]] Glm46VForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/glm4v.md
+++ b/docs/source/en/model_doc/glm4v.md
@ -170,6 +170,11 @@ print(output_text)

 [[autodoc]] Glm4vConfig

+
+## Glm4vVisionConfig
+
+[[autodoc]] Glm4vVisionConfig
+
 ## Glm4vTextConfig

 [[autodoc]] Glm4vTextConfig
@ -193,6 +198,11 @@ print(output_text)

 [[autodoc]] Glm4vProcessor

+## Glm4vVisionModel
+
+[[autodoc]] Glm4vVisionModel
+    - forward
+
 ## Glm4vTextModel

 [[autodoc]] Glm4vTextModel
--- a/docs/source/en/model_doc/glm4v_moe.md
+++ b/docs/source/en/model_doc/glm4v_moe.md
@ -22,7 +22,7 @@ rendered properly in your Markdown viewer.
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">    </div>
 </div>

-# Glm4vMoe
+# Glm4vMoeMoe

 ## Overview

@ -48,10 +48,20 @@ The model also introduces a **Thinking Mode** switch, allowing users to balance

 [[autodoc]] Glm4vMoeConfig

+
+## Glm4vMoeVisionConfig
+
+[[autodoc]] Glm4vMoeVisionConfig
+
 ## Glm4vMoeTextConfig

 [[autodoc]] Glm4vMoeTextConfig

+## Glm4vMoeVisionModel
+
+[[autodoc]] Glm4vMoeVisionModel
+    - forward
+
 ## Glm4vMoeTextModel

 [[autodoc]] Glm4vMoeTextModel
@ -65,4 +75,4 @@ The model also introduces a **Thinking Mode** switch, allowing users to balance
 ## Glm4vMoeForConditionalGeneration

 [[autodoc]] Glm4vMoeForConditionalGeneration
-    - forward
+    - forward
--- a/docs/source/en/model_doc/glpn.md
+++ b/docs/source/en/model_doc/glpn.md
@ -61,6 +61,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 [[autodoc]] GLPNImageProcessor
    - preprocess

+## GLPNImageProcessorFast
+
+[[autodoc]] GLPNImageProcessorFast
+    - preprocess
+
 ## GLPNModel

 [[autodoc]] GLPNModel
--- a/docs/source/en/model_doc/gptsan-japanese.md
+++ b/docs/source/en/model_doc/gptsan-japanese.md
@ -1,145 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2023-02-07 and added to Hugging Face Transformers on 2023-06-20.*
-
-# GPTSAN-japanese
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The [GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) model was released in the repository by Toshiyuki Sakamoto (tanreinama).
-
-GPTSAN is a Japanese language model using Switch Transformer. It has the same structure as the model introduced as Prefix LM
-in the T5 paper, and support both Text Generation and Masked Language Modeling tasks. These basic tasks similarly can
-fine-tune for translation or summarization.
-
-### Usage example
-
-The `generate()` method can be used to generate text using GPTSAN-Japanese model.
-
-```python
->>> from transformers import AutoModel, AutoTokenizer
-from accelerate import Accelerator
->>> import torch
-
->>> device = Accelerator().device
->>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
->>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
->>> x_tok = tokenizer("は、", prefix_text="織田信長", return_tensors="pt")
->>> torch.manual_seed(0)
->>> gen_tok = model.generate(x_tok.input_ids.to(model.device), token_type_ids=x_tok.token_type_ids.to(model.device), max_new_tokens=20)
->>> tokenizer.decode(gen_tok[0])
-'織田信長は、2004年に『戦国BASARA』のために、豊臣秀吉'
-```
-
-## GPTSAN Features
-
-GPTSAN has some unique features. It has a model structure of Prefix-LM. It works as a shifted Masked Language Model for Prefix Input tokens. Un-prefixed inputs behave like normal generative models.
-The Spout vector is a GPTSAN specific input. Spout is pre-trained with random inputs, but you can specify a class of text or an arbitrary vector during fine-tuning. This allows you to indicate the tendency of the generated text.
-GPTSAN has a sparse Feed Forward based on Switch-Transformer. You can also add other layers and train them partially. See the original GPTSAN repository for details.
-
-### Prefix-LM Model
-
-GPTSAN has the structure of the model named Prefix-LM in the `T5` paper. (The original GPTSAN repository calls it `hybrid`)
-In GPTSAN, the `Prefix` part of Prefix-LM, that is, the input position that can be referenced by both tokens, can be specified with any length.
-Arbitrary lengths can also be specified differently for each batch.
-This length applies to the text entered in `prefix_text` for the tokenizer.
-The tokenizer returns the mask of the `Prefix` part of Prefix-LM as `token_type_ids`.
-The model treats the part where `token_type_ids` is 1 as a `Prefix` part, that is, the input can refer to both tokens before and after.
-
-## Usage tips
-
-Specifying the Prefix part is done with a mask passed to self-attention.
-When token_type_ids=None or all zero, it is equivalent to regular causal mask
-
-for example:
-
->>> x_token = tokenizer("ｱｲｳｴ")
-
-```text
-input_ids:      | SOT | SEG | ｱ | ｲ | ｳ | ｴ |
-token_type_ids: | 1   | 0   | 0 | 0 | 0 | 0 |
-prefix_lm_mask:
-SOT | 1 0 0 0 0 0 |
-SEG | 1 1 0 0 0 0 |
-ｱ   | 1 1 1 0 0 0 |
-ｲ   | 1 1 1 1 0 0 |
-ｳ   | 1 1 1 1 1 0 |
-ｴ   | 1 1 1 1 1 1 |
-```
-
->>> x_token = tokenizer("", prefix_text="ｱｲｳｴ")
-
-```text
-input_ids:      | SOT | ｱ | ｲ | ｳ | ｴ | SEG |
-token_type_ids: | 1   | 1 | 1 | 1 | 1 | 0  |
-prefix_lm_mask:
-SOT | 1 1 1 1 1 0 |
-ｱ   | 1 1 1 1 1 0 |
-ｲ   | 1 1 1 1 1 0 |
-ｳ   | 1 1 1 1 1 0 |
-ｴ   | 1 1 1 1 1 0 |
-SEG | 1 1 1 1 1 1 |
-```
-
->>> x_token = tokenizer("ｳｴ", prefix_text="ｱｲ")
-
-```text
-input_ids:      | SOT | ｱ | ｲ | SEG | ｳ | ｴ |
-token_type_ids: | 1   | 1 | 1 | 0   | 0 | 0 |
-prefix_lm_mask:
-SOT | 1 1 1 0 0 0 |
-ｱ   | 1 1 1 0 0 0 |
-ｲ   | 1 1 1 0 0 0 |
-SEG | 1 1 1 1 0 0 |
-ｳ   | 1 1 1 1 1 0 |
-ｴ   | 1 1 1 1 1 1 |
-```
-
-### Spout Vector
-
-A Spout Vector is a special vector for controlling text generation.
-This vector is treated as the first embedding in self-attention to bring extraneous attention to the generated tokens.
-In the pre-trained model published from `Tanrei/GPTSAN-japanese`, the Spout Vector is a 128-dimensional vector that passes through 8 fully connected layers in the model and is projected into the space acting as external attention.
-The Spout Vector projected by the fully connected layer is split to be passed to all self-attentions.
-
-## GPTSanJapaneseConfig
-
-[[autodoc]] GPTSanJapaneseConfig
-
-## GPTSanJapaneseTokenizer
-
-[[autodoc]] GPTSanJapaneseTokenizer
-
-## GPTSanJapaneseModel
-
-[[autodoc]] GPTSanJapaneseModel
-
-## GPTSanJapaneseForConditionalGeneration
-
-[[autodoc]] GPTSanJapaneseForConditionalGeneration
-    - forward
--- a/docs/source/en/model_doc/graphormer.md
+++ b/docs/source/en/model_doc/graphormer.md
@ -1,60 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team and Microsoft. All rights reserved.
-
-Licensed under the MIT License; you may not use this file except in compliance with
-the License.
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2021-06-09 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Graphormer
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The Graphormer model was proposed in [Do Transformers Really Perform Bad for Graph Representation?](https://huggingface.co/papers/2106.05234)  by
-Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen and Tie-Yan Liu. It is a Graph Transformer model, modified to allow computations on graphs instead of text sequences by generating embeddings and features of interest during preprocessing and collation, then using a modified attention.
-
-The abstract from the paper is the following:
-
-*The Transformer architecture has become a dominant choice in many domains, such as natural language processing and computer vision. Yet, it has not achieved competitive performance on popular leaderboards of graph-level prediction compared to mainstream GNN variants. Therefore, it remains a mystery how Transformers could perform well for graph representation learning. In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as the special cases of Graphormer.*
-
-This model was contributed by [clefourrier](https://huggingface.co/clefourrier). The original code can be found [here](https://github.com/microsoft/Graphormer).
-
-## Usage tips
-
-This model will not work well on large graphs (more than 100 nodes/edges), as it will make the memory explode.
-You can reduce the batch size, increase your RAM, or decrease the `UNREACHABLE_NODE_DISTANCE` parameter in algos_graphormer.pyx, but it will be hard to go above 700 nodes/edges.
-
-This model does not use a tokenizer, but instead a special collator during training.
-
-## GraphormerConfig
-
-[[autodoc]] GraphormerConfig
-
-## GraphormerModel
-
-[[autodoc]] GraphormerModel
-    - forward
-
-## GraphormerForGraphClassification
-
-[[autodoc]] GraphormerForGraphClassification
-    - forward
--- a/docs/source/en/model_doc/jukebox.md
+++ b/docs/source/en/model_doc/jukebox.md
@ -1,99 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-04-30 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Jukebox
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The Jukebox model was proposed in [Jukebox: A generative model for music](https://huggingface.co/papers/2005.00341)
-by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford,
-Ilya Sutskever. It introduces a generative music model which can produce minute long samples that can be conditioned on
-an artist, genres and lyrics.
-
-The abstract from the paper is the following:
-
-*We introduce Jukebox, a model that generates music with singing in the raw audio domain. We tackle the long context of raw audio using a multiscale VQ-VAE to compress it to discrete codes, and modeling those using autoregressive Transformers. We show that the combined model at scale can generate high-fidelity and diverse songs with coherence up to multiple minutes. We can condition on artist and genre to steer the musical and vocal style, and on unaligned lyrics to make the singing more controllable. We are releasing thousands of non cherry-picked samples, along with model weights and code.*
-
-As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://huggingface.co/papers/1904.10509), modified to support longer context length.
-First, a autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The`AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
-The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data.  The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
-
-![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
-
-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
-The original code can be found [here](https://github.com/openai/jukebox).
-
-## Usage tips
-
- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the hugging face trainer!
- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order automaticallay handle the device on which the model should execute, use `accelerate`.
- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive : we sample starting from `0`.
- Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
-
-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
-The original code can be found [here](https://github.com/openai/jukebox).
-
-## JukeboxConfig
-
-[[autodoc]] JukeboxConfig
-
-## JukeboxPriorConfig
-
-[[autodoc]] JukeboxPriorConfig
-
-## JukeboxVQVAEConfig
-
-[[autodoc]] JukeboxVQVAEConfig
-
-## JukeboxTokenizer
-
-[[autodoc]] JukeboxTokenizer
-    - save_vocabulary
-
-## JukeboxModel
-
-[[autodoc]] JukeboxModel
-    - ancestral_sample
-    - primed_sample
-    - continue_sample
-    - upsample
-    - _sample
-
-## JukeboxPrior
-
-[[autodoc]] JukeboxPrior
-    - sample
-    - forward
-
-## JukeboxVQVAE
-
-[[autodoc]] JukeboxVQVAE
-    - forward
-    - encode
-    - decode
--- a/docs/source/en/model_doc/mctct.md
+++ b/docs/source/en/model_doc/mctct.md
@ -1,84 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2021-10-30 and added to Hugging Face Transformers on 2023-06-20.*
-
-# M-CTC-T
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, so we won't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The M-CTC-T model was proposed in [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://huggingface.co/papers/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert. The model is a 1B-param transformer encoder, with a CTC head over 8065 character labels and a language identification head over 60 language ID labels. It is trained on Common Voice (version 6.1, December 2020 release) and VoxPopuli. After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). The model takes as input Mel filterbank features from a 16Khz audio signal.
-
-The abstract from the paper is the following:
-
-*Semi-supervised learning through pseudo-labeling has become a staple of state-of-the-art monolingual
-speech recognition systems. In this work, we extend pseudo-labeling to massively multilingual speech
-recognition with 60 languages. We propose a simple pseudo-labeling recipe that works well even
-with low-resource languages: train a supervised multilingual model, fine-tune it with semi-supervised
-learning on a target language, generate pseudo-labels for that language, and train a final model using
-pseudo-labels for all languages, either from scratch or by fine-tuning. Experiments on the labeled
-Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better
-performance for many languages that also transfers well to LibriSpeech.*
-
-This model was contributed by [cwkeam](https://huggingface.co/cwkeam). The original code can be found [here](https://github.com/flashlight/wav2letter/tree/main/recipes/mling_pl).
-
-## Usage tips
-
-The PyTorch version of this model is only available in torch 1.9 and higher.
-
-## Resources
-
- [Automatic speech recognition task guide](../tasks/asr)
-
-## MCTCTConfig
-
-[[autodoc]] MCTCTConfig
-
-## MCTCTFeatureExtractor
-
-[[autodoc]] MCTCTFeatureExtractor
-    - __call__
-
-## MCTCTProcessor
-
-[[autodoc]] MCTCTProcessor
-    - __call__
-    - from_pretrained
-    - save_pretrained
-    - batch_decode
-    - decode
-
-## MCTCTModel
-
-[[autodoc]] MCTCTModel
-    - forward
-
-## MCTCTForCTC
-
-[[autodoc]] MCTCTForCTC
-    - forward
--- a/docs/source/en/model_doc/mega.md
+++ b/docs/source/en/model_doc/mega.md
@ -1,94 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-09-21 and added to Hugging Face Transformers on 2023-06-20.*
-
-# MEGA
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The MEGA model was proposed in [Mega: Moving Average Equipped Gated Attention](https://huggingface.co/papers/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
-MEGA proposes a new approach to self-attention with each encoder layer having a multi-headed exponential moving average in addition to a single head of standard dot-product attention, giving the attention mechanism
-stronger positional biases. This allows MEGA to perform competitively to Transformers on standard benchmarks including LRA
-while also having significantly fewer parameters. MEGA's compute efficiency allows it to scale to very long sequences, making it an
-attractive option for long-document NLP tasks.
-
-The abstract from the paper is the following:
-
- *The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models.*
-
-This model was contributed by [mnaylor](https://huggingface.co/mnaylor).
-The original code can be found [here](https://github.com/facebookresearch/mega).
-
-## Usage tips
-
- MEGA can perform quite well with relatively few parameters. See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. If using MEGA as a decoder, be sure to set `bidirectional=False` to avoid errors with default bidirectional.
- Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size
-
-## Implementation Notes
-
- The original implementation of MEGA had an inconsistent expectation of attention masks for padding and causal self-attention between the softmax attention and Laplace/squared ReLU method. This implementation addresses that inconsistency.
- The original implementation did not include token type embeddings; this implementation adds support for these, with the option controlled by MegaConfig.add_token_type_embeddings
-
-## MegaConfig
-
-[[autodoc]] MegaConfig
-
-## MegaModel
-
-[[autodoc]] MegaModel
-    - forward
-
-## MegaForCausalLM
-
-[[autodoc]] MegaForCausalLM
-    - forward
-
-## MegaForMaskedLM
-
-[[autodoc]] MegaForMaskedLM
-    - forward
-
-## MegaForSequenceClassification
-
-[[autodoc]] MegaForSequenceClassification
-    - forward
-
-## MegaForMultipleChoice
-
-[[autodoc]] MegaForMultipleChoice
-    - forward
-
-## MegaForTokenClassification
-
-[[autodoc]] MegaForTokenClassification
-    - forward
-
-## MegaForQuestionAnswering
-
-[[autodoc]] MegaForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/nat.md
+++ b/docs/source/en/model_doc/nat.md
@ -1,101 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-04-14 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Neighborhood Attention Transformer
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-NAT was proposed in [Neighborhood Attention Transformer](https://huggingface.co/papers/2204.07143)
-by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
-
-It is a hierarchical vision transformer based on Neighborhood Attention, a sliding-window self attention pattern.
-
-The abstract from the paper is the following:
-
-*We present Neighborhood Attention (NA), the first efficient and scalable sliding-window attention mechanism for vision.
-NA is a pixel-wise operation, localizing self attention (SA) to the nearest neighboring pixels, and therefore enjoys a
-linear time and space complexity compared to the quadratic complexity of SA. The sliding-window pattern allows NA's
-receptive field to grow without needing extra pixel shifts, and preserves translational equivariance, unlike
-Swin Transformer's Window Self Attention (WSA). We develop NATTEN (Neighborhood Attention Extension), a Python package
-with efficient C++ and CUDA kernels, which allows NA to run up to 40% faster than Swin's WSA while using up to 25% less
-memory. We further present Neighborhood Attention Transformer (NAT), a new hierarchical transformer design based on NA
-that boosts image classification and downstream vision performance. Experimental results on NAT are competitive;
-NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9%
-ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size.*
-
-<img
-src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/neighborhood-attention-pattern.jpg"
-alt="drawing" width="600"/>
-
-<small> Neighborhood Attention compared to other attention patterns.
-Taken from the <a href="https://huggingface.co/papers/2204.07143">original paper</a>.</small>
-
-This model was contributed by [Ali Hassani](https://huggingface.co/alihassanijr).
-The original code can be found [here](https://github.com/SHI-Labs/Neighborhood-Attention-Transformer).
-
-## Usage tips
-
- One can use the [`AutoImageProcessor`] API to prepare images for the model.
- NAT can be used as a *backbone*. When `output_hidden_states = True`,
-it will output both `hidden_states` and `reshaped_hidden_states`.
-The `reshaped_hidden_states` have a shape of `(batch, num_channels, height, width)` rather than
-`(batch_size, height, width, num_channels)`.
-
-Notes:
-
- NAT depends on [NATTEN](https://github.com/SHI-Labs/NATTEN/)'s implementation of Neighborhood Attention.
-You can install it with pre-built wheels for Linux by referring to [shi-labs.com/natten](https://shi-labs.com/natten),
-or build on your system by running `pip install natten`.
-Note that the latter will likely take time to compile. NATTEN does not support Windows devices yet.
- Patch size of 4 is only supported at the moment.
-
-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with NAT.
-
-<PipelineTag pipeline="image-classification"/>
-
- [`NatForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-## NatConfig
-
-[[autodoc]] NatConfig
-
-## NatModel
-
-[[autodoc]] NatModel
-    - forward
-
-## NatForImageClassification
-
-[[autodoc]] NatForImageClassification
-    - forward
--- a/docs/source/en/model_doc/nezha.md
+++ b/docs/source/en/model_doc/nezha.md
@ -1,101 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2019-08-31 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Nezha
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The Nezha model was proposed in [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://huggingface.co/papers/1909.00204) by Junqiu Wei et al.
-
-The abstract from the paper is the following:
-
-*The pre-trained language models have achieved great successes in various natural language understanding (NLU) tasks
-due to its capacity to capture the deep contextualized information in text by pre-training on large-scale corpora.
-In this technical report, we present our practice of pre-training language models named NEZHA (NEural contextualiZed
-representation for CHinese lAnguage understanding) on Chinese corpora and finetuning for the Chinese NLU tasks.
-The current version of NEZHA is based on BERT with a collection of proven improvements, which include Functional
-Relative Positional Encoding as an effective positional encoding scheme, Whole Word Masking strategy,
-Mixed Precision Training and the LAMB Optimizer in training the models. The experimental results show that NEZHA
-achieves the state-of-the-art performances when finetuned on several representative Chinese tasks, including
-named entity recognition (People's Daily NER), sentence matching (LCQMC), Chinese sentiment classification (ChnSenti)
-and natural language inference (XNLI).*
-
-This model was contributed by [sijunhe](https://huggingface.co/sijunhe). The original code can be found [here](https://github.com/huawei-noah/Pretrained-Language-Model/tree/master/NEZHA-PyTorch).
-
-## Resources
-
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
-
-## NezhaConfig
-
-[[autodoc]] NezhaConfig
-
-## NezhaModel
-
-[[autodoc]] NezhaModel
-    - forward
-
-## NezhaForPreTraining
-
-[[autodoc]] NezhaForPreTraining
-    - forward
-
-## NezhaForMaskedLM
-
-[[autodoc]] NezhaForMaskedLM
-    - forward
-
-## NezhaForNextSentencePrediction
-
-[[autodoc]] NezhaForNextSentencePrediction
-    - forward
-
-## NezhaForSequenceClassification
-
-[[autodoc]] NezhaForSequenceClassification
-    - forward
-
-## NezhaForMultipleChoice
-
-[[autodoc]] NezhaForMultipleChoice
-    - forward
-
-## NezhaForTokenClassification
-
-[[autodoc]] NezhaForTokenClassification
-    - forward
-
-## NezhaForQuestionAnswering
-
-[[autodoc]] NezhaForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/open-llama.md
+++ b/docs/source/en/model_doc/open-llama.md
@ -1,66 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2023-04-16 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Open-Llama
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.31.0.
-You can do so by running the following command: `pip install -U transformers==4.31.0`.
-
-</Tip>
-
-<Tip warning={true}>
-
-This model differs from the [OpenLLaMA models](https://huggingface.co/models?search=openllama) on the Hugging Face Hub, which primarily use the [LLaMA](llama) architecture.
-
-</Tip>
-
-## Overview
-
-The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.
-
-The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
-And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.
-
-This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
-The original code was released on GitHub by [s-JoL](https://github.com/s-JoL), but is now removed.
-
-## OpenLlamaConfig
-
-[[autodoc]] OpenLlamaConfig
-
-## OpenLlamaModel
-
-[[autodoc]] OpenLlamaModel
-    - forward
-
-## OpenLlamaForCausalLM
-
-[[autodoc]] OpenLlamaForCausalLM
-    - forward
-
-## OpenLlamaForSequenceClassification
-
-[[autodoc]] OpenLlamaForSequenceClassification
-    - forward
--- a/docs/source/en/model_doc/qdqbert.md
+++ b/docs/source/en/model_doc/qdqbert.md
@ -1,183 +0,0 @@
-<!--Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-04-20 and added to Hugging Face Transformers on 2023-06-20.*
-
-# QDQBERT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The QDQBERT model can be referenced in [Integer Quantization for Deep Learning Inference: Principles and Empirical
-Evaluation](https://huggingface.co/papers/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius
-Micikevicius.
-
-The abstract from the paper is the following:
-
-*Quantization techniques can reduce the size of Deep Neural Networks and improve inference latency and throughput by
-taking advantage of high throughput integer instructions. In this paper we review the mathematical aspects of
-quantization parameters and evaluate their choices on a wide range of neural network models for different application
-domains, including vision, speech, and language. We focus on quantization techniques that are amenable to acceleration
-by processors with high-throughput integer math pipelines. We also present a workflow for 8-bit quantization that is
-able to maintain accuracy within 1% of the floating-point baseline on all networks studied, including models that are
-more difficult to quantize, such as MobileNets and BERT-large.*
-
-This model was contributed by [shangz](https://huggingface.co/shangz).
-
-## Usage tips
-
- QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to (i) linear layer
-  inputs and weights, (ii) matmul inputs, (iii) residual add inputs, in BERT model.
- QDQBERT requires the dependency of [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). To install `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
- QDQBERT model can be loaded from any checkpoint of HuggingFace BERT model (for example *google-bert/bert-base-uncased*), and
-  perform Quantization Aware Training/Post Training Quantization.
- A complete example of using QDQBERT model to perform Quatization Aware Training and Post Training Quantization for
-  SQUAD task can be found at https://github.com/huggingface/transformers-research-projects/tree/main/quantization-qdqbert.
-
-### Set default quantizers
-
-QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by
-`TensorQuantizer` in [Pytorch Quantization Toolkit](https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization). `TensorQuantizer` is the module
-for quantizing tensors, with `QuantDescriptor` defining how the tensor should be quantized. Refer to [Pytorch
-Quantization Toolkit userguide](https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/userguide.html) for more details.
-
-Before creating QDQBERT model, one has to set the default `QuantDescriptor` defining default tensor quantizers.
-
-Example:
-
-```python
->>> import pytorch_quantization.nn as quant_nn
->>> from pytorch_quantization.tensor_quant import QuantDescriptor
-
->>> # The default tensor quantizer is set to use Max calibration method
->>> input_desc = QuantDescriptor(num_bits=8, calib_method="max")
->>> # The default tensor quantizer is set to be per-channel quantization for weights
->>> weight_desc = QuantDescriptor(num_bits=8, axis=((0,)))
->>> quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
->>> quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
-```
-
-### Calibration
-
-Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for
-tensors. After setting up the tensor quantizers, one can use the following example to calibrate the model:
-
-```python
->>> # Find the TensorQuantizer and enable calibration
->>> for name, module in model.named_modules():
-...     if name.endswith("_input_quantizer"):
-...         module.enable_calib()
-...         module.disable_quant()  # Use full precision data to calibrate
-
->>> # Feeding data samples
->>> model(x)
->>> # ...
-
->>> # Finalize calibration
->>> for name, module in model.named_modules():
-...     if name.endswith("_input_quantizer"):
-...         module.load_calib_amax()
-...         module.enable_quant()
-
->>> # If running on accelerator, it needs to call `.to(xx)` again because new tensors will be created by calibration process
->>> from accelerate import Accelerator
->>> device = Accelerator().device
->>> model.to(device)
-
->>> # Keep running the quantized model
->>> # ...
-```
-
-### Export to ONNX
-
-The goal of exporting to ONNX is to deploy inference by [TensorRT](https://developer.nvidia.com/tensorrt). Fake
-quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. After setting static member of
-TensorQuantizer to use Pytorch's own fake quantization functions, fake quantized model can be exported to ONNX, follow
-the instructions in [torch.onnx](https://pytorch.org/docs/stable/onnx.html). Example:
-
-```python
->>> from pytorch_quantization.nn import TensorQuantizer
-
->>> TensorQuantizer.use_fb_fake_quant = True
-
->>> # Load the calibrated model
->>> ...
->>> # ONNX export
->>> torch.onnx.export(...)
-```
-
-## Resources
-
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
-
-## QDQBertConfig
-
-[[autodoc]] QDQBertConfig
-
-## QDQBertModel
-
-[[autodoc]] QDQBertModel
-    - forward
-
-## QDQBertLMHeadModel
-
-[[autodoc]] QDQBertLMHeadModel
-    - forward
-
-## QDQBertForMaskedLM
-
-[[autodoc]] QDQBertForMaskedLM
-    - forward
-
-## QDQBertForSequenceClassification
-
-[[autodoc]] QDQBertForSequenceClassification
-    - forward
-
-## QDQBertForNextSentencePrediction
-
-[[autodoc]] QDQBertForNextSentencePrediction
-    - forward
-
-## QDQBertForMultipleChoice
-
-[[autodoc]] QDQBertForMultipleChoice
-    - forward
-
-## QDQBertForTokenClassification
-
-[[autodoc]] QDQBertForTokenClassification
-    - forward
-
-## QDQBertForQuestionAnswering
-
-[[autodoc]] QDQBertForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/qwen2_5_omni.md
+++ b/docs/source/en/model_doc/qwen2_5_omni.md
@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    video_fps=1,
+    fps=1,

    # kwargs to be passed to `Qwen2-5-OmniProcessor`
    padding=True,
@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    video_fps=1,
+    fps=1,

    # kwargs to be passed to `Qwen2-5-OmniProcessor`
    padding=True,
--- a/docs/source/en/model_doc/qwen2_audio.md
+++ b/docs/source/en/model_doc/qwen2_audio.md
@ -54,7 +54,7 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_co
 prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
 url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
 audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
-inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
+inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

 generate_ids = model.generate(**inputs, max_length=256)
 generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@ -63,7 +63,7 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_

 # We can also omit the audio_bos and audio_eos tokens
 prompt = "<|AUDIO|>Generate the caption in English:"
-inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
+inputs = processor(text=prompt, audio=audio, return_tensors="pt").to(model.device)

 generate_ids = model.generate(**inputs, max_length=256)
 generate_ids = generate_ids[:, inputs.input_ids.size(1):]
@ -106,7 +106,7 @@ for message in conversation:
                    sr=processor.feature_extractor.sampling_rate)[0]
                )

-inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
 inputs.input_ids = inputs.input_ids.to(model.device)

 generate_ids = model.generate(**inputs, max_length=256)
@ -156,7 +156,7 @@ for message in conversation:
                        sr=processor.feature_extractor.sampling_rate)[0]
                )

-inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
 inputs.input_ids = inputs.input_ids.to(model.device)

 generate_ids = model.generate(**inputs, max_length=256)
@ -213,7 +213,7 @@ for conversation in conversations:
                            sr=processor.feature_extractor.sampling_rate)[0]
                    )

-inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+inputs = processor(text=text, audio=audios, return_tensors="pt", padding=True)
 inputs['input_ids'] = inputs['input_ids'].to(model.device)
 inputs.input_ids = inputs.input_ids.to(model.device)

--- a/docs/source/en/model_doc/qwen3_omni_moe.md
+++ b/docs/source/en/model_doc/qwen3_omni_moe.md
@ -80,7 +80,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    video_fps=1,
+    fps=1,

    # kwargs to be passed to `Qwen3OmniMoeProcessor`
    padding=True,
@ -136,7 +136,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    video_fps=1,
+    fps=1,

    # kwargs to be passed to `Qwen3OmniMoeProcessor`
    padding=True,
@ -245,7 +245,7 @@ inputs = processor.apply_chat_template(
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
-    video_fps=1,
+    fps=1,

    # kwargs to be passed to `Qwen3OmniMoeProcessor`
    padding=True,
--- a/docs/source/en/model_doc/realm.md
+++ b/docs/source/en/model_doc/realm.md
@ -1,102 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-02-10 and added to Hugging Face Transformers on 2023-06-20.*
-
-# REALM
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The REALM model was proposed in [REALM: Retrieval-Augmented Language Model Pre-Training](https://huggingface.co/papers/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. It's a
-retrieval-augmented language model that firstly retrieves documents from a textual knowledge corpus and then
-utilizes retrieved documents to process question answering tasks.
-
-The abstract from the paper is the following:
-
-*Language model pre-training has been shown to capture a surprising amount of world knowledge, crucial for NLP tasks
-such as question answering. However, this knowledge is stored implicitly in the parameters of a neural network,
-requiring ever-larger networks to cover more facts. To capture knowledge in a more modular and interpretable way, we
-augment language model pre-training with a latent knowledge retriever, which allows the model to retrieve and attend
-over documents from a large corpus such as Wikipedia, used during pre-training, fine-tuning and inference. For the
-first time, we show how to pre-train such a knowledge retriever in an unsupervised manner, using masked language
-modeling as the learning signal and backpropagating through a retrieval step that considers millions of documents. We
-demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the
-challenging task of Open-domain Question Answering (Open-QA). We compare against state-of-the-art models for both
-explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous
-methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as
-interpretability and modularity.*
-
-This model was contributed by [qqaatw](https://huggingface.co/qqaatw). The original code can be found
-[here](https://github.com/google-research/language/tree/master/language/realm).
-
-## RealmConfig
-
-[[autodoc]] RealmConfig
-
-## RealmTokenizer
-
-[[autodoc]] RealmTokenizer
-    - build_inputs_with_special_tokens
-    - get_special_tokens_mask
-    - create_token_type_ids_from_sequences
-    - save_vocabulary
-    - batch_encode_candidates
-
-## RealmTokenizerFast
-
-[[autodoc]] RealmTokenizerFast
-    - batch_encode_candidates
-
-## RealmRetriever
-
-[[autodoc]] RealmRetriever
-
-## RealmEmbedder
-
-[[autodoc]] RealmEmbedder
-    - forward
-
-## RealmScorer
-
-[[autodoc]] RealmScorer
-    - forward
-
-## RealmKnowledgeAugEncoder
-
-[[autodoc]] RealmKnowledgeAugEncoder
-    - forward
-
-## RealmReader
-
-[[autodoc]] RealmReader
-    - forward
-
-## RealmForOpenQA
-
-[[autodoc]] RealmForOpenQA
-    - block_embedding_to
-    - forward
--- a/docs/source/en/model_doc/retribert.md
+++ b/docs/source/en/model_doc/retribert.md
@ -1,57 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-06-12 and added to Hugging Face Transformers on 2023-06-20.*
-
-# RetriBERT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, so we won't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The [RetriBERT](https://huggingface.co/yjernite/retribert-base-uncased/tree/main) model was proposed in the blog post [Explain Anything Like I'm Five: A Model for Open Domain Long Form
-Question Answering](https://yjernite.github.io/lfqa.html). RetriBERT is a small model that uses either a single or
-pair of BERT encoders with lower-dimension projection for dense semantic indexing of text.
-
-This model was contributed by [yjernite](https://huggingface.co/yjernite). Code to train and use the model can be
-found [here](https://github.com/huggingface/transformers/tree/main/examples/research-projects/distillation).
-
-## RetriBertConfig
-
-[[autodoc]] RetriBertConfig
-
-## RetriBertTokenizer
-
-[[autodoc]] RetriBertTokenizer
-
-## RetriBertTokenizerFast
-
-[[autodoc]] RetriBertTokenizerFast
-
-## RetriBertModel
-
-[[autodoc]] RetriBertModel
-    - forward
--- a/docs/source/en/model_doc/seamless_m4t.md
+++ b/docs/source/en/model_doc/seamless_m4t.md
@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
->>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
+>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

 >>> # now, process some English test as well
 >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
--- a/docs/source/en/model_doc/seamless_m4t_v2.md
+++ b/docs/source/en/model_doc/seamless_m4t_v2.md
@ -61,7 +61,7 @@ Here is how to use the processor to process text and audio:
 >>> audio_sample = next(iter(dataset))["audio"]

 >>> # now, process it
->>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
+>>> audio_inputs = processor(audio=audio_sample["array"], return_tensors="pt")

 >>> # now, process some English text as well
 >>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -159,7 +159,7 @@ conversation3 = [

 conversations = [conversation1, conversation2, conversation3]
 inputs = processor.apply_chat_template(
-    conversation,
+    conversations,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@ -1,133 +0,0 @@
-<!--Copyright 2021 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2021-04-14 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Speech2Text2
-
-  <Tip warning={true}>
-
-  This model is in maintenance mode only, we don't accept any new PRs changing its code.
-  If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-  You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-  </Tip>
-
-## Overview
-
-The Speech2Text2 model is used together with [Wav2Vec2](wav2vec2) for Speech Translation models proposed in
-[Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://huggingface.co/papers/2104.06678) by
-Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
-
-Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only*, such as
-[Wav2Vec2](wav2vec2) or [HuBERT](hubert) for Speech-to-Text tasks. Please refer to the
-[SpeechEncoderDecoder](speech-encoder-decoder) class on how to combine Speech2Text2 with any speech *encoder-only*
-model.
-
-This model was contributed by [Patrick von Platen](https://huggingface.co/patrickvonplaten).
-
-The original code can be found [here](https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266).
-
-## Usage tips
-
- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
-  the [official models](https://huggingface.co/models?other=speech2text2) .
- Speech2Text2 is always used within the [SpeechEncoderDecoder](speech-encoder-decoder) framework.
- Speech2Text2's tokenizer is based on [fastBPE](https://github.com/glample/fastBPE).
-
-## Inference
-
-Speech2Text2's [`SpeechEncoderDecoderModel`] model accepts raw waveform input values from speech and
-makes use of [`~generation.GenerationMixin.generate`] to translate the input speech
-autoregressively to the target language.
-
-The [`Wav2Vec2FeatureExtractor`] class is responsible for preprocessing the input speech and
-[`Speech2Text2Tokenizer`] decodes the generated target tokens to the target string. The
-[`Speech2Text2Processor`] wraps [`Wav2Vec2FeatureExtractor`] and
-[`Speech2Text2Tokenizer`] into a single instance to both extract the input features and decode the
-predicted token ids.
-
- Step-by-step Speech Translation
-
-```python
->>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
->>> from datasets import load_dataset
-
->>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
->>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
-
-
->>> def map_to_array(example):
-...     example["speech"] = example["audio"]["array"]
-...     return example
-
-
->>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> ds = ds.map(map_to_array)
-
->>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
->>> generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"])
-
->>> transcription = processor.batch_decode(generated_ids)
-```
-
- Speech Translation via Pipelines
-
-  The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code
-
-```python
->>> from datasets import load_dataset
->>> from transformers import pipeline
-
->>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
->>> asr = pipeline(
-...     "automatic-speech-recognition",
-...     model="facebook/s2t-wav2vec2-large-en-de",
-...     feature_extractor="facebook/s2t-wav2vec2-large-en-de",
-... )
-
->>> translation_de = asr(librispeech_en[0]["file"])
-```
-
-See [model hub](https://huggingface.co/models?filter=speech2text2) to look for Speech2Text2 checkpoints.
-
-## Resources
-
- [Causal language modeling task guide](../tasks/language_modeling)
-
-## Speech2Text2Config
-
-[[autodoc]] Speech2Text2Config
-
-## Speech2TextTokenizer
-
-[[autodoc]] Speech2Text2Tokenizer
-    - batch_decode
-    - decode
-    - save_vocabulary
-
-## Speech2Text2Processor
-
-[[autodoc]] Speech2Text2Processor
-    - __call__
-    - from_pretrained
-    - save_pretrained
-    - batch_decode
-    - decode
-
-## Speech2Text2ForCausalLM
-
-[[autodoc]] Speech2Text2ForCausalLM
-    - forward
--- a/docs/source/en/model_doc/tapex.md
+++ b/docs/source/en/model_doc/tapex.md
@ -1,155 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2021-07-16 and added to Hugging Face Transformers on 2023-06-20.*
-
-# TAPEX
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The TAPEX model was proposed in [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://huggingface.co/papers/2107.07653) by Qian Liu,
-Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou. TAPEX pre-trains a BART model to solve synthetic SQL queries, after
-which it can be fine-tuned to answer natural language questions related to tabular data, as well as performing table fact checking.
-
-TAPEX has been fine-tuned on several datasets:
-
- [SQA](https://www.microsoft.com/en-us/download/details.aspx?id=54253) (Sequential Question Answering by Microsoft)
- [WTQ](https://github.com/ppasupat/WikiTableQuestions) (Wiki Table Questions by Stanford University)
- [WikiSQL](https://github.com/salesforce/WikiSQL) (by Salesforce)
- [TabFact](https://tabfact.github.io/) (by USCB NLP Lab).
-
-The abstract from the paper is the following:
-
-*Recent progress in language model pre-training has achieved a great success via leveraging large-scale unstructured textual data. However, it is
-still a challenge to apply pre-training on structured tabular data due to the absence of large-scale high-quality tabular data. In this paper, we
-propose TAPEX to show that table pre-training can be achieved by learning a neural SQL executor over a synthetic corpus, which is obtained by automatically
-synthesizing executable SQL queries and their execution outputs. TAPEX addresses the data scarcity challenge via guiding the language model to mimic a SQL
-executor on the diverse, large-scale and high-quality synthetic corpus. We evaluate TAPEX on four benchmark datasets. Experimental results demonstrate that
-TAPEX outperforms previous table pre-training approaches by a large margin and achieves new state-of-the-art results on all of them. This includes improvements
-on the weakly-supervised WikiSQL denotation accuracy to 89.5% (+2.3%), the WikiTableQuestions denotation accuracy to 57.5% (+4.8%), the SQA denotation accuracy
-to 74.5% (+3.5%), and the TabFact accuracy to 84.2% (+3.2%). To our knowledge, this is the first work to exploit table pre-training via synthetic executable programs
-and to achieve new state-of-the-art results on various downstream tasks.*
-
-## Usage tips
-
- TAPEX is a generative (seq2seq) model. One can directly plug in the weights of TAPEX into a BART model.
- TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact.
- Sentences + tables are presented to the model as `sentence + " " + linearized table`. The linearized table has the following format:
-  `col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...`.
- TAPEX has its own tokenizer, that allows to prepare all data for the model easily. One can pass Pandas DataFrames and strings to the tokenizer,
-  and it will automatically create the `input_ids` and `attention_mask` (as shown in the usage examples below).
-
-### Usage: inference
-
-Below, we illustrate how to use TAPEX for table question answering. As one can see, one can directly plug in the weights of TAPEX into a BART model.
-We use the [Auto API](auto), which will automatically instantiate the appropriate tokenizer ([`TapexTokenizer`]) and model ([`BartForConditionalGeneration`]) for us,
-based on the configuration file of the checkpoint on the hub.
-
-```python
->>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
->>> import pandas as pd
-
->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq")
->>> model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq")
-
->>> # prepare table + question
->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
->>> table = pd.DataFrame.from_dict(data)
->>> question = "how many movies does Leonardo Di Caprio have?"
-
->>> encoding = tokenizer(table, question, return_tensors="pt")
-
->>> # let the model generate an answer autoregressively
->>> outputs = model.generate(**encoding)
-
->>> # decode back to text
->>> predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
->>> print(predicted_answer)
-53
-```
-
-Note that [`TapexTokenizer`] also supports batched inference. Hence, one can provide a batch of different tables/questions, or a batch of a single table
-and multiple questions, or a batch of a single query and multiple tables. Let's illustrate this:
-
-```python
->>> # prepare table + question
->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
->>> table = pd.DataFrame.from_dict(data)
->>> questions = [
-...     "how many movies does Leonardo Di Caprio have?",
-...     "which actor has 69 movies?",
-...     "what's the first name of the actor who has 87 movies?",
-... ]
->>> encoding = tokenizer(table, questions, padding=True, return_tensors="pt")
-
->>> # let the model generate an answer autoregressively
->>> outputs = model.generate(**encoding)
-
->>> # decode back to text
->>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
-[' 53', ' george clooney', ' brad pitt']
-```
-
-In case one wants to do table verification (i.e. the task of determining whether a given sentence is supported or refuted by the contents
-of a table), one can instantiate a [`BartForSequenceClassification`] model. TAPEX has checkpoints on the hub fine-tuned on TabFact, an important
-benchmark for table fact checking (it achieves 84% accuracy). The code example below again leverages the [Auto API](auto).
-
-```python
->>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
-
->>> tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
->>> model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact")
-
->>> # prepare table + sentence
->>> data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
->>> table = pd.DataFrame.from_dict(data)
->>> sentence = "George Clooney has 30 movies"
-
->>> encoding = tokenizer(table, sentence, return_tensors="pt")
-
->>> # forward pass
->>> outputs = model(**encoding)
-
->>> # print prediction
->>> predicted_class_idx = outputs.logits[0].argmax(dim=0).item()
->>> print(model.config.id2label[predicted_class_idx])
-Refused
-```
-
-<Tip>
-
-TAPEX architecture is the same as BART, except for tokenization. Refer to [BART documentation](bart) for information on
-configuration classes and their parameters. TAPEX-specific tokenizer is documented below.
-
-</Tip>
-
-## TapexTokenizer
-
-[[autodoc]] TapexTokenizer
-    - __call__
-    - save_vocabulary
--- a/docs/source/en/model_doc/trajectory_transformer.md
+++ b/docs/source/en/model_doc/trajectory_transformer.md
@ -1,66 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2021-06-03 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Trajectory Transformer
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, so we won't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The Trajectory Transformer model was proposed in [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://huggingface.co/papers/2106.02039)  by Michael Janner, Qiyang Li, Sergey Levine.
-
-The abstract from the paper is the following:
-
-*Reinforcement learning (RL) is typically concerned with estimating stationary policies or single-step models,
-leveraging the Markov property to factorize problems in time. However, we can also view RL as a generic sequence
-modeling problem, with the goal being to produce a sequence of actions that leads to a sequence of high rewards.
-Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well
-in other domains, such as natural-language processing, can also provide effective solutions to the RL problem.
-To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture
-to model distributions over trajectories and repurposing beam search as a planning algorithm. Framing RL as sequence
-modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common
-in offline RL algorithms. We demonstrate the flexibility of this approach across long-horizon dynamics prediction,
-imitation learning, goal-conditioned RL, and offline RL. Further, we show that this approach can be combined with
-existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks.*
-
-This model was contributed by [CarlCochet](https://huggingface.co/CarlCochet). The original code can be found [here](https://github.com/jannerm/trajectory-transformer).
-
-## Usage tips
-
-This Transformer is used for deep reinforcement learning. To use it, you need to create sequences from
-actions, states and rewards from all previous timesteps. This model will treat all these elements together
-as one big sequence (a trajectory).
-
-## TrajectoryTransformerConfig
-
-[[autodoc]] TrajectoryTransformerConfig
-
-## TrajectoryTransformerModel
-
-[[autodoc]] TrajectoryTransformerModel
-    - forward
--- a/docs/source/en/model_doc/transfo-xl.md
+++ b/docs/source/en/model_doc/transfo-xl.md
@ -1,136 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2019-01-09 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Transformer XL
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, so we won't accept any new PRs changing its code. This model was deprecated due to security issues linked to `pickle.load`.
-
-We recommend switching to more recent models for improved security.
-
-In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
-
-You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
-usage of `pickle.load()`:
-
-```python
-import os
-from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
-
-os.environ["TRUST_REMOTE_CODE"] = "True"
-
-checkpoint = 'transfo-xl/transfo-xl-wt103'
-revision = '40a186da79458c9f9de846edfaea79c412137f97'
-
-tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
-model = TransfoXLLMHeadModel.from_pretrained(checkpoint, revision=revision)
-```
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.35.0.
-You can do so by running the following command: `pip install -U transformers==4.35.0`.
-
-</Tip>
-
-<div class="flex flex-wrap space-x-1">
-<a href="https://huggingface.co/models?filter=transfo-xl">
-<img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
-</a>
-<a href="https://huggingface.co/spaces/docs-demos/transfo-xl-wt103">
-<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
-</a>
-</div>
-
-## Overview
-
-The Transformer-XL model was proposed in [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://huggingface.co/papers/1901.02860) by Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan
-Salakhutdinov. It's a causal (uni-directional) transformer with relative positioning (sinusoïdal) embeddings which can
-reuse previously computed hidden-states to attend to longer context (memory). This model also uses adaptive softmax
-inputs and outputs (tied).
-
-The abstract from the paper is the following:
-
-*Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
-setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
-beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
-novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
-context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
-longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
-times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
-bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
-Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
-coherent, novel text articles with thousands of tokens.*
-
-This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The original code can be found [here](https://github.com/kimiyoung/transformer-xl).
-
-## Usage tips
-
- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
-  original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
- Transformer-XL is one of the few models that has no sequence length limit.
- Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model.
- Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. This allows the model to pay attention to information that was in the previous segment as well as the current one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
- This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would give the same results in the current input and the current hidden state at a given position) and needs to make some adjustments in the way attention scores are computed.
-
-<Tip warning={true}>
-
-TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyTorch, see [issue #36035](https://github.com/pytorch/pytorch/issues/36035)
-
-</Tip>
-
-## Resources
-
- [Text classification task guide](../tasks/sequence_classification)
- [Causal language modeling task guide](../tasks/language_modeling)
-
-## TransfoXLConfig
-
-[[autodoc]] TransfoXLConfig
-
-## TransfoXLTokenizer
-
-[[autodoc]] TransfoXLTokenizer
-    - save_vocabulary
-
-## TransfoXL specific outputs
-
-[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
-
-[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
-
-## TransfoXLModel
-
-[[autodoc]] TransfoXLModel
-    - forward
-
-## TransfoXLLMHeadModel
-
-[[autodoc]] TransfoXLLMHeadModel
-    - forward
-
-## TransfoXLForSequenceClassification
-
-[[autodoc]] TransfoXLForSequenceClassification
-    - forward
-
-## Internal Layers
-
-[[autodoc]] AdaptiveEmbedding
--- a/docs/source/en/model_doc/tvlt.md
+++ b/docs/source/en/model_doc/tvlt.md
@ -1,90 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-09-28 and added to Hugging Face Transformers on 2023-06-20.*
-
-# TVLT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The TVLT model was proposed in [TVLT: Textless Vision-Language Transformer](https://huggingface.co/papers/2209.14156)
-by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal (the first three authors contributed equally). The Textless Vision-Language Transformer (TVLT) is a model that uses raw visual and audio inputs for vision-and-language representation learning, without using text-specific modules such as tokenization or automatic speech recognition (ASR). It can perform various audiovisual and vision-language tasks like retrieval, question answering, etc.
-
-The abstract from the paper is the following:
-
-*In this work, we present the Textless Vision-Language Transformer (TVLT), where homogeneous transformer blocks take raw visual and audio inputs for vision-and-language representation learning with minimal modality-specific design, and do not use text-specific modules such as tokenization or automatic speech recognition (ASR). TVLT is trained by reconstructing masked patches of continuous video frames and audio spectrograms (masked autoencoding) and contrastive modeling to align video and audio. TVLT attains performance comparable to its text-based counterpart on various multimodal tasks, such as visual question answering, image retrieval, video retrieval, and multimodal sentiment analysis, with 28x faster inference speed and only 1/3 of the parameters. Our findings suggest the possibility of learning compact and efficient visual-linguistic representations from low-level visual and audio signals without assuming the prior existence of text.*
-
-<p align="center">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvlt_architecture.png"
-alt="drawing" width="600"/>
-</p>
-
-<small> TVLT architecture. Taken from the <a href="[https://huggingface.co/papers/2102.03334](https://huggingface.co/papers/2209.14156)">original paper</a>. </small>
-
-The original code can be found [here](https://github.com/zinengtang/TVLT). This model was contributed by [Zineng Tang](https://huggingface.co/ZinengTang).
-
-## Usage tips
-
- TVLT is a model that takes both `pixel_values` and `audio_values` as input. One can use [`TvltProcessor`] to prepare data for the model.
-  This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one.
- TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. To make batching of videos and audios possible, the authors use a `pixel_mask` that indicates which pixels are real/padding and `audio_mask` that indicates which audio values are real/padding.
- The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in [ViTMAE](vitmae). The difference is that the model includes embedding layers for the audio modality.
- The PyTorch version of this model is only available in torch 1.10 and higher.
-
-## TvltConfig
-
-[[autodoc]] TvltConfig
-
-## TvltProcessor
-
-[[autodoc]] TvltProcessor
-    - __call__
-
-## TvltFeatureExtractor
-
-[[autodoc]] TvltFeatureExtractor
-    - __call__
-
-## TvltImageProcessor
-
-[[autodoc]] TvltImageProcessor
-    - preprocess
-
-## TvltModel
-
-[[autodoc]] TvltModel
-    - forward
-
-## TvltForPreTraining
-
-[[autodoc]] TvltForPreTraining
-    - forward
-
-## TvltForAudioVisualClassification
-
-[[autodoc]] TvltForAudioVisualClassification
-    - forward
--- a/docs/source/en/model_doc/van.md
+++ b/docs/source/en/model_doc/van.md
@ -1,76 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2022-02-20 and added to Hugging Face Transformers on 2023-06-20.*
-
-# VAN
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.30.0.
-You can do so by running the following command: `pip install -U transformers==4.30.0`.
-
-</Tip>
-
-## Overview
-
-The VAN model was proposed in [Visual Attention Network](https://huggingface.co/papers/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
-
-This paper introduces a new attention layer based on convolution operations able to capture both local and distant relationships. This is done by combining normal and large kernel convolution layers. The latter uses a dilated convolution to capture distant correlations.
-
-The abstract from the paper is the following:
-
-*While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. However, the 2D nature of images brings three challenges for applying self-attention in computer vision. (1) Treating images as 1D sequences neglects their 2D structures. (2) The quadratic complexity is too expensive for high-resolution images. (3) It only captures spatial adaptability but ignores channel adaptability. In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. Code is available at [this https URL](https://github.com/Visual-Attention-Network/VAN-Classification).*
-
-Tips:
-
- VAN does not have an embedding layer, thus the `hidden_states` will have a length equal to the number of stages.
-
-The figure below illustrates the architecture of a Visual Attention Layer. Taken from the [original paper](https://huggingface.co/papers/2202.09741).
-
-<img width="600" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/van_architecture.png"/>
-
-This model was contributed by [Francesco](https://huggingface.co/Francesco). The original code can be found [here](https://github.com/Visual-Attention-Network/VAN-Classification).
-
-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN.
-
-<PipelineTag pipeline="image-classification"/>
-
- [`VanForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-## VanConfig
-
-[[autodoc]] VanConfig
-
-## VanModel
-
-[[autodoc]] VanModel
-    - forward
-
-## VanForImageClassification
-
-[[autodoc]] VanForImageClassification
-    - forward
--- a/docs/source/en/model_doc/vit_hybrid.md
+++ b/docs/source/en/model_doc/vit_hybrid.md
@ -1,112 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-10-22 and added to Hugging Face Transformers on 2023-06-20.*
-
-# Hybrid Vision Transformer (ViT Hybrid)
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-## Overview
-
-The hybrid Vision Transformer (ViT) model was proposed in [An Image is Worth 16x16 Words: Transformers for Image Recognition
-at Scale](https://huggingface.co/papers/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk
-Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob
-Uszkoreit, Neil Houlsby. It's the first paper that successfully trains a Transformer encoder on ImageNet, attaining
-very good results compared to familiar convolutional architectures. ViT hybrid is a slight variant of the [plain Vision Transformer](vit),
-by leveraging a convolutional backbone (specifically, [BiT](bit)) whose features are used as initial "tokens" for the Transformer.
-
-The abstract from the paper is the following:
-
-*While the Transformer architecture has become the de-facto standard for natural language processing tasks, its
-applications to computer vision remain limited. In vision, attention is either applied in conjunction with
-convolutional networks, or used to replace certain components of convolutional networks while keeping their overall
-structure in place. We show that this reliance on CNNs is not necessary and a pure transformer applied directly to
-sequences of image patches can perform very well on image classification tasks. When pre-trained on large amounts of
-data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc.),
-Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring
-substantially fewer computational resources to train.*
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code (written in JAX) can be
-found [here](https://github.com/google-research/vision_transformer).
-
-## Using Scaled Dot Product Attention (SDPA)
-
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
-
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-
-```py
-from transformers import ViTHybridForImageClassification
-model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384", attn_implementation="sdpa", dtype=torch.float16)
-...
-```
-
-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
-
-On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-hybrid-base-bit-384` model, we saw the following speedups during inference.
-
-|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
-|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
-|            1 |                                        29 |                                        18 |                      1.61 |
-|            2 |                                        26 |                                        18 |                      1.44 |
-|            4 |                                        25 |                                        18 |                      1.39 |
-|            8 |                                        34 |                                        24 |                      1.42 |
-
-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid.
-
-<PipelineTag pipeline="image-classification"/>
-
- [`ViTHybridForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-## ViTHybridConfig
-
-[[autodoc]] ViTHybridConfig
-
-## ViTHybridImageProcessor
-
-[[autodoc]] ViTHybridImageProcessor
-    - preprocess
-
-## ViTHybridModel
-
-[[autodoc]] ViTHybridModel
-    - forward
-
-## ViTHybridForImageClassification
-
-[[autodoc]] ViTHybridForImageClassification
-    - forward
--- a/docs/source/en/model_doc/xlm-prophetnet.md
+++ b/docs/source/en/model_doc/xlm-prophetnet.md
@ -1,99 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-*This model was released on 2020-01-13 and added to Hugging Face Transformers on 2023-06-20.*
-
-# XLM-ProphetNet
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-<Tip warning={true}>
-
-This model is in maintenance mode only, we don't accept any new PRs changing its code.
-If you run into any issues running this model, please reinstall the last version that supported this model: v4.40.2.
-You can do so by running the following command: `pip install -U transformers==4.40.2`.
-
-</Tip>
-
-<div class="flex flex-wrap space-x-1">
-<a href="https://huggingface.co/models?filter=xprophetnet">
-<img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
-</a>
-<a href="https://huggingface.co/spaces/docs-demos/xprophetnet-large-wiki100-cased-xglue-ntg">
-<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
-</a>
-</div>
-
-**DISCLAIMER:** If you see something strange, file a [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) and assign
-@patrickvonplaten
-
-## Overview
-
-The XLM-ProphetNet model was proposed in [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training,](https://huggingface.co/papers/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
-Zhang, Ming Zhou on 13 Jan, 2020.
-
-XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
-just the next token. Its architecture is identical to ProhpetNet, but the model was trained on the multi-lingual
-"wiki100" Wikipedia dump. XLM-ProphetNet's model architecture and pretraining objective is same as ProphetNet, but XLM-ProphetNet was pre-trained on the cross-lingual dataset XGLUE.
-
-The abstract from the paper is the following:
-
-*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
-self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
-the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
-n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
-step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
-overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
-dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
-abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
-
-The Authors' code can be found [here](https://github.com/microsoft/ProphetNet).
-
-## Resources
-
- [Causal language modeling task guide](../tasks/language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
-
-## XLMProphetNetConfig
-
-[[autodoc]] XLMProphetNetConfig
-
-## XLMProphetNetTokenizer
-
-[[autodoc]] XLMProphetNetTokenizer
-
-## XLMProphetNetModel
-
-[[autodoc]] XLMProphetNetModel
-
-## XLMProphetNetEncoder
-
-[[autodoc]] XLMProphetNetEncoder
-
-## XLMProphetNetDecoder
-
-[[autodoc]] XLMProphetNetDecoder
-
-## XLMProphetNetForConditionalGeneration
-
-[[autodoc]] XLMProphetNetForConditionalGeneration
-
-## XLMProphetNetForCausalLM
-
-[[autodoc]] XLMProphetNetForCausalLM
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -1,6 +1,6 @@
 # Contributing a new model to Transformers

-Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance.
+Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance. We recommend to go through [general contribution guidelines for new models](./contributing#do-you-want-to-implement-a-new-model) before diving into the details here. 

 One of Transformers' core design feature is the [single model, single file](https://huggingface.co/blog/transformers-design-philosophy) policy. Model components - such as attention layers - are repeated across many files and any independent implementations tend to diverge as fixes and changes are applied to specific parts of the code.

--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@ -149,7 +149,7 @@ The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` m
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.
--- a/docs/source/en/quantization/fp_quant.md
+++ b/docs/source/en/quantization/fp_quant.md
@ -40,7 +40,7 @@ You can choose between MXFP4 and NVFP4 with `FPQuantConfig(forward_dtype="mxfp4"

 A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with  `pip install fp_quant`.

-Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
+Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquantization=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.

 > [!TIP]
 > Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -329,7 +329,7 @@ from torchao.dtypes import Int4XPULayout
 from torchao.quantization.quant_primitives import ZeroPointDomain


-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT, int4_packing_format="plain_int32")
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -342,7 +342,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)

 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -395,7 +395,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
 from torchao.quantization import Int4WeightOnlyConfig
 from torchao.dtypes import Int4CPULayout

-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout())
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout(), int4_packing_format="opaque")
 quantization_config = TorchAoConfig(quant_type=quant_config)

 # Load and quantize the model
@ -422,7 +422,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))

 #### 1. Skip quantization for certain layers

-With `ModuleFqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
+With `FqnToConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.

 ```py
 import torch
@ -430,11 +430,11 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "meta-llama/Llama-3.1-8B-Instruct"

-from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig
+from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig
 config = Int4WeightOnlyConfig(group_size=128)

 # set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = ModuleFqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
+quant_config = FqnToConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16, quantization_config=quantization_config)
 # lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
@ -459,7 +459,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig

 model_id = "facebook/opt-125m"

-from torchao.quantization import Int4WeightOnlyConfig, ModuleFqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
+from torchao.quantization import Int4WeightOnlyConfig, FqnToConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType

 weight_dtype = torch.int8
 granularity = PerAxis(0)
@ -470,7 +470,7 @@ embedding_config = IntxWeightOnlyConfig(
    mapping_type=mapping_type,
 )
 linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = ModuleFqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
+quant_config = FqnToConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
 # set `include_embedding` to True in order to include embedding in quantization
 # when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
 quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
@ -521,7 +521,7 @@ from torchao.quantization import (
    IntxWeightOnlyConfig,
    PerRow,
    PerAxis,
-    ModuleFqnToConfig,
+    FqnToConfig,
    Float8Tensor,
    Int4TilePackedTo4dTensor,
    IntxUnpackedToInt8Tensor,
@ -550,7 +550,7 @@ qconfig_dict = {

    "_default": intxwo,
 }
-quant_config = ModuleFqnToConfig(qconfig_dict)
+quant_config = FqnToConfig(qconfig_dict)
 quantization_config = TorchAoConfig(quant_type=quant_config)
 quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -187,7 +187,7 @@ from torch import nn
 from transformers import Trainer

 class CustomTrainer(Trainer):
-    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
+    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False, num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
--- a/docs/source/en/transformers_as_backend.md
+++ b/docs/source/en/transformers_as_backend.md
@ -14,9 +14,9 @@ rendered properly in your Markdown viewer.

 -->

-# Inference server backends
+# Transformers as modeling backend

-Transformers' models are compatible with different inference servers like vLLM and SGLang. Instead of implementing a model for each inference server, you only need one model, which can be plugged into any inference server. It simplifies maintenance and makes it easy for users to use different inference servers for different use cases.
+Transformers' models are compatible with different inference servers like vLLM and SGLang. Instead of implementing a new model architecture from scratch for each inference server, you only need a model definition in `transformers`, which can be plugged into any inference server. It simplifies maintenance and makes it easy for users to use different inference servers for different use cases.

 With Transformers as a backend, you can also serve any model - including custom and Hub-hosted models - without waiting for native support.

@ -157,57 +157,13 @@ class MyConfig(PreTrainedConfig):

 ### Multimodal models

-For multimodal models, you need to include a few more changes on top of the general recommendations. These rules ensure that your model integrates properly with multimodal data.
+For multimodal models, you need to include a few more changes on top of the general recommendations outlined in ["contribuiting a model"](./contributing#vision-language-model-contribution-checklist). These rules ensure that your model integrates properly and enables processing multimodal data.

-1. A multimodal model requires a base `MyMultiModalModel` class to handle multimodal fusion without a language modeling head and a separate generative class that adds a head.
+1. A multimodal model's processing class must have the `self.image_token` and `self.image_token_ids` attributes. These are placeholder tokens used to indicate image positions in the input. This placeholder token is the same token used in the input prompt to denote images and used in model code to scatter image features.

-    The base model needs to implement the `get_image_features()` method to accept image pixel values and return encoded outputs. These are later merged with the language embeddings and don't require any postprocessing. The shape of the returned features must match the number of input images. If a vision encoder returns variable-length outputs (patch-based), return a list of 2D tensors of size `(image_seq_len, image_dim)` for each image.
+2. The processing class needs `self._get_num_multimodal_tokens` method to compute the number of placeholder tokens needed for multimodal inputs with given sizes and to return a [`MultiModalData`] object. The placeholders between `<image>` tokens such as row or column tokens don't count as image placeholders. Only tokens that are actually replaced by image features later in modeling should be counted!

-Expand the code below for an example.
-
-<details>
-<summary>modeling_my_multimodal_model.py</summary>
-
-```python
-from transformers.generation import GenerationMixin
-
-class MyMultimodalModel(MyMultimodalPreTrainedModel):
-    def __init__(self, config):
-        super().__init__(config)
-        self.language_model = AutoModel.from_config(config.text_config)
-        self.vision_tower = AutoModel.from_config(config.vision_config)
-        self.multimodal_projection = nn.Linear(vision_dim, text_dim)
-    
-    def get_image_features(self, pixel_values):
-        return self.vision_tower(pixel_values).last_hidden_states
-    
-    def forward(self, input_ids, pixel_values, **kwargs):
-        # process your inputs
-        return MyModelOutputWithPast(
-            last_hidden_state=last_hidden_state,
-            image_hidden_states=image_features,
-            [...]
-        )
-
-class MyMultimodalModelForConditionalGeneration(MyMultimodalPreTrainedModel, GenerationMixin):
-    def __init__(self, config):
-        super().__init__(config)
-        self.model = MyMultimodalModel(config)
-        self.lm_head = nn.Linear(hidden_dim, vocab_size)
-```
-
-</details>
-
-2. A multimodal model config must be nested with the following fields.
-    * text_config: decoder language model config
-    * vision_config: vision encoder config
-    * image_token_id: ID of the image placeholder token used in the input to indicate image position
-
-3. A multimodal model's processing class must have the `self.image_token` and `self.image_token_ids` attributes. These are placeholder tokens used to indicate image positions in the input. The placeholder token is the same token used in the input prompt and to mask scatter image features.
-
-   The processing class also needs `self._get_num_multimodal_tokens` method to compute the number of placeholder tokens needed for multimodal inputs with given sizes and to return a [`MultiModalData`] object. The placeholder for row and column tokens don't count as image placeholders. Only the tokens that are actually replaced by image features are computed.
-
-Finally, when `return_mm_token_type_ids=True`, the class has to return `mm_token_type_ids` to indicate whether each position is a text token (`0`) or image placeholder token (`1`). Each image's token type IDs must be contiguous with no breaks between consecutive ones.
+3. The processor needs to check the value of `return_mm_token_type_ids` and return `mm_token_type_ids` to indicate whether each position is a text token (`0`), image placeholder token (`1`) or video placeholder token (`2`). Each multimodal token type ID sequence must be contiguous without breaks between consecutive tokens, therefore special tokens for begin/end/row/column must be treated as placeholders.

 Expand the code below for an example.

@ -246,5 +202,5 @@ class MyMultimodalProcessor(ProcessorMixin):

 ## Resources

-* Read the [Transformers backend integration in vLLM](https://blog.vllm.ai/2025/04/11/transformers-backend.html) blog post for more details about the Transformers backend in vLLM.
-* Read the [Transformers backend integration in SGLang](https://huggingface.co/blog/transformers-backend-sglang) blog post for more details about the Transformers backend in SGLang.
+* Read the [Transformers modeling backend integration in vLLM](https://blog.vllm.ai/2025/04/11/transformers-backend.html) blog post for more details about the Transformers modeling backend in vLLM.
+* Read the [Transformers modeling  backend integration in SGLang](https://huggingface.co/blog/transformers-backend-sglang) blog post for more details about the Transformers modeling backend in SGLang.
--- a/docs/source/it/migration.md
+++ b/docs/source/it/migration.md
@ -170,7 +170,7 @@ Per quanto riguarda la classe `TrainingArguments`:
 - L'argomento `evaluate_during_training` di `TrainingArguments` è deprecato a favore di `eval_strategy`.

 Per quanto riguarda il modello Transfo-XL:
- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_words_embeddings`.
+- L'attributo di configurazione `tie_weight` di Transfo-XL diventa `tie_word_embeddings`.
 - Il metodo di modellazione `reset_length` di Transfo-XL diventa `reset_memory_length`.

 Per quanto riguarda le pipeline:
--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@ -406,16 +406,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```

 特定のモジュールに特別な初期化が必要な場合、カスタムスキームをさらに持つことができます。たとえば、
@ -431,9 +431,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```

 `_is_hf_initialized`フラグは、サブモジュールを一度だけ初期化することを確実にするために内部で使用されます。
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@ -348,16 +348,16 @@ model = BrandNewBertModel(BrandNewBertConfig())
 def _init_weights(self, module):
    """Initialize the weights"""
    if isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
    elif isinstance(module, nn.Embedding):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.padding_idx is not None:
            module.weight.data[module.padding_idx].zero_()
    elif isinstance(module, nn.LayerNorm):
-        module.bias.data.zero_()
-        module.weight.data.fill_(1.0)
+        module.bias.zero_()
+        module.weight.fill_(1.0)
 ```

 몇 가지 모듈에 대해 특별한 초기화가 필요한 경우 사용자 정의 방식을 사용할 수도 있습니다. 예를 들어, `Wav2Vec2ForPreTraining`에서 마지막 두 개의 선형 레이어는 일반적인 PyTorch `nn.Linear`의 초기화를 가져야 하지만, 다른 모든 레이어는 위와 같은 초기화를 사용해야 합니다. 이는 다음과 같이 코드화됩니다:
@ -371,9 +371,9 @@ def _init_weights(self, module):
        module.project_hid._is_hf_initialized = True
        module.project_q._is_hf_initialized = True
    elif isinstance(module, nn.Linear):
-        module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        module.weight.normal_(mean=0.0, std=self.config.initializer_range)
        if module.bias is not None:
-            module.bias.data.zero_()
+            module.bias.zero_()
 ```

 `_is_hf_initialized` 플래그는 서브모듈을 한 번만 초기화하도록 내부적으로 사용됩니다. `module.project_q` 및 `module.project_hid`에 대해 `True`로 설정함으로써, 우리가 수행한 사용자 정의 초기화가 이후에 덮어쓰이지 않도록 합니다. 즉, `_init_weights` 함수가 이들에게 적용되지 않습니다.
--- a/docs/source/ko/perf_infer_gpu_multi.md
+++ b/docs/source/ko/perf_infer_gpu_multi.md
@ -152,7 +152,7 @@ class ParallelInterface(MutableMapping):
 ```python
 class Llama4TextExperts(nn.Module):
    ...
-    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+    self.gate_up_proj = nn.Parameter(torch.zeros(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

 배치 행렬 곱셈을 `forward` 패스에서 사용하여 `gate_up_proj` 모듈의 출력을 계산할 수 있습니다.
--- a/examples/legacy/README.md
+++ b/examples/legacy/README.md
@ -1,21 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# Legacy examples
-
-This folder contains examples which are not actively maintained (mostly contributed by the community).
-
-Using these examples together with a recent version of the library usually requires to make small (sometimes big) adaptations to get the scripts working.
--- a/examples/legacy/benchmarking/README.md
+++ b/examples/legacy/benchmarking/README.md
@ -1,26 +0,0 @@
-<!---
-Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-->
-
-# 🤗 Benchmark results
-
-Here, you can find a list of the different benchmark results created by the community.
-
-If you would like to list benchmark results on your favorite models of the [model hub](https://huggingface.co/models) here, please open a Pull Request and add it below.
-
-| Benchmark description | Results | Environment info |      Author      |
-|:----------|:-------------|:-------------|------:|
-| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[memory](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_memory.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
-| PyTorch Benchmark on inference for `google-bert/bert-base-cased` |[time](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/inference_time.csv) | [env](https://github.com/patrickvonplaten/files_to_link_to/blob/master/bert_benchmark/env.csv) | [Patrick von Platen](https://github.com/patrickvonplaten) | 
--- a/examples/legacy/benchmarking/plot_csv_file.py
+++ b/examples/legacy/benchmarking/plot_csv_file.py
@ -1,178 +0,0 @@
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import csv
-from collections import defaultdict
-from dataclasses import dataclass, field
-from typing import Optional
-
-import matplotlib.pyplot as plt
-import numpy as np
-from matplotlib.ticker import ScalarFormatter
-
-from transformers import HfArgumentParser
-
-
-def list_field(default=None, metadata=None):
-    return field(default_factory=lambda: default, metadata=metadata)
-
-
-@dataclass
-class PlotArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
-    """
-
-    csv_file: str = field(
-        metadata={"help": "The csv file to plot."},
-    )
-    plot_along_batch: bool = field(
-        default=False,
-        metadata={"help": "Whether to plot along batch size or sequence length. Defaults to sequence length."},
-    )
-    is_time: bool = field(
-        default=False,
-        metadata={"help": "Whether the csv file has time results or memory results. Defaults to memory results."},
-    )
-    no_log_scale: bool = field(
-        default=False,
-        metadata={"help": "Disable logarithmic scale when plotting"},
-    )
-    is_train: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether the csv file has training results or inference results. Defaults to inference results."
-        },
-    )
-    figure_png_file: Optional[str] = field(
-        default=None,
-        metadata={"help": "Filename under which the plot will be saved. If unused no plot is saved."},
-    )
-    short_model_names: Optional[list[str]] = list_field(
-        default=None, metadata={"help": "List of model names that are used instead of the ones in the csv file."}
-    )
-
-
-def can_convert_to_int(string):
-    try:
-        int(string)
-        return True
-    except ValueError:
-        return False
-
-
-def can_convert_to_float(string):
-    try:
-        float(string)
-        return True
-    except ValueError:
-        return False
-
-
-class Plot:
-    def __init__(self, args):
-        self.args = args
-        self.result_dict = defaultdict(lambda: {"bsz": [], "seq_len": [], "result": {}})
-
-        with open(self.args.csv_file, newline="") as csv_file:
-            reader = csv.DictReader(csv_file)
-            for row in reader:
-                model_name = row["model"]
-                self.result_dict[model_name]["bsz"].append(int(row["batch_size"]))
-                self.result_dict[model_name]["seq_len"].append(int(row["sequence_length"]))
-                if can_convert_to_int(row["result"]):
-                    # value is not None
-                    self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
-                        int(row["result"])
-                    )
-                elif can_convert_to_float(row["result"]):
-                    # value is not None
-                    self.result_dict[model_name]["result"][(int(row["batch_size"]), int(row["sequence_length"]))] = (
-                        float(row["result"])
-                    )
-
-    def plot(self):
-        fig, ax = plt.subplots()
-        title_str = "Time usage" if self.args.is_time else "Memory usage"
-        title_str = title_str + " for training" if self.args.is_train else title_str + " for inference"
-
-        if not self.args.no_log_scale:
-            # set logarithm scales
-            ax.set_xscale("log")
-            ax.set_yscale("log")
-
-        for axis in [ax.xaxis, ax.yaxis]:
-            axis.set_major_formatter(ScalarFormatter())
-
-        for model_name_idx, model_name in enumerate(self.result_dict.keys()):
-            batch_sizes = sorted(set(self.result_dict[model_name]["bsz"]))
-            sequence_lengths = sorted(set(self.result_dict[model_name]["seq_len"]))
-            results = self.result_dict[model_name]["result"]
-
-            (x_axis_array, inner_loop_array) = (
-                (batch_sizes, sequence_lengths) if self.args.plot_along_batch else (sequence_lengths, batch_sizes)
-            )
-
-            label_model_name = (
-                model_name if self.args.short_model_names is None else self.args.short_model_names[model_name_idx]
-            )
-
-            for inner_loop_value in inner_loop_array:
-                if self.args.plot_along_batch:
-                    y_axis_array = np.asarray(
-                        [results[(x, inner_loop_value)] for x in x_axis_array if (x, inner_loop_value) in results],
-                        dtype=int,
-                    )
-                else:
-                    y_axis_array = np.asarray(
-                        [results[(inner_loop_value, x)] for x in x_axis_array if (inner_loop_value, x) in results],
-                        dtype=np.float32,
-                    )
-
-                (x_axis_label, inner_loop_label) = (
-                    ("batch_size", "len") if self.args.plot_along_batch else ("in #tokens", "bsz")
-                )
-
-                x_axis_array = np.asarray(x_axis_array, int)[: len(y_axis_array)]
-                plt.scatter(
-                    x_axis_array, y_axis_array, label=f"{label_model_name} - {inner_loop_label}: {inner_loop_value}"
-                )
-                plt.plot(x_axis_array, y_axis_array, "--")
-
-            title_str += f" {label_model_name} vs."
-
-        title_str = title_str[:-4]
-        y_axis_label = "Time in s" if self.args.is_time else "Memory in MB"
-
-        # plot
-        plt.title(title_str)
-        plt.xlabel(x_axis_label)
-        plt.ylabel(y_axis_label)
-        plt.legend()
-
-        if self.args.figure_png_file is not None:
-            plt.savefig(self.args.figure_png_file)
-        else:
-            plt.show()
-
-
-def main():
-    parser = HfArgumentParser(PlotArguments)
-    plot_args = parser.parse_args_into_dataclasses()[0]
-    plot = Plot(args=plot_args)
-    plot.plot()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/benchmarking/requirements.txt
+++ b/examples/legacy/benchmarking/requirements.txt
@ -1 +0,0 @@
-torch >= 1.3
--- a/examples/legacy/benchmarking/run_benchmark.py
+++ b/examples/legacy/benchmarking/run_benchmark.py
@ -1,47 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2020 The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Benchmarking the library on inference and training"""
-
-from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
-
-
-def main():
-    parser = HfArgumentParser(PyTorchBenchmarkArguments)
-    try:
-        benchmark_args = parser.parse_args_into_dataclasses()[0]
-    except ValueError as e:
-        arg_error_msg = "Arg --no_{0} is no longer used, please use --no-{0} instead."
-        begin_error_msg = " ".join(str(e).split(" ")[:-1])
-        full_error_msg = ""
-        depreciated_args = eval(str(e).split(" ")[-1])
-        wrong_args = []
-        for arg in depreciated_args:
-            # arg[2:] removes '--'
-            if arg[2:] in PyTorchBenchmarkArguments.deprecated_args:
-                # arg[5:] removes '--no_'
-                full_error_msg += arg_error_msg.format(arg[5:])
-            else:
-                wrong_args.append(arg)
-        if len(wrong_args) > 0:
-            full_error_msg = full_error_msg + begin_error_msg + str(wrong_args)
-        raise ValueError(full_error_msg)
-
-    benchmark = PyTorchBenchmark(args=benchmark_args)
-    benchmark.run()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/examples/legacy/multiple_choice/run_multiple_choice.py
@ -1,232 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
-
-import logging
-import os
-from dataclasses import dataclass, field
-from typing import Optional
-
-import numpy as np
-from utils_multiple_choice import MultipleChoiceDataset, Split, processors
-
-import transformers
-from transformers import (
-    AutoConfig,
-    AutoModelForMultipleChoice,
-    AutoTokenizer,
-    DataCollatorWithPadding,
-    EvalPrediction,
-    HfArgumentParser,
-    Trainer,
-    TrainingArguments,
-    set_seed,
-)
-from transformers.trainer_utils import is_main_process
-
-
-logger = logging.getLogger(__name__)
-
-
-def simple_accuracy(preds, labels):
-    return (preds == labels).mean()
-
-
-@dataclass
-class ModelArguments:
-    """
-    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
-    """
-
-    model_name_or_path: str = field(
-        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
-    )
-    config_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
-    )
-    cache_dir: Optional[str] = field(
-        default=None,
-        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
-    )
-
-
-@dataclass
-class DataTrainingArguments:
-    """
-    Arguments pertaining to what data we are going to input our model for training and eval.
-    """
-
-    task_name: str = field(metadata={"help": "The name of the task to train on: " + ", ".join(processors.keys())})
-    data_dir: str = field(metadata={"help": "Should contain the data files for the task."})
-    max_seq_length: int = field(
-        default=128,
-        metadata={
-            "help": (
-                "The maximum total input sequence length after tokenization. Sequences longer "
-                "than this will be truncated, sequences shorter will be padded."
-            )
-        },
-    )
-    overwrite_cache: bool = field(
-        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
-    )
-
-
-def main():
-    # See all possible arguments in src/transformers/training_args.py
-    # or by passing the --help flag to this script.
-    # We now keep distinct sets of args, for a cleaner separation of concerns.
-
-    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
-    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
-
-    # Setup logging
-    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-        datefmt="%m/%d/%Y %H:%M:%S",
-        level=logging.INFO if training_args.local_process_index in [-1, 0] else logging.WARN,
-    )
-    logger.warning(
-        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
-        training_args.local_process_index,
-        training_args.device,
-        training_args.n_gpu,
-        bool(training_args.parallel_mode.value == "distributed"),
-        training_args.fp16,
-    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if is_main_process(training_args.local_process_index):
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
-    logger.info("Training/evaluation parameters %s", training_args)
-
-    # Set seed
-    set_seed(training_args.seed)
-
-    try:
-        processor = processors[data_args.task_name]()
-        label_list = processor.get_labels()
-        num_labels = len(label_list)
-    except KeyError:
-        raise ValueError("Task not found: %s" % (data_args.task_name))
-
-    # Load pretrained model and tokenizer
-    #
-    # Distributed training:
-    # The .from_pretrained methods guarantee that only one local process can concurrently
-    # download model & vocab.
-
-    config = AutoConfig.from_pretrained(
-        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
-        num_labels=num_labels,
-        finetuning_task=data_args.task_name,
-        cache_dir=model_args.cache_dir,
-    )
-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
-        cache_dir=model_args.cache_dir,
-    )
-    model = AutoModelForMultipleChoice.from_pretrained(
-        model_args.model_name_or_path,
-        from_tf=bool(".ckpt" in model_args.model_name_or_path),
-        config=config,
-        cache_dir=model_args.cache_dir,
-    )
-
-    # Get datasets
-    train_dataset = (
-        MultipleChoiceDataset(
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            task=data_args.task_name,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.train,
-        )
-        if training_args.do_train
-        else None
-    )
-    eval_dataset = (
-        MultipleChoiceDataset(
-            data_dir=data_args.data_dir,
-            tokenizer=tokenizer,
-            task=data_args.task_name,
-            max_seq_length=data_args.max_seq_length,
-            overwrite_cache=data_args.overwrite_cache,
-            mode=Split.dev,
-        )
-        if training_args.do_eval
-        else None
-    )
-
-    def compute_metrics(p: EvalPrediction) -> dict:
-        preds = np.argmax(p.predictions, axis=1)
-        return {"acc": simple_accuracy(preds, p.label_ids)}
-
-    # Data collator
-    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8) if training_args.fp16 else None
-
-    # Initialize our Trainer
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        compute_metrics=compute_metrics,
-        data_collator=data_collator,
-    )
-
-    # Training
-    if training_args.do_train:
-        trainer.train(
-            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
-        )
-        trainer.save_model()
-        # For convenience, we also re-save the tokenizer to the same directory,
-        # so that you can share your model easily on huggingface.co/models =)
-        if trainer.is_world_master():
-            tokenizer.save_pretrained(training_args.output_dir)
-
-    # Evaluation
-    results = {}
-    if training_args.do_eval:
-        logger.info("*** Evaluate ***")
-
-        result = trainer.evaluate()
-
-        output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
-        if trainer.is_world_master():
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key, value in result.items():
-                    logger.info("  %s = %s", key, value)
-                    writer.write("{} = {}\n".format(key, value))
-
-                results.update(result)
-
-    return results
-
-
-def _mp_fn(index):
-    # For xla_spawn (TPUs)
-    main()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@ -1,483 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension"""
-
-import csv
-import glob
-import json
-import logging
-import os
-from dataclasses import dataclass
-from enum import Enum
-from typing import Optional
-
-import tqdm
-from filelock import FileLock
-
-from transformers import PreTrainedTokenizer, is_torch_available
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass(frozen=True)
-class InputExample:
-    """
-    A single training/test example for multiple choice
-
-    Args:
-        example_id: Unique id for the example.
-        question: string. The untokenized text of the second sequence (question).
-        contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
-        endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
-        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
-    """
-
-    example_id: str
-    question: str
-    contexts: list[str]
-    endings: list[str]
-    label: Optional[str]
-
-
-@dataclass(frozen=True)
-class InputFeatures:
-    """
-    A single set of features of data.
-    Property names are the same names as the corresponding inputs to a model.
-    """
-
-    example_id: str
-    input_ids: list[list[int]]
-    attention_mask: Optional[list[list[int]]]
-    token_type_ids: Optional[list[list[int]]]
-    label: Optional[int]
-
-
-class Split(Enum):
-    train = "train"
-    dev = "dev"
-    test = "test"
-
-
-if is_torch_available():
-    import torch
-    from torch.utils.data import Dataset
-
-    class MultipleChoiceDataset(Dataset):
-        features: list[InputFeatures]
-
-        def __init__(
-            self,
-            data_dir: str,
-            tokenizer: PreTrainedTokenizer,
-            task: str,
-            max_seq_length: Optional[int] = None,
-            overwrite_cache=False,
-            mode: Split = Split.train,
-        ):
-            processor = processors[task]()
-
-            cached_features_file = os.path.join(
-                data_dir,
-                "cached_{}_{}_{}_{}".format(
-                    mode.value,
-                    tokenizer.__class__.__name__,
-                    str(max_seq_length),
-                    task,
-                ),
-            )
-
-            # Make sure only the first process in distributed training processes the dataset,
-            # and the others will use the cache.
-            lock_path = cached_features_file + ".lock"
-            with FileLock(lock_path):
-                if os.path.exists(cached_features_file) and not overwrite_cache:
-                    logger.info(f"Loading features from cached file {cached_features_file}")
-                    self.features = torch.load(cached_features_file, weights_only=True)
-                else:
-                    logger.info(f"Creating features from dataset file at {data_dir}")
-                    label_list = processor.get_labels()
-                    if mode == Split.dev:
-                        examples = processor.get_dev_examples(data_dir)
-                    elif mode == Split.test:
-                        examples = processor.get_test_examples(data_dir)
-                    else:
-                        examples = processor.get_train_examples(data_dir)
-                    logger.info("Training examples: %s", len(examples))
-                    self.features = convert_examples_to_features(
-                        examples,
-                        label_list,
-                        max_seq_length,
-                        tokenizer,
-                    )
-                    logger.info("Saving features into cached file %s", cached_features_file)
-                    torch.save(self.features, cached_features_file)
-
-        def __len__(self):
-            return len(self.features)
-
-        def __getitem__(self, i) -> InputFeatures:
-            return self.features[i]
-
-
-class DataProcessor:
-    """Base class for data converters for multiple choice data sets."""
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_test_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the test set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-
-class RaceProcessor(DataProcessor):
-    """Processor for the RACE data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        high = os.path.join(data_dir, "train/high")
-        middle = os.path.join(data_dir, "train/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        high = os.path.join(data_dir, "dev/high")
-        middle = os.path.join(data_dir, "dev/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} test")
-        high = os.path.join(data_dir, "test/high")
-        middle = os.path.join(data_dir, "test/middle")
-        high = self._read_txt(high)
-        middle = self._read_txt(middle)
-        return self._create_examples(high + middle, "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_txt(self, input_dir):
-        lines = []
-        files = glob.glob(input_dir + "/*txt")
-        for file in tqdm.tqdm(files, desc="read files"):
-            with open(file, encoding="utf-8") as fin:
-                data_raw = json.load(fin)
-                data_raw["race_id"] = file
-                lines.append(data_raw)
-        return lines
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for _, data_raw in enumerate(lines):
-            race_id = "{}-{}".format(set_type, data_raw["race_id"])
-            article = data_raw["article"]
-            for i in range(len(data_raw["answers"])):
-                truth = str(ord(data_raw["answers"][i]) - ord("A"))
-                question = data_raw["questions"][i]
-                options = data_raw["options"][i]
-
-                examples.append(
-                    InputExample(
-                        example_id=race_id,
-                        question=question,
-                        contexts=[article, article, article, article],  # this is not efficient but convenient
-                        endings=[options[0], options[1], options[2], options[3]],
-                        label=truth,
-                    )
-                )
-        return examples
-
-
-class SynonymProcessor(DataProcessor):
-    """Processor for the Synonym data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctrain.csv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mchp.csv")), "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "mctest.csv")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3", "4"]
-
-    def _read_csv(self, input_file):
-        with open(input_file, encoding="utf-8") as f:
-            return list(csv.reader(f))
-
-    def _create_examples(self, lines: list[list[str]], type: str):
-        """Creates examples for the training and dev sets."""
-
-        examples = [
-            InputExample(
-                example_id=line[0],
-                question="",  # in the swag dataset, the
-                # common beginning of each
-                # choice is stored in "sent2".
-                contexts=[line[1], line[1], line[1], line[1], line[1]],
-                endings=[line[2], line[3], line[4], line[5], line[6]],
-                label=line[7],
-            )
-            for line in lines  # we skip the line with the column names
-        ]
-
-        return examples
-
-
-class SwagProcessor(DataProcessor):
-    """Processor for the SWAG data set."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
-
-    def get_test_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        raise ValueError(
-            "For swag testing, the input file does not contain a label column. It can not be tested in current code "
-            "setting!"
-        )
-        return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_csv(self, input_file):
-        with open(input_file, encoding="utf-8") as f:
-            return list(csv.reader(f))
-
-    def _create_examples(self, lines: list[list[str]], type: str):
-        """Creates examples for the training and dev sets."""
-        if type == "train" and lines[0][-1] != "label":
-            raise ValueError("For training, the input file must contain a label column.")
-
-        examples = [
-            InputExample(
-                example_id=line[2],
-                question=line[5],  # in the swag dataset, the
-                # common beginning of each
-                # choice is stored in "sent2".
-                contexts=[line[4], line[4], line[4], line[4]],
-                endings=[line[7], line[8], line[9], line[10]],
-                label=line[11],
-            )
-            for line in lines[1:]  # we skip the line with the column names
-        ]
-
-        return examples
-
-
-class ArcProcessor(DataProcessor):
-    """Processor for the ARC data set (request from allennlp)."""
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} train")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        logger.info(f"LOOKING AT {data_dir} dev")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
-
-    def get_test_examples(self, data_dir):
-        logger.info(f"LOOKING AT {data_dir} test")
-        return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
-
-    def get_labels(self):
-        """See base class."""
-        return ["0", "1", "2", "3"]
-
-    def _read_json(self, input_file):
-        with open(input_file, encoding="utf-8") as fin:
-            lines = fin.readlines()
-            return lines
-
-    def _create_examples(self, lines, type):
-        """Creates examples for the training and dev sets."""
-
-        # There are two types of labels. They should be normalized
-        def normalize(truth):
-            if truth in "ABCD":
-                return ord(truth) - ord("A")
-            elif truth in "1234":
-                return int(truth) - 1
-            else:
-                logger.info("truth ERROR! %s", str(truth))
-                return None
-
-        examples = []
-        three_choice = 0
-        four_choice = 0
-        five_choice = 0
-        other_choices = 0
-        # we deleted example which has more than or less than four choices
-        for line in tqdm.tqdm(lines, desc="read arc data"):
-            data_raw = json.loads(line.strip("\n"))
-            if len(data_raw["question"]["choices"]) == 3:
-                three_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) == 5:
-                five_choice += 1
-                continue
-            elif len(data_raw["question"]["choices"]) != 4:
-                other_choices += 1
-                continue
-            four_choice += 1
-            truth = str(normalize(data_raw["answerKey"]))
-            assert truth != "None"
-            question_choices = data_raw["question"]
-            question = question_choices["stem"]
-            id = data_raw["id"]
-            options = question_choices["choices"]
-            if len(options) == 4:
-                examples.append(
-                    InputExample(
-                        example_id=id,
-                        question=question,
-                        contexts=[
-                            options[0]["para"].replace("_", ""),
-                            options[1]["para"].replace("_", ""),
-                            options[2]["para"].replace("_", ""),
-                            options[3]["para"].replace("_", ""),
-                        ],
-                        endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
-                        label=truth,
-                    )
-                )
-
-        if type == "train":
-            assert len(examples) > 1
-            assert examples[0].label is not None
-        logger.info("len examples: %s}", str(len(examples)))
-        logger.info("Three choices: %s", str(three_choice))
-        logger.info("Five choices: %s", str(five_choice))
-        logger.info("Other choices: %s", str(other_choices))
-        logger.info("four choices: %s", str(four_choice))
-
-        return examples
-
-
-def convert_examples_to_features(
-    examples: list[InputExample],
-    label_list: list[str],
-    max_length: int,
-    tokenizer: PreTrainedTokenizer,
-) -> list[InputFeatures]:
-    """
-    Loads a data file into a list of `InputFeatures`
-    """
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for ex_index, example in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d of %d" % (ex_index, len(examples)))
-        choices_inputs = []
-        for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
-            text_a = context
-            if example.question.find("_") != -1:
-                # this is for cloze question
-                text_b = example.question.replace("_", ending)
-            else:
-                text_b = example.question + " " + ending
-
-            inputs = tokenizer(
-                text_a,
-                text_b,
-                add_special_tokens=True,
-                max_length=max_length,
-                padding="max_length",
-                truncation=True,
-                return_overflowing_tokens=True,
-            )
-            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
-                logger.info(
-                    "Attention! you are cropping tokens (swag task is ok). "
-                    "If you are training ARC and RACE and you are popping question + options, "
-                    "you need to try to use a bigger max seq length!"
-                )
-
-            choices_inputs.append(inputs)
-
-        label = label_map[example.label]
-
-        input_ids = [x["input_ids"] for x in choices_inputs]
-        attention_mask = (
-            [x["attention_mask"] for x in choices_inputs] if "attention_mask" in choices_inputs[0] else None
-        )
-        token_type_ids = (
-            [x["token_type_ids"] for x in choices_inputs] if "token_type_ids" in choices_inputs[0] else None
-        )
-
-        features.append(
-            InputFeatures(
-                example_id=example.example_id,
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                token_type_ids=token_type_ids,
-                label=label,
-            )
-        )
-
-    for f in features[:2]:
-        logger.info("*** Example ***")
-        logger.info("feature: %s" % f)
-
-    return features
-
-
-processors = {"race": RaceProcessor, "swag": SwagProcessor, "arc": ArcProcessor, "syn": SynonymProcessor}
-MULTIPLE_CHOICE_TASKS_NUM_LABELS = {"race", 4, "swag", 4, "arc", 4, "syn", 5}
--- a/examples/legacy/pytorch-lightning/lightning_base.py
+++ b/examples/legacy/pytorch-lightning/lightning_base.py
@ -1,397 +0,0 @@
-import argparse
-import logging
-import os
-from pathlib import Path
-from typing import Any
-
-import pytorch_lightning as pl
-from pytorch_lightning.utilities import rank_zero_info
-
-from transformers import (
-    AutoConfig,
-    AutoModel,
-    AutoModelForPreTraining,
-    AutoModelForQuestionAnswering,
-    AutoModelForSeq2SeqLM,
-    AutoModelForSequenceClassification,
-    AutoModelForTokenClassification,
-    AutoModelWithLMHead,
-    AutoTokenizer,
-    PreTrainedConfig,
-    PreTrainedTokenizer,
-    is_torch_available,
-)
-from transformers.optimization import (
-    Adafactor,
-    get_cosine_schedule_with_warmup,
-    get_cosine_with_hard_restarts_schedule_with_warmup,
-    get_linear_schedule_with_warmup,
-    get_polynomial_decay_schedule_with_warmup,
-)
-from transformers.utils.versions import require_version
-
-
-if is_torch_available():
-    import torch
-
-
-logger = logging.getLogger(__name__)
-
-require_version("pytorch_lightning>=1.0.4")
-
-MODEL_MODES = {
-    "base": AutoModel,
-    "sequence-classification": AutoModelForSequenceClassification,
-    "question-answering": AutoModelForQuestionAnswering,
-    "pretraining": AutoModelForPreTraining,
-    "token-classification": AutoModelForTokenClassification,
-    "language-modeling": AutoModelWithLMHead,
-    "summarization": AutoModelForSeq2SeqLM,
-    "translation": AutoModelForSeq2SeqLM,
-}
-
-
-# update this and the import above to support new schedulers from transformers.optimization
-arg_to_scheduler = {
-    "linear": get_linear_schedule_with_warmup,
-    "cosine": get_cosine_schedule_with_warmup,
-    "cosine_w_restarts": get_cosine_with_hard_restarts_schedule_with_warmup,
-    "polynomial": get_polynomial_decay_schedule_with_warmup,
-    # '': get_constant_schedule,             # not supported for now
-    # '': get_constant_schedule_with_warmup, # not supported for now
-}
-arg_to_scheduler_choices = sorted(arg_to_scheduler.keys())
-arg_to_scheduler_metavar = "{" + ", ".join(arg_to_scheduler_choices) + "}"
-
-
-class BaseTransformer(pl.LightningModule):
-    def __init__(
-        self,
-        hparams: argparse.Namespace,
-        num_labels=None,
-        mode="base",
-        config=None,
-        tokenizer=None,
-        model=None,
-        **config_kwargs,
-    ):
-        """Initialize a model, tokenizer and config."""
-        super().__init__()
-        # TODO: move to self.save_hyperparameters()
-        # self.save_hyperparameters()
-        # can also expand arguments into trainer signature for easier reading
-
-        self.save_hyperparameters(hparams)
-        self.step_count = 0
-        self.output_dir = Path(self.hparams.output_dir)
-        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
-        if config is None:
-            self.config = AutoConfig.from_pretrained(
-                self.hparams.config_name if self.hparams.config_name else self.hparams.model_name_or_path,
-                **({"num_labels": num_labels} if num_labels is not None else {}),
-                cache_dir=cache_dir,
-                **config_kwargs,
-            )
-        else:
-            self.config: PreTrainedConfig = config
-
-        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop", "dropout", "attention_dropout")
-        for p in extra_model_params:
-            if getattr(self.hparams, p, None):
-                assert hasattr(self.config, p), f"model config doesn't have a `{p}` attribute"
-                setattr(self.config, p, getattr(self.hparams, p))
-
-        if tokenizer is None:
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                self.hparams.tokenizer_name if self.hparams.tokenizer_name else self.hparams.model_name_or_path,
-                cache_dir=cache_dir,
-            )
-        else:
-            self.tokenizer: PreTrainedTokenizer = tokenizer
-        self.model_type = MODEL_MODES[mode]
-        if model is None:
-            self.model = self.model_type.from_pretrained(
-                self.hparams.model_name_or_path,
-                from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
-                config=self.config,
-                cache_dir=cache_dir,
-            )
-        else:
-            self.model = model
-
-    def load_hf_checkpoint(self, *args, **kwargs):
-        self.model = self.model_type.from_pretrained(*args, **kwargs)
-
-    def get_lr_scheduler(self):
-        get_schedule_func = arg_to_scheduler[self.hparams.lr_scheduler]
-        scheduler = get_schedule_func(
-            self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.total_steps()
-        )
-        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
-        return scheduler
-
-    def configure_optimizers(self):
-        """Prepare optimizer and schedule (linear warmup and decay)"""
-        model = self.model
-        no_decay = ["bias", "LayerNorm.weight"]
-        optimizer_grouped_parameters = [
-            {
-                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
-                "weight_decay": self.hparams.weight_decay,
-            },
-            {
-                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
-                "weight_decay": 0.0,
-            },
-        ]
-        if self.hparams.adafactor:
-            optimizer = Adafactor(
-                optimizer_grouped_parameters, lr=self.hparams.learning_rate, scale_parameter=False, relative_step=False
-            )
-
-        else:
-            optimizer = torch.optim.AdamW(
-                optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
-            )
-        self.opt = optimizer
-
-        scheduler = self.get_lr_scheduler()
-
-        return [optimizer], [scheduler]
-
-    def test_step(self, batch, batch_nb):
-        return self.validation_step(batch, batch_nb)
-
-    def test_epoch_end(self, outputs):
-        return self.validation_end(outputs)
-
-    def total_steps(self) -> int:
-        """The number of total training steps that will be run. Used for lr scheduler purposes."""
-        num_devices = max(1, self.hparams.gpus)  # TODO: consider num_tpu_cores
-        effective_batch_size = self.hparams.train_batch_size * self.hparams.accumulate_grad_batches * num_devices
-        return (self.dataset_size / effective_batch_size) * self.hparams.max_epochs
-
-    def setup(self, mode):
-        if mode == "test":
-            self.dataset_size = len(self.test_dataloader().dataset)
-        else:
-            self.train_loader = self.get_dataloader("train", self.hparams.train_batch_size, shuffle=True)
-            self.dataset_size = len(self.train_dataloader().dataset)
-
-    def get_dataloader(self, type_path: str, batch_size: int, shuffle: bool = False):
-        raise NotImplementedError("You must implement this for your task")
-
-    def train_dataloader(self):
-        return self.train_loader
-
-    def val_dataloader(self):
-        return self.get_dataloader("dev", self.hparams.eval_batch_size, shuffle=False)
-
-    def test_dataloader(self):
-        return self.get_dataloader("test", self.hparams.eval_batch_size, shuffle=False)
-
-    def _feature_file(self, mode):
-        return os.path.join(
-            self.hparams.data_dir,
-            "cached_{}_{}_{}".format(
-                mode,
-                list(filter(None, self.hparams.model_name_or_path.split("/"))).pop(),
-                str(self.hparams.max_seq_length),
-            ),
-        )
-
-    @pl.utilities.rank_zero_only
-    def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
-        save_path = self.output_dir.joinpath("best_tfmr")
-        self.model.config.save_step = self.step_count
-        self.model.save_pretrained(save_path)
-        self.tokenizer.save_pretrained(save_path)
-
-    @staticmethod
-    def add_model_specific_args(parser, root_dir):
-        parser.add_argument(
-            "--model_name_or_path",
-            default=None,
-            type=str,
-            required=True,
-            help="Path to pretrained model or model identifier from huggingface.co/models",
-        )
-        parser.add_argument(
-            "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
-        )
-        parser.add_argument(
-            "--tokenizer_name",
-            default=None,
-            type=str,
-            help="Pretrained tokenizer name or path if not the same as model_name",
-        )
-        parser.add_argument(
-            "--cache_dir",
-            default="",
-            type=str,
-            help="Where do you want to store the pre-trained models downloaded from huggingface.co",
-        )
-        parser.add_argument(
-            "--encoder_layerdrop",
-            type=float,
-            help="Encoder layer dropout probability (Optional). Goes into model.config",
-        )
-        parser.add_argument(
-            "--decoder_layerdrop",
-            type=float,
-            help="Decoder layer dropout probability (Optional). Goes into model.config",
-        )
-        parser.add_argument(
-            "--dropout",
-            type=float,
-            help="Dropout probability (Optional). Goes into model.config",
-        )
-        parser.add_argument(
-            "--attention_dropout",
-            type=float,
-            help="Attention dropout probability (Optional). Goes into model.config",
-        )
-        parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
-        parser.add_argument(
-            "--lr_scheduler",
-            default="linear",
-            choices=arg_to_scheduler_choices,
-            metavar=arg_to_scheduler_metavar,
-            type=str,
-            help="Learning rate scheduler",
-        )
-        parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
-        parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
-        parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
-        parser.add_argument("--num_workers", default=4, type=int, help="kwarg passed to DataLoader")
-        parser.add_argument("--num_train_epochs", dest="max_epochs", default=3, type=int)
-        parser.add_argument("--train_batch_size", default=32, type=int)
-        parser.add_argument("--eval_batch_size", default=32, type=int)
-        parser.add_argument("--adafactor", action="store_true")
-
-
-class LoggingCallback(pl.Callback):
-    def on_batch_end(self, trainer, pl_module):
-        lr_scheduler = trainer.lr_schedulers[0]["scheduler"]
-        lrs = {f"lr_group_{i}": lr for i, lr in enumerate(lr_scheduler.get_lr())}
-        pl_module.logger.log_metrics(lrs)
-
-    def on_validation_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-        rank_zero_info("***** Validation results *****")
-        metrics = trainer.callback_metrics
-        # Log results
-        for key in sorted(metrics):
-            if key not in ["log", "progress_bar"]:
-                rank_zero_info(f"{key} = {str(metrics[key])}\n")
-
-    def on_test_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule):
-        rank_zero_info("***** Test results *****")
-        metrics = trainer.callback_metrics
-        # Log and save results to file
-        output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
-        with open(output_test_results_file, "w") as writer:
-            for key in sorted(metrics):
-                if key not in ["log", "progress_bar"]:
-                    rank_zero_info(f"{key} = {str(metrics[key])}\n")
-                    writer.write(f"{key} = {str(metrics[key])}\n")
-
-
-def add_generic_args(parser, root_dir) -> None:
-    #  To allow all pl args uncomment the following line
-    #  parser = pl.Trainer.add_argparse_args(parser)
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-    parser.add_argument(
-        "--fp16",
-        action="store_true",
-        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
-    )
-
-    parser.add_argument(
-        "--fp16_opt_level",
-        type=str,
-        default="O2",
-        help=(
-            "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. "
-            "See details at https://nvidia.github.io/apex/amp.html"
-        ),
-    )
-    parser.add_argument("--n_tpu_cores", dest="tpu_cores", type=int)
-    parser.add_argument("--max_grad_norm", dest="gradient_clip_val", default=1.0, type=float, help="Max gradient norm")
-    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
-    parser.add_argument("--do_predict", action="store_true", help="Whether to run predictions on the test set.")
-    parser.add_argument(
-        "--gradient_accumulation_steps",
-        dest="accumulate_grad_batches",
-        type=int,
-        default=1,
-        help="Number of updates steps to accumulate before performing a backward/update pass.",
-    )
-    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.",
-    )
-
-
-def generic_train(
-    model: BaseTransformer,
-    args: argparse.Namespace,
-    early_stopping_callback=None,
-    logger=True,  # can pass WandbLogger() here
-    extra_callbacks=[],
-    checkpoint_callback=None,
-    logging_callback=None,
-    **extra_train_kwargs,
-):
-    pl.seed_everything(args.seed)
-
-    # init model
-    odir = Path(model.hparams.output_dir)
-    odir.mkdir(exist_ok=True)
-
-    # add custom checkpoints
-    if checkpoint_callback is None:
-        checkpoint_callback = pl.callbacks.ModelCheckpoint(
-            filepath=args.output_dir, prefix="checkpoint", monitor="val_loss", mode="min", save_top_k=1
-        )
-    if early_stopping_callback:
-        extra_callbacks.append(early_stopping_callback)
-    if logging_callback is None:
-        logging_callback = LoggingCallback()
-
-    train_params = {}
-
-    # TODO: remove with PyTorch 1.6 since pl uses native amp
-    if args.fp16:
-        train_params["precision"] = 16
-        train_params["amp_level"] = args.fp16_opt_level
-
-    if args.gpus > 1:
-        train_params["distributed_backend"] = "ddp"
-
-    train_params["accumulate_grad_batches"] = args.accumulate_grad_batches
-    train_params["accelerator"] = extra_train_kwargs.get("accelerator")
-    train_params["profiler"] = extra_train_kwargs.get("profiler")
-
-    trainer = pl.Trainer.from_argparse_args(
-        args,
-        weights_summary=None,
-        callbacks=[logging_callback] + extra_callbacks,
-        logger=logger,
-        checkpoint_callback=checkpoint_callback,
-        **train_params,
-    )
-
-    if args.do_train:
-        trainer.fit(model)
-
-    return trainer
--- a/examples/legacy/pytorch-lightning/requirements.txt
+++ b/examples/legacy/pytorch-lightning/requirements.txt
@ -1,21 +0,0 @@
-tensorboard
-scikit-learn
-seqeval
-psutil
-sacrebleu
-rouge-score
-tensorflow_datasets
-matplotlib
-git-python==1.0.3
-faiss-cpu
-streamlit
-elasticsearch
-nltk
-pandas
-datasets >= 1.1.3
-fire
-pytest<8.0.1
-conllu
-sentencepiece != 0.1.92
-protobuf
-ray
--- a/examples/legacy/pytorch-lightning/run_glue.py
+++ b/examples/legacy/pytorch-lightning/run_glue.py
@ -1,201 +0,0 @@
-import argparse
-import glob
-import logging
-import os
-import time
-from argparse import Namespace
-
-import numpy as np
-import torch
-from lightning_base import BaseTransformer, add_generic_args, generic_train
-from torch.utils.data import DataLoader, TensorDataset
-
-from transformers import glue_compute_metrics as compute_metrics
-from transformers import glue_convert_examples_to_features as convert_examples_to_features
-from transformers import glue_output_modes, glue_tasks_num_labels
-from transformers import glue_processors as processors
-
-
-logger = logging.getLogger(__name__)
-
-
-class GLUETransformer(BaseTransformer):
-    mode = "sequence-classification"
-
-    def __init__(self, hparams):
-        if isinstance(hparams, dict):
-            hparams = Namespace(**hparams)
-        hparams.glue_output_mode = glue_output_modes[hparams.task]
-        num_labels = glue_tasks_num_labels[hparams.task]
-
-        super().__init__(hparams, num_labels, self.mode)
-
-    def forward(self, **inputs):
-        return self.model(**inputs)
-
-    def training_step(self, batch, batch_idx):
-        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-
-        if self.config.model_type not in ["distilbert", "bart"]:
-            inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
-
-        outputs = self(**inputs)
-        loss = outputs[0]
-
-        lr_scheduler = self.trainer.lr_schedulers[0]["scheduler"]
-        tensorboard_logs = {"loss": loss, "rate": lr_scheduler.get_last_lr()[-1]}
-        return {"loss": loss, "log": tensorboard_logs}
-
-    def prepare_data(self):
-        "Called to initialize data. Use the call to construct features"
-        args = self.hparams
-        processor = processors[args.task]()
-        self.labels = processor.get_labels()
-
-        for mode in ["train", "dev"]:
-            cached_features_file = self._feature_file(mode)
-            if os.path.exists(cached_features_file) and not args.overwrite_cache:
-                logger.info("Loading features from cached file %s", cached_features_file)
-            else:
-                logger.info("Creating features from dataset file at %s", args.data_dir)
-                examples = (
-                    processor.get_dev_examples(args.data_dir)
-                    if mode == "dev"
-                    else processor.get_train_examples(args.data_dir)
-                )
-                features = convert_examples_to_features(
-                    examples,
-                    self.tokenizer,
-                    max_length=args.max_seq_length,
-                    label_list=self.labels,
-                    output_mode=args.glue_output_mode,
-                )
-                logger.info("Saving features into cached file %s", cached_features_file)
-                torch.save(features, cached_features_file)
-
-    def get_dataloader(self, mode: str, batch_size: int, shuffle: bool = False) -> DataLoader:
-        "Load datasets. Called after prepare data."
-
-        # We test on dev set to compare to benchmarks without having to submit to GLUE server
-        mode = "dev" if mode == "test" else mode
-
-        cached_features_file = self._feature_file(mode)
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file, weights_only=True)
-        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-        if self.hparams.glue_output_mode == "classification":
-            all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-        elif self.hparams.glue_output_mode == "regression":
-            all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-
-        return DataLoader(
-            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels),
-            batch_size=batch_size,
-            shuffle=shuffle,
-        )
-
-    def validation_step(self, batch, batch_idx):
-        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-
-        if self.config.model_type not in ["distilbert", "bart"]:
-            inputs["token_type_ids"] = batch[2] if self.config.model_type in ["bert", "xlnet", "albert"] else None
-
-        outputs = self(**inputs)
-        tmp_eval_loss, logits = outputs[:2]
-        preds = logits.detach().cpu().numpy()
-        out_label_ids = inputs["labels"].detach().cpu().numpy()
-
-        return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
-
-    def _eval_end(self, outputs) -> tuple:
-        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean().detach().cpu().item()
-        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
-
-        if self.hparams.glue_output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif self.hparams.glue_output_mode == "regression":
-            preds = np.squeeze(preds)
-
-        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
-        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
-        preds_list = [[] for _ in range(out_label_ids.shape[0])]
-
-        results = {"val_loss": val_loss_mean, **compute_metrics(self.hparams.task, preds, out_label_ids)}
-
-        ret = dict(results.items())
-        ret["log"] = results
-        return ret, preds_list, out_label_list
-
-    def validation_epoch_end(self, outputs: list) -> dict:
-        ret, preds, targets = self._eval_end(outputs)
-        logs = ret["log"]
-        return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
-
-    def test_epoch_end(self, outputs) -> dict:
-        ret, predictions, targets = self._eval_end(outputs)
-        logs = ret["log"]
-        # `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
-        return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
-
-    @staticmethod
-    def add_model_specific_args(parser, root_dir):
-        BaseTransformer.add_model_specific_args(parser, root_dir)
-        parser.add_argument(
-            "--max_seq_length",
-            default=128,
-            type=int,
-            help=(
-                "The maximum total input sequence length after tokenization. Sequences longer "
-                "than this will be truncated, sequences shorter will be padded."
-            ),
-        )
-
-        parser.add_argument(
-            "--task",
-            default="",
-            type=str,
-            required=True,
-            help="The GLUE task to run",
-        )
-        parser.add_argument(
-            "--gpus",
-            default=0,
-            type=int,
-            help="The number of GPUs allocated for this, it is by default 0 meaning none",
-        )
-
-        parser.add_argument(
-            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-        )
-
-        return parser
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    add_generic_args(parser, os.getcwd())
-    parser = GLUETransformer.add_model_specific_args(parser, os.getcwd())
-    args = parser.parse_args()
-
-    # If output_dir not provided, a folder will be generated in pwd
-    if args.output_dir is None:
-        args.output_dir = os.path.join(
-            "./results",
-            f"{args.task}_{time.strftime('%Y%m%d_%H%M%S')}",
-        )
-        os.makedirs(args.output_dir)
-
-    model = GLUETransformer(args)
-    trainer = generic_train(model, args)
-
-    # Optionally, predict on dev set and write to output_dir
-    if args.do_predict:
-        checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
-        model = model.load_from_checkpoint(checkpoints[-1])
-        return trainer.test(model)
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/legacy/pytorch-lightning/run_glue.sh
+++ b/examples/legacy/pytorch-lightning/run_glue.sh
@ -1,34 +0,0 @@
-# Install example requirements
-pip install -r ../requirements.txt
-
-# Download glue data
-python3 ../../utils/download_glue_data.py
-
-export TASK=mrpc
-export DATA_DIR=./glue_data/MRPC/
-export MAX_LENGTH=128
-export LEARNING_RATE=2e-5
-export BERT_MODEL=bert-base-cased
-export BATCH_SIZE=32
-export NUM_EPOCHS=3
-export SEED=2
-export OUTPUT_DIR_NAME=mrpc-pl-bert
-export CURRENT_DIR=${PWD}
-export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
-
-# Make output directory if it doesn't exist
-mkdir -p $OUTPUT_DIR
-# Add parent directory to python path to access lightning_base.py
-export PYTHONPATH="../":"${PYTHONPATH}"
-
-python3 run_glue.py --gpus 1 --data_dir $DATA_DIR \
--task $TASK \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--learning_rate $LEARNING_RATE \
--num_train_epochs $NUM_EPOCHS \
--train_batch_size $BATCH_SIZE \
--seed $SEED \
--do_train \
--do_predict
--- a/examples/legacy/pytorch-lightning/run_ner.py
+++ b/examples/legacy/pytorch-lightning/run_ner.py
@ -1,216 +0,0 @@
-import argparse
-import glob
-import logging
-import os
-from argparse import Namespace
-from importlib import import_module
-
-import numpy as np
-import torch
-from lightning_base import BaseTransformer, add_generic_args, generic_train
-from seqeval.metrics import accuracy_score, f1_score, precision_score, recall_score
-from torch.nn import CrossEntropyLoss
-from torch.utils.data import DataLoader, TensorDataset
-from utils_ner import TokenClassificationTask
-
-
-logger = logging.getLogger(__name__)
-
-
-class NERTransformer(BaseTransformer):
-    """
-    A training module for NER. See BaseTransformer for the core options.
-    """
-
-    mode = "token-classification"
-
-    def __init__(self, hparams):
-        if isinstance(hparams, dict):
-            hparams = Namespace(**hparams)
-        module = import_module("tasks")
-        try:
-            token_classification_task_clazz = getattr(module, hparams.task_type)
-            self.token_classification_task: TokenClassificationTask = token_classification_task_clazz()
-        except AttributeError:
-            raise ValueError(
-                f"Task {hparams.task_type} needs to be defined as a TokenClassificationTask subclass in {module}. "
-                f"Available tasks classes are: {TokenClassificationTask.__subclasses__()}"
-            )
-        self.labels = self.token_classification_task.get_labels(hparams.labels)
-        self.pad_token_label_id = CrossEntropyLoss().ignore_index
-        super().__init__(hparams, len(self.labels), self.mode)
-
-    def forward(self, **inputs):
-        return self.model(**inputs)
-
-    def training_step(self, batch, batch_num):
-        "Compute loss and log."
-        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-        if self.config.model_type != "distilbert":
-            inputs["token_type_ids"] = (
-                batch[2] if self.config.model_type in ["bert", "xlnet"] else None
-            )  # XLM and RoBERTa don"t use token_type_ids
-
-        outputs = self(**inputs)
-        loss = outputs[0]
-        # tensorboard_logs = {"loss": loss, "rate": self.lr_scheduler.get_last_lr()[-1]}
-        return {"loss": loss}
-
-    def prepare_data(self):
-        "Called to initialize data. Use the call to construct features"
-        args = self.hparams
-        for mode in ["train", "dev", "test"]:
-            cached_features_file = self._feature_file(mode)
-            if os.path.exists(cached_features_file) and not args.overwrite_cache:
-                logger.info("Loading features from cached file %s", cached_features_file)
-                features = torch.load(cached_features_file, weights_only=True)
-            else:
-                logger.info("Creating features from dataset file at %s", args.data_dir)
-                examples = self.token_classification_task.read_examples_from_file(args.data_dir, mode)
-                features = self.token_classification_task.convert_examples_to_features(
-                    examples,
-                    self.labels,
-                    args.max_seq_length,
-                    self.tokenizer,
-                    cls_token_at_end=bool(self.config.model_type == "xlnet"),
-                    cls_token=self.tokenizer.cls_token,
-                    cls_token_segment_id=2 if self.config.model_type == "xlnet" else 0,
-                    sep_token=self.tokenizer.sep_token,
-                    sep_token_extra=False,
-                    pad_on_left=bool(self.config.model_type == "xlnet"),
-                    pad_token=self.tokenizer.pad_token_id,
-                    pad_token_segment_id=self.tokenizer.pad_token_type_id,
-                    pad_token_label_id=self.pad_token_label_id,
-                )
-                logger.info("Saving features into cached file %s", cached_features_file)
-                torch.save(features, cached_features_file)
-
-    def get_dataloader(self, mode: int, batch_size: int, shuffle: bool = False) -> DataLoader:
-        "Load datasets. Called after prepare data."
-        cached_features_file = self._feature_file(mode)
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file, weights_only=True)
-        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        if features[0].token_type_ids is not None:
-            all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-        else:
-            all_token_type_ids = torch.tensor([0 for f in features], dtype=torch.long)
-            # HACK(we will not use this anymore soon)
-        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
-        return DataLoader(
-            TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_label_ids), batch_size=batch_size
-        )
-
-    def validation_step(self, batch, batch_nb):
-        """Compute validation""" ""
-        inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-        if self.config.model_type != "distilbert":
-            inputs["token_type_ids"] = (
-                batch[2] if self.config.model_type in ["bert", "xlnet"] else None
-            )  # XLM and RoBERTa don"t use token_type_ids
-        outputs = self(**inputs)
-        tmp_eval_loss, logits = outputs[:2]
-        preds = logits.detach().cpu().numpy()
-        out_label_ids = inputs["labels"].detach().cpu().numpy()
-        return {"val_loss": tmp_eval_loss.detach().cpu(), "pred": preds, "target": out_label_ids}
-
-    def _eval_end(self, outputs):
-        "Evaluation called for both Val and Test"
-        val_loss_mean = torch.stack([x["val_loss"] for x in outputs]).mean()
-        preds = np.concatenate([x["pred"] for x in outputs], axis=0)
-        preds = np.argmax(preds, axis=2)
-        out_label_ids = np.concatenate([x["target"] for x in outputs], axis=0)
-
-        label_map = dict(enumerate(self.labels))
-        out_label_list = [[] for _ in range(out_label_ids.shape[0])]
-        preds_list = [[] for _ in range(out_label_ids.shape[0])]
-
-        for i in range(out_label_ids.shape[0]):
-            for j in range(out_label_ids.shape[1]):
-                if out_label_ids[i, j] != self.pad_token_label_id:
-                    out_label_list[i].append(label_map[out_label_ids[i][j]])
-                    preds_list[i].append(label_map[preds[i][j]])
-
-        results = {
-            "val_loss": val_loss_mean,
-            "accuracy_score": accuracy_score(out_label_list, preds_list),
-            "precision": precision_score(out_label_list, preds_list),
-            "recall": recall_score(out_label_list, preds_list),
-            "f1": f1_score(out_label_list, preds_list),
-        }
-
-        ret = dict(results.items())
-        ret["log"] = results
-        return ret, preds_list, out_label_list
-
-    def validation_epoch_end(self, outputs):
-        # when stable
-        ret, preds, targets = self._eval_end(outputs)
-        logs = ret["log"]
-        return {"val_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
-
-    def test_epoch_end(self, outputs):
-        # updating to test_epoch_end instead of deprecated test_end
-        ret, predictions, targets = self._eval_end(outputs)
-
-        # Converting to the dict required by pl
-        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master/\
-        # pytorch_lightning/trainer/logging.py#L139
-        logs = ret["log"]
-        # `val_loss` is the key returned by `self._eval_end()` but actually refers to `test_loss`
-        return {"avg_test_loss": logs["val_loss"], "log": logs, "progress_bar": logs}
-
-    @staticmethod
-    def add_model_specific_args(parser, root_dir):
-        # Add NER specific options
-        BaseTransformer.add_model_specific_args(parser, root_dir)
-        parser.add_argument(
-            "--task_type", default="NER", type=str, help="Task type to fine tune in training (e.g. NER, POS, etc)"
-        )
-        parser.add_argument(
-            "--max_seq_length",
-            default=128,
-            type=int,
-            help=(
-                "The maximum total input sequence length after tokenization. Sequences longer "
-                "than this will be truncated, sequences shorter will be padded."
-            ),
-        )
-
-        parser.add_argument(
-            "--labels",
-            default="",
-            type=str,
-            help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.",
-        )
-        parser.add_argument(
-            "--gpus",
-            default=0,
-            type=int,
-            help="The number of GPUs allocated for this, it is by default 0 meaning none",
-        )
-
-        parser.add_argument(
-            "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
-        )
-
-        return parser
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    add_generic_args(parser, os.getcwd())
-    parser = NERTransformer.add_model_specific_args(parser, os.getcwd())
-    args = parser.parse_args()
-    model = NERTransformer(args)
-    trainer = generic_train(model, args)
-
-    if args.do_predict:
-        # See https://github.com/huggingface/transformers/issues/3159
-        # pl use this default format to create a checkpoint:
-        # https://github.com/PyTorchLightning/pytorch-lightning/blob/master\
-        # /pytorch_lightning/callbacks/model_checkpoint.py#L322
-        checkpoints = sorted(glob.glob(os.path.join(args.output_dir, "checkpoint-epoch=*.ckpt"), recursive=True))
-        model = model.load_from_checkpoint(checkpoints[-1])
-        trainer.test(model)
--- a/examples/legacy/pytorch-lightning/run_ner.sh
+++ b/examples/legacy/pytorch-lightning/run_ner.sh
@ -1,44 +0,0 @@
-#!/usr/bin/env bash
-
-# for seqeval metrics import
-pip install -r ../requirements.txt
-
-## The relevant files are currently on a shared Google
-## drive at https://drive.google.com/drive/folders/1kC0I2UGl2ltrluI9NqDjaQJGw5iliw_J
-## Monitor for changes and eventually migrate to use the `datasets` library
-curl -L 'https://drive.google.com/uc?export=download&id=1Jjhbal535VVz2ap4v4r_rN1UEHTdLK5P' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
-curl -L 'https://drive.google.com/uc?export=download&id=1ZfRcQThdtAR5PPRjIDtrVP7BtXSCUBbm' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
-curl -L 'https://drive.google.com/uc?export=download&id=1u9mb7kNJHWQCWyweMDRMuTFoOHOfeBTH' \
-| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
-
-export MAX_LENGTH=128
-export BERT_MODEL=bert-base-multilingual-cased
-python3 scripts/preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
-python3 scripts/preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
-python3 scripts/preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
-cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
-export BATCH_SIZE=32
-export NUM_EPOCHS=3
-export SEED=1
-
-export OUTPUT_DIR_NAME=germeval-model
-export CURRENT_DIR=${PWD}
-export OUTPUT_DIR=${CURRENT_DIR}/${OUTPUT_DIR_NAME}
-mkdir -p $OUTPUT_DIR
-
-# Add parent directory to python path to access lightning_base.py
-export PYTHONPATH="../":"${PYTHONPATH}"
-
-python3 run_ner.py --data_dir ./ \
--labels ./labels.txt \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--train_batch_size $BATCH_SIZE \
--seed $SEED \
--gpus 1 \
--do_train \
--do_predict
--- a/examples/legacy/pytorch-lightning/run_pos.sh
+++ b/examples/legacy/pytorch-lightning/run_pos.sh
@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-if ! [ -f ./dev.txt ]; then
-  echo "Download dev dataset...."
-  curl -L -o ./dev.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-dev.conllu'
-fi
-
-if ! [ -f ./test.txt ]; then
-  echo "Download test dataset...."
-  curl -L -o ./test.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-test.conllu'
-fi
-
-if ! [ -f ./train.txt ]; then
-  echo "Download train dataset...."
-  curl -L -o ./train.txt 'https://github.com/UniversalDependencies/UD_English-EWT/raw/master/en_ewt-ud-train.conllu'
-fi
-
-export MAX_LENGTH=200
-export BERT_MODEL=bert-base-uncased
-export OUTPUT_DIR=postagger-model
-export BATCH_SIZE=32
-export NUM_EPOCHS=3
-export SAVE_STEPS=750
-export SEED=1
-
-
-# Add parent directory to python path to access lightning_base.py
-export PYTHONPATH="../":"${PYTHONPATH}"
-
-python3 run_ner.py --data_dir ./ \
--task_type POS \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--train_batch_size $BATCH_SIZE \
--seed $SEED \
--gpus 1 \
--do_train \
--do_predict
--- a/examples/legacy/question-answering/README.md
+++ b/examples/legacy/question-answering/README.md
@ -1,126 +0,0 @@
-#### Fine-tuning BERT on SQuAD1.0 with relative position embeddings
-
-The following examples show how to fine-tune BERT models with different relative position embeddings. The BERT model 
-`google-bert/bert-base-uncased` was pretrained with default absolute position embeddings. We provide the following pretrained 
-models which were pre-trained on the same training data (BooksCorpus and English Wikipedia) as in the BERT model 
-training, but with different relative position embeddings. 
-
-* `zhiheng-huang/bert-base-uncased-embedding-relative-key`, trained from scratch with relative embedding proposed by 
-Shaw et al., [Self-Attention with Relative Position Representations](https://huggingface.co/papers/1803.02155)
-* `zhiheng-huang/bert-base-uncased-embedding-relative-key-query`, trained from scratch with relative embedding method 4 
-in Huang et al. [Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
-* `zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query`, fine-tuned from model 
-`google-bert/bert-large-uncased-whole-word-masking` with 3 additional epochs with relative embedding method 4 in Huang et al. 
-[Improve Transformer Models with Better Relative Position Embeddings](https://huggingface.co/papers/2009.13658)
-
-
-##### Base models fine-tuning
-
-```bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
-    --model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
-    --dataset_name squad \
-    --do_train \
-    --do_eval \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 512 \
-    --doc_stride 128 \
-    --output_dir relative_squad \
-    --per_device_eval_batch_size=60 \
-    --per_device_train_batch_size=6
-```
-Training with the above command leads to the following results. It boosts the BERT default from f1 score of 88.52 to 90.54.
-
-```bash
-'exact': 83.6802270577105, 'f1': 90.54772098174814
-```
-
-The change of `max_seq_length` from 512 to 384 in the above command leads to the f1 score of 90.34. Replacing the above 
-model `zhiheng-huang/bert-base-uncased-embedding-relative-key-query` with 
-`zhiheng-huang/bert-base-uncased-embedding-relative-key` leads to the f1 score of 89.51. The changing of 8 gpus to one 
-gpu training leads to the f1 score of 90.71.
-
-##### Large models fine-tuning
-
-```bash
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
-    --model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
-    --dataset_name squad \
-    --do_train \
-    --do_eval \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 512 \
-    --doc_stride 128 \
-    --output_dir relative_squad \
-    --per_gpu_eval_batch_size=6 \
-    --per_gpu_train_batch_size=2 \
-    --gradient_accumulation_steps 3
-```
-Training with the above command leads to the f1 score of 93.52, which is slightly better than the f1 score of 93.15 for 
-`google-bert/bert-large-uncased-whole-word-masking`.
-
-#### Distributed training
-
-Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
-
-```bash
-torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
-    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
-    --dataset_name squad \
-    --do_train \
-    --do_eval \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ./examples/models/wwm_uncased_finetuned_squad/ \
-    --per_device_eval_batch_size=3   \
-    --per_device_train_batch_size=3   \
-```
-
-Training with the previously defined hyper-parameters yields the following results:
-
-```bash
-f1 = 93.15
-exact_match = 86.91
-```
-
-This fine-tuned model is available as a checkpoint under the reference
-[`google-bert/bert-large-uncased-whole-word-masking-finetuned-squad`](https://huggingface.co/google-bert/bert-large-uncased-whole-word-masking-finetuned-squad).
-
-## Results
-
-Larger batch size may improve the performance while costing more memory.
-
-##### Results for SQuAD1.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 85.45884578997162,
-"f1": 92.5974600601065,
-"total": 10570,
-"HasAns_exact": 85.45884578997162,
-"HasAns_f1": 92.59746006010651,
-"HasAns_total": 10570
-}
-```
-
-##### Results for SQuAD2.0 with the previously defined hyper-parameters:
-
-```python
-{
-"exact": 80.4177545691906,
-"f1": 84.07154997729623,
-"total": 11873,
-"HasAns_exact": 76.73751686909581,
-"HasAns_f1": 84.05558584352873,
-"HasAns_total": 5928,
-"NoAns_exact": 84.0874684608915,
-"NoAns_f1": 84.0874684608915,
-"NoAns_total": 5945
-}
-```
--- a/Show More
+++ b/Show More