fix

try 1
2025-11-04 20:14:36 +08:00 · 2025-06-25 17:28:16 +02:00 · 2025-06-25 17:06:35 +02:00 · 2025-06-25 17:04:02 +02:00
760 changed files with 18665 additions and 42246 deletions
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,7 +41,7 @@ jobs:
  check_new_failures:
    name: " "
    runs-on:
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g5-4xlarge-cache
+      group: aws-g4dn-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -1,157 +0,0 @@
-name: Get PR commit SHA
-on:
-  workflow_call:
-    inputs:
-      pr_number:
-        required: true
-        type: string
-    outputs:
-      PR_HEAD_REPO_FULL_NAME:
-        description: "The full name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
-      PR_BASE_REPO_FULL_NAME:
-        description: "The full name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
-      PR_HEAD_REPO_OWNER:
-        description: "The owner of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
-      PR_BASE_REPO_OWNER:
-        description: "The owner of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
-      PR_HEAD_REPO_NAME:
-        description: "The name of the repository from which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
-      PR_BASE_REPO_NAME:
-        description: "The name of the repository to which the pull request is created"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
-      PR_HEAD_REF:
-        description: "The branch name of the pull request in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
-      PR_BASE_REF:
-        description: "The branch name in the base repository (to merge into)"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
-      PR_HEAD_SHA:
-        description: "The head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
-      PR_BASE_SHA:
-        description: "The head sha of the target branch in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
-      PR_MERGE_COMMIT_SHA:
-        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
-      PR_HEAD_COMMIT_DATE:
-        description: "The date of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
-      PR_MERGE_COMMIT_DATE:
-        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_HEAD_COMMIT_TIMESTAMP:
-        description: "The timestamp of the head sha of the pull request branch in the head repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
-      PR_MERGE_COMMIT_TIMESTAMP:
-        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
-        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-      PR:
-        description: "The PR"
-        value: ${{ jobs.get-pr-info.outputs.PR }}
-      PR_FILES:
-        description: "The files touched in the PR"
-        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
-
-
-jobs:
-  get-pr-info:
-    runs-on: ubuntu-22.04
-    name: Get PR commit SHA better
-    outputs:
-      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
-      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
-      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
-      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
-      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
-      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
-      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
-      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
-      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
-      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
-      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
-      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
-      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
-      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
-      PR: ${{ steps.pr_info.outputs.pr }}
-      PR_FILES: ${{ steps.pr_info.outputs.files }}
-    if: ${{ inputs.pr_number != '' }}
-    steps:
-      - name: Extract PR details
-        id: pr_info
-        uses: actions/github-script@v6
-        with:
-          script: |            
-            const { data: pr } = await github.rest.pulls.get({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            const { data: head_commit }  = await github.rest.repos.getCommit({
-              owner: pr.head.repo.owner.login,
-              repo: pr.head.repo.name,
-              ref: pr.head.ref
-            });
-
-            const { data: merge_commit }  = await github.rest.repos.getCommit({
-              owner: pr.base.repo.owner.login,
-              repo: pr.base.repo.name,
-              ref: pr.merge_commit_sha,
-            });
-
-            const { data: files } = await github.rest.pulls.listFiles({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              pull_number: ${{ inputs.pr_number }}
-            });
-
-            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
-            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
-            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
-            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
-            core.setOutput('head_repo_name', pr.head.repo.name);
-            core.setOutput('base_repo_name', pr.base.repo.name);
-            core.setOutput('head_ref', pr.head.ref);
-            core.setOutput('base_ref', pr.base.ref);
-            core.setOutput('head_sha', pr.head.sha);
-            core.setOutput('base_sha', pr.base.sha);
-            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
-            core.setOutput('pr', pr);
-
-            core.setOutput('head_commit_date', head_commit.commit.committer.date);
-            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
-            
-            core.setOutput('files', files);            
-            
-            console.log('PR head commit:', {
-              head_commit: head_commit,
-              commit: head_commit.commit,
-              date: head_commit.commit.committer.date
-            });
-
-            console.log('PR merge commit:', {
-              merge_commit: merge_commit,
-              commit: merge_commit.commit,
-              date: merge_commit.commit.committer.date
-            });
-
-      - name: Convert dates to timestamps
-        id: get_timestamps
-        run: |
-          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
-          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
-          echo $head_commit_date
-          echo $merge_commit_date
-          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
-          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
-          echo $head_commit_timestamp
-          echo $merge_commit_timestamp
-          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
-          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -1,36 +0,0 @@
-name: Get PR number
-on:
-  workflow_call:
-    outputs:
-      PR_NUMBER:
-        description: "The extracted PR number"
-        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
-
-jobs:
-  get-pr-number:
-    runs-on: ubuntu-22.04
-    name: Get PR number
-    outputs:
-      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
-    steps:
-      - name: Get PR number
-        shell: bash
-        run: |
-          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
-          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
-            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
-          else
-            echo "PR_NUMBER=" >> $GITHUB_ENV
-          fi
-
-      - name: Check PR number
-        shell: bash
-        run: |
-          echo "${{ env.PR_NUMBER }}"
-
-      - name: Set PR number
-        id: set_pr_number
-        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -107,9 +107,9 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@ -1,199 +0,0 @@
-name: PR slow CI
-on:
-  pull_request_target:
-    types: [opened, synchronize, reopened]
-
-jobs:
-  get-pr-number:
-    name: Get PR number
-    uses: ./.github/workflows/get-pr-number.yml
-
-  get-pr-info:
-    name: Get PR commit SHA
-    needs: get-pr-number
-    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
-    uses: ./.github/workflows/get-pr-info.yml
-    with:
-      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
-
-  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
-  verity_pr_commit:
-    name: Verity PR commit corresponds to a specific event by comparing timestamps
-    if: ${{ github.event.comment.created_at != '' }}
-    runs-on: ubuntu-22.04
-    needs: get-pr-info
-    env:
-      COMMENT_DATE: ${{ github.event.comment.created_at }}
-      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
-      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
-    steps:
-      - run: |
-          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
-          echo "COMMENT_DATE: $COMMENT_DATE"
-          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
-          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
-          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
-          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
-            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
-            exit -1;
-          fi
-
-  get-jobs:
-    name: Get test files to run
-    runs-on: ubuntu-22.04
-    needs: [get-pr-number, get-pr-info]
-    outputs:
-      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
-    steps:
-      - name: Get repository content
-        id: repo_content
-        uses: actions/github-script@v6
-        with:
-          script: |
-            const { data: tests_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            const { data: tests_models_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests/models',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            const { data: tests_quantization_dir } = await github.rest.repos.getContent({
-              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
-              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
-              path: 'tests/quantization',
-              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
-            });
-
-            core.setOutput('tests_dir', tests_dir);
-            core.setOutput('tests_models_dir', tests_models_dir);
-            core.setOutput('tests_quantization_dir', tests_quantization_dir);
-
-      # This checkout to the main branch
-      - uses: actions/checkout@v4
-        with:
-          fetch-depth: "0"
-
-      - name: Write pr_files file
-        run: |
-          cat > pr_files.txt << 'EOF'
-          ${{ needs.get-pr-info.outputs.PR_FILES }}
-          EOF
-
-      - name: Write tests_dir file
-        run: |
-          cat > tests_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_dir }}
-          EOF
-
-      - name: Write tests_models_dir file
-        run: |
-          cat > tests_models_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_models_dir }}
-          EOF
-
-      - name: Write tests_quantization_dir file
-        run: |
-          cat > tests_quantization_dir.txt << 'EOF'
-          ${{ steps.repo_content.outputs.tests_quantization_dir }}
-          EOF
-
-      - name: Run script to get jobs to run
-        id: get_jobs
-        run: |
-          python utils/get_pr_run_slow_jobs.py | tee output.txt
-          echo "jobs_to_run: $(tail -n 1 output.txt)"
-          echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
-
-  send_comment:
-    # Will delete the previous comment and send a new one if:
-    #   - either the content is changed
-    #   - or the previous comment is 30 minutes or more old
-    name: Send a comment to suggest jobs to run
-    if: ${{ needs.get-jobs.outputs.jobs != '' }}
-    needs: [get-pr-number, get-jobs]
-    permissions:
-      pull-requests: write
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Check and update comment if needed
-        uses: actions/github-script@v7
-        env:
-          BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
-        with:
-          script: |
-            const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
-            const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
-            const thirtyMinutesAgo = new Date(Date.now() - 30 * 60 * 1000); // 30 minutes ago
-            const newBody = `${commentPrefix}${process.env.BODY}`;
-            
-            // Get all comments on the PR
-            const { data: comments } = await github.rest.issues.listComments({
-              owner: context.repo.owner,
-              repo: context.repo.repo,
-              issue_number: prNumber
-            });
-            
-            // Find existing comments that start with our prefix
-            const existingComments = comments.filter(comment => 
-              comment.user.login === 'github-actions[bot]' && 
-              comment.body.startsWith(commentPrefix)
-            );
-            
-            let shouldCreateNewComment = true;
-            let commentsToDelete = [];
-            
-            if (existingComments.length > 0) {
-              // Get the most recent comment
-              const mostRecentComment = existingComments
-                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
-              
-              const commentDate = new Date(mostRecentComment.created_at);
-              const isOld = commentDate < thirtyMinutesAgo;
-              const isDifferentContent = mostRecentComment.body !== newBody;
-              
-              console.log(`Most recent comment created: ${mostRecentComment.created_at}`);
-              console.log(`Is older than 30 minutes: ${isOld}`);
-              console.log(`Has different content: ${isDifferentContent}`);
-              
-              if (isOld || isDifferentContent) {
-                // Delete all existing comments and create new one
-                commentsToDelete = existingComments;
-                console.log(`Will delete ${commentsToDelete.length} existing comment(s) and create new one`);
-              } else {
-                // Content is same and comment is recent, skip
-                shouldCreateNewComment = false;
-                console.log('Comment is recent and content unchanged, skipping update');
-              }
-            } else {
-              console.log('No existing comments found, will create new one');
-            }
-            
-            // Delete old comments if needed
-            for (const comment of commentsToDelete) {
-              console.log(`Deleting comment #${comment.id} (created: ${comment.created_at})`);
-              await github.rest.issues.deleteComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                comment_id: comment.id
-              });
-            }
-            
-            // Create new comment if needed
-            if (shouldCreateNewComment) {
-              await github.rest.issues.createComment({
-                owner: context.repo.owner,
-                repo: context.repo.repo,
-                issue_number: prNumber,
-                body: newBody
-              });
-              console.log('✅ New comment created');
-            } else {
-              console.log('ℹ️ No comment update needed');
-            }
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,9 +239,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,9 +338,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -31,7 +31,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -131,7 +131,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -169,9 +169,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -244,7 +244,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -282,9 +282,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -357,7 +357,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -395,9 +395,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -467,7 +467,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -505,9 +505,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -84,6 +84,8 @@ jobs:
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
+      report_name_prefix: run_models_gpu
+
    secrets: inherit

  run_trainer_and_fsdp_gpu:
@ -102,10 +104,11 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu
+
    secrets: inherit

-  run_pipelines_torch_gpu:
-    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
+  run_pipelines_gpu:
+    if: ${{ inputs.job == 'run_pipelines_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
@ -158,20 +161,20 @@ jobs:

      - name: Run all pipeline tests on Intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
+          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
@ -245,8 +248,8 @@ jobs:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports

-  run_torch_cuda_extensions_gpu:
-    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
+  run_deepspeed_gpu:
+    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
@ -302,20 +305,20 @@ jobs:

      - name: Run all deepspeed tests on intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports

  send_results:
    name: Slack Report
@ -324,8 +327,8 @@ jobs:
        setup,
        run_models_gpu,
        run_examples_gpu,
-        run_torch_cuda_extensions_gpu,
-        run_pipelines_torch_gpu,
+        run_pipelines_gpu,
+        run_deepspeed_gpu,
        run_trainer_and_fsdp_gpu,
      ]
    if: ${{ always() }}
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@ -23,7 +23,7 @@ jobs:
    name: Pipeline CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_pipelines_torch_gpu
+      job: run_pipelines_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
@ -47,7 +47,7 @@ jobs:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_torch_cuda_extensions_gpu
+      job: run_deepspeed_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -50,7 +50,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -128,14 +128,13 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
-      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
@ -146,7 +145,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -180,9 +179,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -214,7 +213,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -248,9 +247,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -283,7 +282,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -345,9 +344,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -382,7 +381,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -425,9 +424,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation

 ## [dalle-flow](https://github.com/jina-ai/dalle-flow)

-DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
+DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. Itt leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
 The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.

 Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge

 ## [underthesea](https://github.com/undertheseanlp/underthesea)

-[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
+[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provides extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.

 Keywords: Vietnamese, NLP

--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -26,7 +26,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability

 RUN python3 -m pip uninstall -y flax jax

--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -19,7 +19,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
 # Install **nightly** release PyTorch (flag `--pre`)
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

 # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
 RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -26,7 +26,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch';
 RUN echo torch=$VERSION
 # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
-RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

@ -93,9 +93,6 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels

-# Uninstall flash-attn installed by autoawq, it causes issues here : https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
-RUN python3 -m pip uninstall -y flash-attn
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/de/testing.md
+++ b/docs/source/de/testing.md
@ -473,6 +473,13 @@ Hier ist zum Beispiel ein Test, der nur ausgeführt werden muss, wenn 2 oder meh
 def test_example_with_multi_gpu():
 ```

+Wenn ein Test `tensorflow` benötigt, verwenden Sie den Dekorator `require_tf`. Zum Beispiel:
+
+```python no-style
+@require_tf
+def test_tf_thing_with_tensorflow():
+```
+
 Diese Dekors können gestapelt werden. Wenn zum Beispiel ein Test langsam ist und mindestens eine GPU unter pytorch benötigt, können Sie
 wie Sie ihn einrichten können:

@ -1197,6 +1204,9 @@ if torch.cuda.is_available():
 import numpy as np

 np.random.seed(seed)
+
+# tf RNG
+tf.random.set_seed(seed)
 ```

 ### Tests debuggen
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -17,12 +17,12 @@
      title: Customizing model components
    - local: model_sharing
      title: Sharing
-    - local: modular_transformers
-      title: Contributing a new model to Transformers
    - local: add_new_model
-      title: Legacy model contribution
+      title: Adding a new model to Transformers
+    - local: modular_transformers
+      title: Modular Transformers
    - local: auto_docstring
-      title: Documenting a model
+      title: Document your models
    - local: attention_interface
      title: Customizing attention function
    title: Models
@ -97,9 +97,11 @@
    - local: perf_infer_gpu_one
      title: GPU
    - local: perf_infer_gpu_multi
-      title: Distributed inference
+      title: Distributed GPU inference
    - local: perf_infer_cpu
      title: CPU
+    - local: tf_xla
+      title: XLA
    title: Optimization
  - local: agents
    title: Agents
@ -139,6 +141,8 @@
      title: GPU
    - local: perf_train_cpu
      title: CPU
+    - local: perf_train_tpu_tf
+      title: TPU
    - local: perf_train_special
      title: Apple Silicon
    - local: perf_train_gaudi
@ -429,8 +433,6 @@
        title: DiffLlama
      - local: model_doc/distilbert
        title: DistilBERT
-      - local: model_doc/doge
-        title: Doge
      - local: model_doc/dots1
        title: dots1
      - local: model_doc/dpr
@ -691,8 +693,6 @@
        title: Zamba2
      title: Text models
    - sections:
-      - local: model_doc/aimv2
-        title: Aimv2
      - local: model_doc/beit
        title: BEiT
      - local: model_doc/bit
@ -737,8 +737,6 @@
        title: EfficientFormer
      - local: model_doc/efficientnet
        title: EfficientNet
-      - local: model_doc/eomt
-        title: EoMT
      - local: model_doc/focalnet
        title: FocalNet
      - local: model_doc/glpn
@ -841,8 +839,6 @@
        title: CSM
      - local: model_doc/dac
        title: dac
-      - local: model_doc/dia
-        title: Dia
      - local: model_doc/encodec
        title: EnCodec
      - local: model_doc/fastspeech2_conformer
@ -851,7 +847,7 @@
        title: GraniteSpeech
      - local: model_doc/hubert
        title: Hubert
-      - local: model_doc/kyutai_speech_to_text
+      - local: model_doc/stt
        title: Kyutai Speech-To-Text
      - local: model_doc/mctct
        title: MCTCT
@ -961,8 +957,6 @@
        title: FLAVA
      - local: model_doc/gemma3
        title: Gemma3
-      - local: model_doc/gemma3n
-        title: Gemma3n
      - local: model_doc/git
        title: GIT
      - local: model_doc/glm4v
@ -1059,8 +1053,6 @@
        title: SigLIP
      - local: model_doc/siglip2
        title: SigLIP2
-      - local: model_doc/smollm3
-        title: SmolLM3
      - local: model_doc/smolvlm
        title: SmolVLM
      - local: model_doc/speech-encoder-decoder
@ -1144,3 +1136,4 @@
      title: Environment Variables
    title: Reference
  title: API
+
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -13,7 +13,7 @@ rendered properly in your Markdown viewer.

 -->

-# Legacy model contribution
+# Adding a new model to Transformers

 > [!TIP]
 > Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -14,9 +14,5 @@ rendered properly in your Markdown viewer.

 -->

-# Agents
-
-(deprecated)
-
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -14,26 +14,43 @@ rendered properly in your Markdown viewer.

 -->

-# Documenting a model
+# Utilizing the @auto_docstring Decorator

-The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings, and only focus on documenting new arguments.
+The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.

-This guide describes how to use the `@auto_docstring` decorator and how it works.
+---

-## @auto_docstring
+## 📜 How it Works

-Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).
+The `@auto_docstring` decorator constructs docstrings by:
+
+1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
+2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
+3.  **Overriding or Adding Arguments Descriptions:**
+    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
+    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+4.  **Adding Classes and Functions Introduction:**
+    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
+    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
+5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
+6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
+7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
+8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
+
+
+---
+
+## 🚀 How to Use @auto_docstring
+
+### 1. Importing the Decorator
+Import the decorator into your modeling file:

 ```python
 from ...utils import auto_docstring
 ```

-Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
-
-<hfoptions id="type">
-<hfoption id="classes">
-
-Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.
+### 2. Applying to Classes
+Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.

 ```python
 from transformers.modeling_utils import PreTrainedModel
@ -56,7 +73,9 @@ class MyAwesomeModel(PreTrainedModel):
    # ... other methods
 ```

-Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.
+#### Advanced Class Decoration:
+
+Arguments can be passed directly to `@auto_docstring` for more control:

 ```python
@auto_docstring(
@ -74,7 +93,7 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-You can also choose to only use `custom_intro` and define the custom arguments directly in the class.
+Or:

 ```python
@auto_docstring(
@ -92,10 +111,8 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-</hfoption>
-<hfoption id="functions">
-
-Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.
+### 3. Applying to Functions (e.g., `forward` method)
+Apply the decorator above method definitions, such as the `forward` method.

 ```python
    @auto_docstring
@ -114,10 +131,9 @@ Place `@auto_docstring` directly above the method definition. The decorator deri
        # ...
 ```

-Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the argument and the `custom_args` parameter to describe the arguments.
-
-The `Returns` and `Examples` parts of the docstring can also be manually specified.
+#### Advanced Function Decoration:

+Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:

 ```python
 MODEL_COMMON_CUSTOM_ARGS = r"""
@ -164,117 +180,100 @@ class MyModel(PreTrainedModel):
        # ...
 ```

-</hfoption>
-</hfoptions>
+---

-## Documenting arguments
+### ✍️ Documenting Arguments: Approach & Priority

-There are some rules for documenting different types of arguments and they're listed below.
-
- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`.
-
-    If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
-
-
- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
-
-    ```py
-    argument_name (`type`, *optional*, defaults to `X`):
-        Description of the argument.
-        Explain its purpose, expected shape/type if complex, and default behavior.
-        This can span multiple lines.
-    ```
+1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
+    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.

+2.  **New or Custom Arguments:**
+    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
+    * **Format:**
+        ```
+        argument_name (`type`, *optional*, defaults to `X`):
+            Description of the argument.
+            Explain its purpose, expected shape/type if complex, and default behavior.
+            This can span multiple lines.
+        ```
    * Include `type` in backticks.
-    * Add *optional* if the argument is not required or has a default value.
-    * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
+    * Add "*optional*" if the argument is not required (has a default value).
+    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).

-    These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+3.  **Overriding Standard Arguments:**
+    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
+    * The `labels` argument is often customized per model and typically requires a specific docstring.

-    ```py
-    class MyModel(PreTrainedModel):
-    # ...
-    @auto_docstring(
-        custom_intro="""
-        This is a custom introduction for the function.
-        """
-        custom_args=r"""
-        common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
-            Description of common_arg_1
-        """
-    )
-    ```
+4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
+    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.

-## Checking the docstrings
+---

-Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.
+### Usage with [modular files](./modular_transformers)

-* Ensures `@auto_docstring` is applied to relevant mode classes and public methods.
-* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments or if they lack a local description are flagged.
-* Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
-* Ensures docstrings are formatted according to the expected docstring style.
+When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:

-You can run this check locally - before committing - by running the following command.
+- **For standalone models in modular files:**
+  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
+
+- **For models inheriting from other library models:**
+  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
+  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
+
+  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+
+**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
+
+---
+
+## ✅ Checking Your Docstrings with `check_auto_docstrings`
+
+The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
+
+#### What it Checks:
+
+* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
+* **Argument Completeness & Consistency:**
+    * Flags arguments in the signature that are not known standard arguments and lack a local description.
+    * Ensures documented arguments exist in the signature. (TODO)
+    * Verifies that types and default values in the docstring match the signature. (TODO)
+* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
+* **Formatting:** Adherence to the expected docstring style.
+
+#### Running the Check Locally:
+
+Run this check locally before committing. The common command is:

 ```bash
 make fix-copies
 ```

-`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.
+Alternatively, to only perform docstrings and auto-docstring checks, you can use:

 ```bash
 python utils/check_docstrings.py # to only check files included in the diff without fixing them
-# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
-# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
 ```

-## modular_model.py files
+#### Workflow with the Checker:

-When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.
+1.  Add `@auto_docstring(...)` to the class or method.
+2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
+3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
+    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
+4.  Manually edit these placeholders with accurate types and descriptions.
+5.  Re-run the check to ensure all issues are resolved.

- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.
+---

-    If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.
+## 🔑 Key Takeaways & Best Practices

-> [!WARNING]
-> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
-
-## How it works
-
-The `@auto_docstring` decorator automatically generates docstrings by:
-
-1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
-2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
-3. Adding argument descriptions in one of two ways as shown below.
-
-    | method | description | usage |
-    |---|---|---|
-    | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
-    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
-
-4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.
-
-    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
-
-5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
-
-6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information form the model's configuration class to provide concrete examples with real model identifiers.
-
-7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
-
-    For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specifiy a custom `Returns` field in a functions docstring.
-
-8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring.
-
-    Currently only supported for [`FastImageProcessorKwargs`].
-
-## Best practices
-
-Follow the best practices below to help maintain consistent and informative documentation for Transformers!
-
-* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
-* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
-* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
+* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
+* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
+* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
 * Document new or custom arguments clearly.
 * Run `check_docstrings` locally and iteratively.
+
+By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
 import torch
 from transformers import pipeline

-pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
+pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
 pipeline(text=messages, max_new_tokens=50, return_full_text=False)
 [{'input_text': [{'role': 'system',
    'content': [{'type': 'text',
@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
-    video_fps=16,
+    video_fps=32,
    video_load_backend="decord",
 )
 print(processed_chat.keys())
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,7 +25,7 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## chat CLI
+## transformers CLI

 After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

@ -49,8 +49,7 @@ For a full list of options, run the command below.
 transformers chat -h
 ```

-The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).
-
+The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).

 ## TextGenerationPipeline

--- a/docs/source/en/feature_extractors.md
+++ b/docs/source/en/feature_extractors.md
@ -26,7 +26,6 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
 from transformers import AutoFeatureExtractor

 feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
-dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
 processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
 processed_sample
 {'input_values': [array([ 9.4472744e-05,  3.0777880e-03, -2.8888427e-03, ...,
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -44,7 +44,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

 model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
@ -59,7 +59,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

 past_key_values = DynamicCache()
@ -142,14 +142,13 @@ Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [
 For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.

 ```py
-import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

-out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "HQQ"})
+out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"axis-key": 1, "axis-value": 1, "backend": "hqq"})
 print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
 I like rock music because it's loud and energetic. It's a great way to express myself and rel
 ```
@ -160,14 +159,13 @@ I like rock music because it's loud and energetic. It's a great way to express m
 For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.

 ```py
-import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

-out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
+out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "axis-key": 0, "axis-value": 0, "backend": "quanto"})
 print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
 I like rock music because it's loud and energetic. It's a great way to express myself and rel
 ```
@ -209,14 +207,14 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map={"": 0})
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

 out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
 tokenizer.batch_decode(out, skip_special_tokens=True)[0]
 "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
 ```
-Cache offloading requires a CUDA GPU or Intel XPU.
+Cache offloading requires a CUDA GPU.

 ### Sliding window cache

@ -229,7 +227,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
-model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
 inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)

 out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
@ -308,15 +306,15 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache

 model_id = "meta-llama/Llama-2-7b-chat-hf"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"": 0})
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
 tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Init StaticCache with big enough max-length (1024 tokens for the below example)
 # You can also init a DynamicCache, if that suits you better
-prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=model.device.type, dtype=torch.bfloat16)
+prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

 INITIAL_PROMPT = "You are a helpful assistant. "
-inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(model.device.type)
+inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
 # This is the common prompt cached, we need to run forward without grad to be able to copy
 with torch.no_grad():
     prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
@ -324,7 +322,7 @@ with torch.no_grad():
 prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
 responses = []
 for prompt in prompts:
-    new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(model.device.type)
+    new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
    past_key_values = copy.deepcopy(prompt_cache)
    outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
    response = tokenizer.batch_decode(outputs)[0]
--- a/docs/source/en/model_doc/aimv2.md
+++ b/docs/source/en/model_doc/aimv2.md
@ -1,104 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# AIMv2
-
-## Overview
-
-The AIMv2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://arxiv.org/abs/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, Alaaeldin El-Nouby.
-
-The abstract from the paper is the following:
-
-*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*
-
-
-This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
-The original code can be found [here](https://github.com/apple/ml-aim).
-
-## Usage Example
-
-Here is an example of Image Feature Extraction using specific checkpoints on resized images and native resolution images:
-
-```python
-import requests
-from PIL import Image
-from transformers import AutoImageProcessor, AutoModel
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-
-processor = AutoImageProcessor.from_pretrained("apple/aimv2-large-patch14-native")
-model = AutoModel.from_pretrained("apple/aimv2-large-patch14-native")
-
-inputs = processor(images=image, return_tensors="pt")
-outputs = model(**inputs)
-```
-
-Here is an example of a checkpoint performing zero-shot classification:
-
-```python
-import requests
-from PIL import Image
-from transformers import AutoProcessor, AutoModel
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(url, stream=True).raw)
-text = ["Picture of a dog.", "Picture of a cat.", "Picture of a horse."]
-
-processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
-model = AutoModel.from_pretrained("apple/aimv2-large-patch14-224-lit")
-
-inputs = processor(
-    images=image,
-    text=text,
-    add_special_tokens=True,
-    truncation=True,
-    padding=True,
-    return_tensors="pt",
-)
-outputs = model(**inputs)
-probs = outputs.logits_per_image.softmax(dim=-1)
-```
-
-## Aimv2Config
-
-[[autodoc]] Aimv2Config
-
-## Aimv2TextConfig
-
-[[autodoc]] Aimv2TextConfig
-
-## Aimv2VisionConfig
-
-[[autodoc]] Aimv2VisionConfig
-
-## Aimv2Model
-
-[[autodoc]] Aimv2Model
-    - forward
-
-## Aimv2VisionModel
-
-[[autodoc]] Aimv2VisionModel
-    - forward
-
-## Aimv2TextModel
-
-[[autodoc]] Aimv2TextModel
-    - forward
-
-</pt>
-<tf>
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -350,10 +350,6 @@ The following auto classes are available for the following audio tasks.

 [[autodoc]] AutoModelForTextToWaveform

-### AutoModelForAudioTokenization
-
-[[autodoc]] AutoModelForAudioTokenization
-
 ## Multimodal

 The following auto classes are available for the following multimodal tasks.
--- a/docs/source/en/model_doc/bigbird_pegasus.md
+++ b/docs/source/en/model_doc/bigbird_pegasus.md
@ -14,123 +14,59 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
 # BigBirdPegasus

-[BigBirdPegasus](https://huggingface.co/papers/2007.14062) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [BigBird](./big_bird) architecture with an additional pretraining objective borrowed from [Pegasus](./pegasus) called gap sequence generation (GSG). Whole sentences are masked and the model has to fill in the gaps in the document. BigBirdPegasus's ability to keep track of long contexts makes it effective at summarizing lengthy inputs, surpassing the performance of base Pegasus models.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-You can find all the original BigBirdPegasus checkpoints under the [Google](https://huggingface.co/google/models?search=bigbird-pegasus) organization.
+## Overview

-> [!TIP]
-> This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta).
->
-> Click on the BigBirdPegasus models in the right sidebar for more examples of how to apply BigBirdPegasus to different language tasks.
+The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://huggingface.co/papers/2007.14062) by
+Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
+Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
+based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
+attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
+has been shown that applying sparse, global, and random attention approximates full attention, while being
+computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
+BigBird has shown improved performance on various long document NLP tasks, such as question answering and
+summarization, compared to BERT or RoBERTa.

-The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
+Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
+length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
+reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
+is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
+theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
+sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
+8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
+BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
+propose novel applications to genomics data.*

-```py
-import torch
-from transformers import pipeline
+The original code can be found [here](https://github.com/google-research/bigbird).

-pipeline = pipeline(
-    task="summarization",
-    model="google/bigbird-pegasus-large-arxiv",
-    torch_dtype=torch.float32,
-    device=0
-)
-pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
-```
-</hfoption>
-<hfoption id="AutoModel">
+## Usage tips

-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "google/bigbird-pegasus-large-arxiv"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/bigbird-pegasus-large-arxiv",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
-
-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-<hfoption id="transformers-cli">
-
-```bash
-echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/bigbird-pegasus-large-arxiv --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```py
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/bigbird-pegasus-large-arxiv",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "google/bigbird-pegasus-large-arxiv"
-)
-
-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Notes
-
- BigBirdPegasus also uses the [`PegasusTokenizer`].
- Inputs should be padded on the right because BigBird uses absolute position embeddings.
- BigBirdPegasus supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs.
- The current implementation uses window size of 3 blocks and 2 global blocks, only supports the ITC-implementation, and doesn't support `num_random_blocks=0`.
- The sequence length must be divisible by the block size.
+- For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird).
+- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
+  **original_full** is advised as there is no benefit in using **block_sparse** attention.
+- The code currently uses window size of 3 blocks and 2 global blocks.
+- Sequence length must be divisible by block size.
+- Current implementation supports only **ITC**.
+- Current implementation doesn't support **num_random_blocks = 0**.
+- BigBirdPegasus uses the [PegasusTokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/pegasus/tokenization_pegasus.py).
+- BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.

 ## Resources

-Read the [Understanding BigBird's Block Sparse Attention](https://huggingface.co/blog/big-bird) blog post for more details about how BigBird's attention works.
+- [Text classification task guide](../tasks/sequence_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Causal language modeling task guide](../tasks/language_modeling)
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

 ## BigBirdPegasusConfig

--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@ -191,11 +191,6 @@ model = ChameleonForConditionalGeneration.from_pretrained(
 [[autodoc]] ChameleonImageProcessor
    - preprocess

-## ChameleonImageProcessorFast
-
-[[autodoc]] ChameleonImageProcessorFast
-    - preprocess
-
 ## ChameleonVQVAE

 [[autodoc]] ChameleonVQVAE
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -3,7 +3,6 @@
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@ -4,7 +4,6 @@
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/deberta-v2.md
+++ b/docs/source/en/model_doc/deberta-v2.md
@ -14,111 +14,66 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-    </div>
-</div>
-
-
 # DeBERTa-v2

-[DeBERTa-v2](https://huggingface.co/papers/2006.03654) improves on the original [DeBERTa](./deberta) architecture by using a SentencePiece-based tokenizer and a new vocabulary size of 128K. It also adds an additional convolutional layer within the first transformer layer to better learn local dependencies of input tokens. Finally, the position projection and content projection matrices are shared in the attention layer to reduce the number of parameters.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+</div>

-You can find all the original [DeBERTa-v2] checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=deberta-v2) organization.
+## Overview
+
+The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's
+BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
+
+It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
+RoBERTa.
+
+The abstract from the paper is the following:
+
+*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
+language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
+disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
+disentangled attention mechanism, where each word is represented using two vectors that encode its content and
+position, respectively, and the attention weights among words are computed using disentangled matrices on their
+contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
+predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
+of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
+the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
+pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*


-> [!TIP]
-> This model was contributed by [Pengcheng He](https://huggingface.co/DeBERTa).
->
-> Click on the DeBERTa-v2 models in the right sidebar for more examples of how to apply DeBERTa-v2 to different language tasks.
+The following information is visible directly on the [original implementation
+repository](https://github.com/microsoft/DeBERTa). DeBERTa v2 is the second version of the DeBERTa model. It includes
+the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
+find more details about this submission in the authors'
+[blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)

-The example below demonstrates how to classify text with [`Pipeline`] or the [`AutoModel`] class.
+New in v2:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
+  Instead of a GPT2-based tokenizer, the tokenizer is now
+  [sentencepiece-based](https://github.com/google/sentencepiece) tokenizer.
+- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
+  transformer layer to better learn the local dependency of input tokens.
+- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
+  experiments, this can save parameters without affecting the performance.
+- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
+  similar to T5.
+- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
+  performance of downstream tasks.

-```py
-import torch
-from transformers import pipeline
+This model was contributed by [DeBERTa](https://huggingface.co/DeBERTa). This model TF 2.0 implementation was
+contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/DeBERTa).

-pipeline = pipeline(
-    task="text-classification",
-    model="microsoft/deberta-v2-xlarge-mnli",
-    device=0,
-    torch_dtype=torch.float16
-)
-result = pipeline("DeBERTa-v2 is great at understanding context!")
-print(result)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForSequenceClassification
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "microsoft/deberta-v2-xlarge-mnli"
-)
-model = AutoModelForSequenceClassification.from_pretrained(
-    "microsoft/deberta-v2-xlarge-mnli",
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-
-inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
-outputs = model(**inputs)
-
-logits = outputs.logits
-predicted_class_id = logits.argmax().item()
-predicted_label = model.config.id2label[predicted_class_id]
-print(f"Predicted label: {predicted_label}")
-
-```
-
-</hfoption>
-
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "DeBERTa-v2 is great at understanding context!" | transformers-cli run --task fill-mask --model microsoft/deberta-v2-xlarge-mnli --device 0
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes quantization](../quantization/bitsandbytes) to only quantize the weights to 4-bit.
-
-```py
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
-
-model_id = "microsoft/deberta-v2-xlarge-mnli"
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype="float16",
-    bnb_4bit_use_double_quant=True,
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForSequenceClassification.from_pretrained(
-    model_id,
-    quantization_config=quantization_config,
-    torch_dtype="float16"
-)
-
-inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
-outputs = model(**inputs)
-logits = outputs.logits
-predicted_class_id = logits.argmax().item()
-predicted_label = model.config.id2label[predicted_class_id]
-print(f"Predicted label: {predicted_label}")
-
-```
+## Resources

+- [Text classification task guide](../tasks/sequence_classification)
+- [Token classification task guide](../tasks/token_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Masked language modeling task guide](../tasks/masked_language_modeling)
+- [Multiple choice task guide](../tasks/multiple_choice)

 ## DebertaV2Config

--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -1,162 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Dia
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-## Overview
-
-Dia is an opensource text-to-speech (TTS) model (1.6B parameters) developed by [Nari Labs](https://huggingface.co/nari-labs).
-It can generate highly realistic dialogue from transcript including nonverbal communications such as laughter and coughing.
-Furthermore, emotion and tone control is also possible via audio conditioning (voice cloning).
-
-**Model Architecture:**
-Dia is an encoder-decoder transformer based on the original transformer architecture. However, some more modern features such as
-rotational positional embeddings (RoPE) are also included. For its text portion (encoder), a byte tokenizer is utilized while
-for the audio portion (decoder), a pretrained codec model [DAC](./dac.md) is used - DAC encodes speech into discrete codebook
-tokens and decodes them back into audio.
-
-## Usage Tips
-
-### Generation with Text
-
-```python
-from transformers import AutoProcessor, DiaForConditionalGeneration
-
-torch_device = "cuda"
-model_checkpoint = "nari-labs/Dia-1.6B-0626"
-
-text = ["[S1] Dia is an open weights text to dialogue model."]
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-inputs = processor(text=text, padding=True, return_tensors="pt").to(torch_device)
-
-model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
-outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to around ~2s
-
-# save audio to a file
-outputs = processor.batch_decode(outputs)
-processor.save_audio(outputs, "example.wav")
-
-```
-
-### Generation with Text and Audio (Voice Cloning)
-
-```python
-from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
-
-torch_device = "cuda"
-model_checkpoint = "nari-labs/Dia-1.6B-0626"
-
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-ds = ds.cast_column("audio", Audio(sampling_rate=44100))
-audio = ds[-1]["audio"]["array"]
-# text is a transcript of the audio + additional text you want as new audio
-text = ["[S1] I know. It's going to save me a lot of money, I hope. [S2] I sure hope so for you."]
-
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-inputs = processor(text=text, audio=audio, padding=True, return_tensors="pt").to(torch_device)
-prompt_len = processor.get_audio_prompt_len(inputs["decoder_attention_mask"])
-
-model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
-outputs = model.generate(**inputs, max_new_tokens=256)  # corresponds to around ~2s
-
-# retrieve actually generated audio and save to a file
-outputs = processor.batch_decode(outputs, audio_prompt_len=prompt_len)
-processor.save_audio(outputs, "example_with_audio.wav")
-```
-
-### Training
-
-```python
-from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
-
-torch_device = "cuda"
-model_checkpoint = "nari-labs/Dia-1.6B-0626"
-
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-ds = ds.cast_column("audio", Audio(sampling_rate=44100))
-audio = ds[-1]["audio"]["array"]
-# text is a transcript of the audio
-text = ["[S1] I know. It's going to save me a lot of money, I hope."]
-
-processor = AutoProcessor.from_pretrained(model_checkpoint)
-inputs = processor(
-    text=text,
-    audio=audio,
-    generation=False,
-    output_labels=True,
-    padding=True,
-    return_tensors="pt"
-).to(torch_device)
-
-model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(torch_device)
-out = model(**inputs)
-out.loss.backward()
-```
-
-
-This model was contributed by [Jaeyong Sung](https://huggingface.co/buttercrab), [Arthur Zucker](https://huggingface.co/ArthurZ),
-and [Anton Vlasjuk](https://huggingface.co/AntonV). The original code can be found [here](https://github.com/nari-labs/dia/).
-
-
-## DiaConfig
-
-[[autodoc]] DiaConfig
-
-## DiaDecoderConfig
-
-[[autodoc]] DiaDecoderConfig
-
-## DiaEncoderConfig
-
-[[autodoc]] DiaEncoderConfig
-
-## DiaTokenizer
-
-[[autodoc]] DiaTokenizer
-    - __call__
-
-## DiaFeatureExtractor
-
-[[autodoc]] DiaFeatureExtractor
-    - __call__
-
-## DiaProcessor
-
-[[autodoc]] DiaProcessor
-    - __call__
-    - batch_decode
-    - decode
-
-## DiaModel
-
-[[autodoc]] DiaModel
-    - forward
-
-## DiaForConditionalGeneration
-
-[[autodoc]] DiaForConditionalGeneration
-    - forward
-    - generate
--- a/docs/source/en/model_doc/doge.md
+++ b/docs/source/en/model_doc/doge.md
@ -1,103 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Doge
-
-
-## Overview
-
-Doge is a series of small language models based on the [Doge](https://github.com/SmallDoges/small-doge) architecture, aiming to combine the advantages of state-space and self-attention algorithms, calculate dynamic masks from cached value states using the zero-order hold method, and solve the problem of existing mainstream language models getting lost in context. It uses the `wsd_scheduler` scheduler to pre-train on the `smollm-corpus`, and can continue training on new datasets or add sparse activation feedforward networks from stable stage checkpoints.
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/refs%2Fpr%2F426/transformers/model_doc/doge_architecture.png" alt="drawing" width="600"/>
-
-As shown in the figure below, the sequence transformation part of the Doge architecture uses `Dynamic Mask Attention`, which can be understood as using self-attention related to value states during training, and using state-space without past state decay during inference, to solve the problem of existing Transformers or SSMs getting lost in long text. The state transformation part of Doge uses `Cross Domain Mixture of Experts`, which consists of dense linear layers and sparse embedding layers, and can additionally increase sparse parameters to continue training from dense weight checkpoints without retraining the entire model, thereby reducing the cost of continuous iteration of the model. In addition, Doge also uses `RMSNorm` and `Residual` with learnable parameters to adapt the gradient range of deep models.
-
-Checkout all Doge model checkpoints [here](https://huggingface.co/collections/SmallDoge/doge-slm-679cc991f027c4a3abbded4a).
-
-
-## Usage
-
-<details>
-<summary>Using Doge-Base for text generation</summary>
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M")
-model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M")
-inputs = tokenizer("Hey how are you doing?", return_tensors="pt")
-
-outputs = model.generate(**inputs, max_new_tokens=100)
-print(tokenizer.batch_decode(outputs))
-```
-</details>
-
-<details>
-<summary>Using Doge-Instruct for question answering</summary>
-
-```python
-from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextStreamer
-
-tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M-Instruct")
-model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M-Instruct")
-
-generation_config = GenerationConfig(
-      max_new_tokens=100, 
-      use_cache=True, 
-      do_sample=True, 
-      temperature=0.8, 
-      top_p=0.9,
-      repetition_penalty=1.0
-)
-steamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)
-
-prompt = "Hi, how are you doing today?"
-conversation = [
-      {"role": "user", "content": prompt}
-]
-inputs = tokenizer.apply_chat_template(
-    conversation=conversation,
-    tokenize=True,
-    return_tensors="pt",
-)
-
-outputs = model.generate(
-    inputs, 
-    tokenizer=tokenizer,
-    generation_config=generation_config, 
-    streamer=steamer
-)
-```
-</details>
-
-## DogeConfig
-
-[[autodoc]] DogeConfig
-
-## DogeModel
-
-[[autodoc]] DogeModel
-    - forward
-
-## DogeForCausalLM
-
-[[autodoc]] DogeForCausalLM
-    - forward
-
-## DogeForSequenceClassification
-
-[[autodoc]] DogeForSequenceClassification
-    - forward
--- a/docs/source/en/model_doc/eomt.md
+++ b/docs/source/en/model_doc/eomt.md
@ -1,210 +0,0 @@
-<!--Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-->
-
-# EoMT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-## Overview
-
-The Encoder-only Mask Transformer (EoMT) model was introduced in the CVPR 2025 Highlight Paper [Your ViT is Secretly an Image Segmentation Model](https://www.tue-mps.org/eomt) by Tommie Kerssies, Niccolò Cavagnero, Alexander Hermans, Narges Norouzi, Giuseppe Averta, Bastian Leibe, Gijs Dubbelman, and Daan de Geus.
-EoMT reveals Vision Transformers can perform image segmentation efficiently without task-specific components.
-
-The abstract from the paper is the following:
-
-*Vision Transformers (ViTs) have shown remarkable performance and scalability across various computer vision tasks. To apply single-scale ViTs to image segmentation, existing methods adopt a convolutional adapter to generate multi-scale features, a pixel decoder to fuse these features, and a Transformer decoder that uses the fused features to make predictions. In this paper, we show that the inductive biases introduced by these task-specific components can instead be learned by the ViT itself, given sufficiently large models and extensive pre-training. Based on these findings, we introduce the Encoder-only Mask Transformer (EoMT), which repurposes the plain ViT architecture to conduct image segmentation. With large-scale models and pre-training, EoMT obtains a segmentation accuracy similar to state-of-the-art models that use task-specific components. At the same time, EoMT is significantly faster than these methods due to its architectural simplicity, e.g., up to 4x faster with ViT-L. Across a range of model sizes, EoMT demonstrates an optimal balance between segmentation accuracy and prediction speed, suggesting that compute resources are better spent on scaling the ViT itself rather than adding architectural complexity.*
-
-This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
-The original code can be found [here](https://github.com/tue-mps/eomt).
-
-## Architecture Info
-
-The `EoMT` model uses a DINOv2-pretrained Vision Transformer with **register tokens** as its backbone. EoMT simplifies the segmentation pipeline by relying solely on the encoder, eliminating the need for task-specific decoders commonly used in prior approaches.
-
-Architecturally, EoMT introduces a small set of **learned queries** and a lightweight **mask prediction module**. These queries are injected into the final encoder blocks, enabling **joint attention** between image patches and object queries. During training, **masked attention** is applied to constrain each query to focus on its corresponding region—effectively mimicking cross-attention. This constraint is gradually phased out via a **mask annealing strategy**, allowing for **efficient, decoder-free inference** without compromising segmentation performance.
-
-<div style="text-align: center;">
-  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/eomt_architecture.png"
-       alt="drawing" width="500"/>
-</div>
-
-
-The model supports semantic, instance, and panoptic segmentation using a unified architecture and task-specific post-processing.
-
-## Usage Examples
-
-Use the Hugging Face implementation of EoMT for inference with pre-trained models.
-
-### Semantic Segmentation
-
-The EoMT model performs semantic segmentation using sliding-window inference. The input image is resized such that the shorter side matches the target input size, then it is split into overlapping crops. Each crop is then passed through the model. After inference, the predicted logits from each crop are stitched back together and rescaled to the original image size to get the final segmentation mask.
-
-> **Note:**  
-> If you want to use a custom target size for **semantic segmentation**, specify it in the following format:  
-> `{"shortest_edge": 512}`  
-> Notice that `longest_edge` is not provided here — this is intentional. For semantic segmentation, images are typically **scaled so that the shortest edge is greater than or equal to the target size** hence longest_edge is not necessary.
-
-```python
-import matplotlib.pyplot as plt
-import requests
-import torch
-from PIL import Image
-
-from transformers import EomtForUniversalSegmentation, AutoImageProcessor
-
-
-model_id = "tue-mps/ade20k_semantic_eomt_large_512"
-processor = AutoImageProcessor.from_pretrained(model_id)
-model = EomtForUniversalSegmentation.from_pretrained(model_id)
-
-image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
-
-inputs = processor(
-    images=image,
-    return_tensors="pt",
-)
-
-with torch.inference_mode():
-    outputs = model(**inputs)
-
-# Prepare the original image size in the format (height, width)
-target_sizes = [(image.height, image.width)]
-
-# Post-process the model outputs to get final segmentation prediction
-preds = processor.post_process_semantic_segmentation(
-    outputs,
-    target_sizes=target_sizes,
-)
-
-# Visualize the segmentation mask
-plt.imshow(preds[0])
-plt.axis("off")
-plt.title("Semantic Segmentation")
-plt.show()
-```
-
-### Instance Segmentation
-
-The EoMT model performs instance segmentation using padded inference. The input image is resized so that the longer side matches the target input size, and the shorter side is zero-padded to form a square. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified instance segmentation map, along with segment metadata like segment id, class labels and confidence scores.
-
-> **Note:**  
-> To use a custom target size, specify the size as a dictionary in the following format:  
-> `{"shortest_edge": 512, "longest_edge": 512}`  
-> For both instance and panoptic segmentation, input images will be **scaled and padded** to this target size.
-
-```python
-import matplotlib.pyplot as plt
-import requests
-import torch
-from PIL import Image
-
-from transformers import EomtForUniversalSegmentation, AutoImageProcessor
-
-
-model_id = "tue-mps/coco_instance_eomt_large_640"
-processor = AutoImageProcessor.from_pretrained(model_id)
-model = EomtForUniversalSegmentation.from_pretrained(model_id)
-
-image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
-
-inputs = processor(
-    images=image,
-    return_tensors="pt",
-)
-
-with torch.inference_mode():
-    outputs = model(**inputs)
-
-# Prepare the original image size in the format (height, width)
-target_sizes = [(image.height, image.width)]
-
-# Post-process the model outputs to get final segmentation prediction
-preds = processor.post_process_instance_segmentation(
-    outputs,
-    target_sizes=target_sizes,
-)
-
-# Visualize the segmentation mask
-plt.imshow(preds[0]["segmentation"])
-plt.axis("off")
-plt.title("Instance Segmentation")
-plt.show()
-```
-
-### Panoptic Segmentation
-
-The EoMT model performs panoptic segmentation using the same padded inference strategy as in instance segmentation. After padding and normalization, the model predicts both thing (instances) and stuff (amorphous regions) classes. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified panoptic segmentation map, along with segment metadata like segment id, class labels and confidence scores.
-
-```python
-import matplotlib.pyplot as plt
-import requests
-import torch
-from PIL import Image
-
-from transformers import EomtForUniversalSegmentation, AutoImageProcessor
-
-
-model_id = "tue-mps/coco_panoptic_eomt_large_640"
-processor = AutoImageProcessor.from_pretrained(model_id)
-model = EomtForUniversalSegmentation.from_pretrained(model_id)
-
-image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
-
-inputs = processor(
-    images=image,
-    return_tensors="pt",
-)
-
-with torch.inference_mode():
-    outputs = model(**inputs)
-
-# Prepare the original image size in the format (height, width)
-target_sizes = [(image.height, image.width)]
-
-# Post-process the model outputs to get final segmentation prediction
-preds = processor.post_process_panoptic_segmentation(
-    outputs,
-    target_sizes=target_sizes,
-)
-
-# Visualize the panoptic segmentation mask
-plt.imshow(preds[0]["segmentation"])
-plt.axis("off")
-plt.title("Panoptic Segmentation")
-plt.show()
-```
-
-## EomtImageProcessor
-
-[[autodoc]] EomtImageProcessor
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## EomtImageProcessorFast
-
-[[autodoc]] EomtImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-    - post_process_instance_segmentation
-    - post_process_panoptic_segmentation
-
-## EomtConfig
-
-[[autodoc]] EomtConfig
-
-## EomtForUniversalSegmentation
-
-[[autodoc]] EomtForUniversalSegmentation
-    - forward
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -23,7 +23,6 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -22,7 +22,6 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -1,205 +0,0 @@
-
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# Gemma3n
-
-## Overview
-
-Gemma3n is a multimodal model with pretrained and instruction-tuned variants, available in E4B and E2B sizes. While
-large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
-this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
-[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
-a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
-every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
-[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
-trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.
-
-The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.
-
-You can find all the original Gemma 3n checkpoints under the [Gemma 3n][gemma3n-collection] release.
-
-> [!TIP]
-> Click on the Gemma 3n models in the right sidebar for more examples of how to apply Gemma to different vision, audio,
-> and language tasks.
-
-The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```py
-import torch
-from transformers import pipeline
-
-pipeline = pipeline(
-    task="image-text-to-text",
-    model="google/gemma-3n-e4b",
-    device=0,
-    torch_dtype=torch.bfloat16
-)
-pipeline(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
-    text="<start_of_image> What is shown in this image?"
-)
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```py
-import torch
-from transformers import AutoProcessor, Gemma3nForConditionalGeneration
-
-model = Gemma3nForConditionalGeneration.from_pretrained(
-    "google/gemma-3n-e4b-it",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-processor = AutoProcessor.from_pretrained(
-    "google/gemma-3n-e4b-it",
-    padding_side="left"
-)
-
-messages = [
-    {
-        "role": "system",
-        "content": [
-            {"type": "text", "text": "You are a helpful assistant."}
-        ]
-    },
-    {
-        "role": "user", "content": [
-            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
-            {"type": "text", "text": "What is shown in this image?"},
-        ]
-    },
-]
-inputs = processor.apply_chat_template(
-    messages,
-    tokenize=True,
-    return_dict=True,
-    return_tensors="pt",
-    add_generation_prompt=True,
-).to("cuda")
-
-output = model.generate(**inputs, max_new_tokens=50, cache_implementation="static")
-print(processor.decode(output[0], skip_special_tokens=True))
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3n-e2b --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-## Notes
-
-   Use [`Gemma3nForConditionalGeneration`] for image-audio-and-text, image-and-text, image-and-audio, audio-and-text,
-    image-only and audio-only inputs.
-   Gemma 3n supports multiple images per input, but make sure the images are correctly batched before passing them to
-    the processor. Each batch should be a list of one or more images.
-
-    ```py
-    url_cow = "https://media.istockphoto.com/id/1192867753/photo/cow-in-berchida-beach-siniscola.jpg?s=612x612&w=0&k=20&c=v0hjjniwsMNfJSuKWZuIn8pssmD5h5bSN1peBd1CmH4="
-    url_cat = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-
-    messages =[
-        {
-            "role": "system",
-            "content": [
-                {"type": "text", "text": "You are a helpful assistant."}
-            ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "url": url_cow},
-                {"type": "image", "url": url_cat},
-                {"type": "text", "text": "Which image is cuter?"},
-            ]
-        },
-    ]
-    ```
-   Text passed to the processor should have a `<image_soft_token>` token wherever an image should be inserted.
-   Gemma 3n accept at most one target audio clip per input, though multiple audio clips can be provided in few-shot
-    prompts, for example.
-   Text passed to the processor should have a `<audio_soft_token>` token wherever an audio clip should be inserted.
-   The processor has its own [`~ProcessorMixin.apply_chat_template`] method to convert chat messages to model inputs.
-
-## Gemma3nAudioFeatureExtractor
-
-[[autodoc]] Gemma3nAudioFeatureExtractor
-
-## Gemma3nProcessor
-
-[[autodoc]] Gemma3nProcessor
-
-## Gemma3nTextConfig
-
-[[autodoc]] Gemma3nTextConfig
-
-## Gemma3nVisionConfig
-
-[[autodoc]] Gemma3nVisionConfig
-
-## Gemma3nAudioConfig
-
-[[autodoc]] Gemma3nAudioConfig
-
-## Gemma3nConfig
-
-[[autodoc]] Gemma3nConfig
-
-## Gemma3nTextModel
-
-[[autodoc]] Gemma3nTextModel
-    - forward
-
-## Gemma3nModel
-
-[[autodoc]] Gemma3nModel
-    - forward
-
-## Gemma3nForCausalLM
-
-[[autodoc]] Gemma3nForCausalLM
-    - forward
-
-## Gemma3nForConditionalGeneration
-
-[[autodoc]] Gemma3nForConditionalGeneration
-    - forward
-
-[altup]: https://proceedings.neurips.cc/paper_files/paper/2023/hash/f2059277ac6ce66e7e5543001afa8bb5-Abstract-Conference.html
-[attention-mask-viz]: https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139
-[gemma3n-collection]: https://huggingface.co/collections/google/gemma-3n
-[laurel]: https://arxiv.org/abs/2411.07501
-[matformer]: https://arxiv.org/abs/2310.07707
-[spark-transformer]: https://arxiv.org/abs/2506.06644
-[usm]: https://arxiv.org/abs/2303.01037
--- a/docs/source/en/model_doc/glm.md
+++ b/docs/source/en/model_doc/glm.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/glm4.md
+++ b/docs/source/en/model_doc/glm4.md
@ -18,37 +18,7 @@ rendered properly in your Markdown viewer.

 ## Overview

-The GLM family welcomes new members [GLM-4-0414](https://arxiv.org/pdf/2406.12793) series models.
-
-The **GLM-4-32B-0414** series models, featuring 32 billion parameters. Its performance is comparable to OpenAI’s GPT
-series and DeepSeek’s V3/R1 series. It also supports very user-friendly local deployment features. GLM-4-32B-Base-0414
-was pre-trained on 15T of high-quality data, including substantial reasoning-type synthetic data. This lays the
-foundation for subsequent reinforcement learning extensions. In the post-training stage, we employed human preference
-alignment for dialogue scenarios. Additionally, using techniques like rejection sampling and reinforcement learning, we
-enhanced the model’s performance in instruction following, engineering code, and function calling, thus strengthening
-the atomic capabilities required for agent tasks. GLM-4-32B-0414 achieves good results in engineering code, Artifact
-generation, function calling, search-based Q&A, and report generation. In particular, on several benchmarks, such as
-code generation or specific Q&A tasks, GLM-4-32B-Base-0414 achieves comparable performance with those larger models like
-GPT-4o and DeepSeek-V3-0324 (671B).
-
-**GLM-Z1-32B-0414** is a reasoning model with deep thinking capabilities. This was developed based on GLM-4-32B-0414
-through cold start, extended reinforcement learning, and further training on tasks including mathematics, code, and
-logic. Compared to the base model, GLM-Z1-32B-0414 significantly improves mathematical abilities and the capability to
-solve complex tasks. During training, we also introduced general reinforcement learning based on pairwise ranking
-feedback, which enhances the model's general capabilities.
-
-**GLM-Z1-Rumination-32B-0414** is a deep reasoning model with rumination capabilities (against OpenAI's Deep Research).
-Unlike typical deep thinking models, the rumination model is capable of deeper and longer thinking to solve more
-open-ended and complex problems (e.g., writing a comparative analysis of AI development in two cities and their future
-development plans). Z1-Rumination is trained through scaling end-to-end reinforcement learning with responses graded by
-the ground truth answers or rubrics and can make use of search tools during its deep thinking process to handle complex
-tasks. The model shows significant improvements in research-style writing and complex tasks.
-
-Finally, **GLM-Z1-9B-0414** is a surprise. We employed all the aforementioned techniques to train a small model (9B).
-GLM-Z1-9B-0414 exhibits excellent capabilities in mathematical reasoning and general tasks. Its overall performance is
-top-ranked among all open-source models of the same size. Especially in resource-constrained scenarios, this model
-achieves an excellent balance between efficiency and effectiveness, providing a powerful option for users seeking
-lightweight deployment.
+To be released with the official model launch.

 ## Glm4Config

--- a/docs/source/en/model_doc/glm4v.md
+++ b/docs/source/en/model_doc/glm4v.md
@ -23,29 +23,6 @@ rendered properly in your Markdown viewer.

 # GLM-4.1V

-## Overview
-
-**GLM-4.1V-9B-Thinking** is a bilingual vision-language model optimized for reasoning, built on GLM-4-9B. It introduces
-a "thinking paradigm" with reinforcement learning, achieving state-of-the-art results among 10B-class models and
-rivaling 72B-scale models. It supports 64k context, 4K resolution, and arbitrary aspect ratios, with an open-source base
-model for further research. You can check our paper [here](https://huggingface.co/papers/2507.01006). and below is a abstract.
-
-*We present GLM-4.1V-Thinking, a vision-language model (VLM) designed to advance general-purpose multimodal understanding
-and reasoning. In this report, we share our key findings in the development of the reasoning-centric training framework.
-We first develop a capable vision foundation model with significant potential through large-scale pre-training, which
-arguably sets the upper bound for the final performance. We then propose Reinforcement Learning with Curriculum
-Sampling (RLCS) to unlock the full potential of the model, leading to comprehensive capability enhancement across a
-diverse range of tasks, including STEM problem solving, video understanding, content recognition, coding, grounding,
-GUI-based agents, and long document understanding. We open-source GLM-4.1V-9B-Thinking, which achieves state-of-the-art
-performance among models of comparable size. In a comprehensive evaluation across 28 public benchmarks, our model
-outperforms Qwen2.5-VL-7B on nearly all tasks and achieves comparable or even superior performance on 18 benchmarks
-relative to the significantly larger Qwen2.5-VL-72B. Notably, GLM-4.1V-9B-Thinking also demonstrates competitive or
-superior performance compared to closed-source models such as GPT-4o on challenging tasks including long document
-understanding and STEM reasoning, further underscoring its strong capabilities. Code, models and more information
-are released at https://github.com/THUDM/GLM-4.1V-Thinking.*
-
-## Usage
-
 The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

 <hfoptions id="usage">
--- a/docs/source/en/model_doc/granite.md
+++ b/docs/source/en/model_doc/granite.md
@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Granite
--- a/docs/source/en/model_doc/led.md
+++ b/docs/source/en/model_doc/led.md
@ -14,135 +14,62 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-            <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-    </div>
-</div>
-
 # LED

-[Longformer-Encoder-Decoder (LED)](https://huggingface.co/papers/2004.05150) is an encoder-decoder  transformer model for sequence-to-sequence tasks like summarization. It extends [Longformer](.longformer), an encoder-only model designed to handle long inputs, by adding a decoder layer. The decoder uses full self-attention on the encoded tokens and previously decoded locations. Because of Longformer's linear self-attention mechanism, LED is more efficient than standard encoder-decoder models when processing long sequences.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+</div>

-You can find all the original [LED] checkpoints under the [Ai2](https://huggingface.co/allenai/models?search=led) organization.
+## Overview

-> [!TIP]
-> This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
->
-> Click on the LED models in the right sidebar for more examples of how to apply LED to different language tasks.
+The LED model was proposed in [Longformer: The Long-Document Transformer](https://huggingface.co/papers/2004.05150) by Iz
+Beltagy, Matthew E. Peters, Arman Cohan.

-The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.
+The abstract from the paper is the following:

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
+quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
+mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
+longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
+windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
+evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
+contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
+pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
+WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting
+long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization
+dataset.*

-```python
-import torch
-from transformers import pipeline
+## Usage tips

-pipeline = pipeline(
-    task="summarization",
-    model="allenai/led-base-16384",
-    torch_dtype=torch.float16,
-    device=0
-)
-pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
-```
+- [`LEDForConditionalGeneration`] is an extension of
+  [`BartForConditionalGeneration`] exchanging the traditional *self-attention* layer with
+  *Longformer*'s *chunked self-attention* layer. [`LEDTokenizer`] is an alias of
+  [`BartTokenizer`].
+- LED works very well on long-range *sequence-to-sequence* tasks where the `input_ids` largely exceed a length of
+  1024 tokens.
+- LED pads the `input_ids` to be a multiple of `config.attention_window` if required. Therefore a small speed-up is
+  gained, when [`LEDTokenizer`] is used with the `pad_to_multiple_of` argument.
+- LED makes use of *global attention* by means of the `global_attention_mask` (see
+  [`LongformerModel`]). For summarization, it is advised to put *global attention* only on the first
+  `<s>` token. For question answering, it is advised to put *global attention* on all tokens of the question.
+- To fine-tune LED on all 16384, *gradient checkpointing* can be enabled in case training leads to out-of-memory (OOM)
+  errors. This can be done by executing `model.gradient_checkpointing_enable()`. 
+ Moreover, the `use_cache=False`
+  flag can be used to disable the caching mechanism to save memory.
+- LED is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.

-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "allenai/led-base-16384"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "allenai/led-base-16384",
-    torch_dtype=torch.float16,
-    device_map="auto"
-)
-
-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# Place global attention on the first token
-global_attention_mask = torch.zeros_like(input_ids.input_ids).to("cuda")
-global_attention_mask[:, 0] = 1
-
-output = model.generate(**input_ids, global_attention_mask=global_attention_mask, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-</hfoption>
-<hfoption id="transformers-cli">
-
-```bash
-!echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model allenai/led-base-16384 --device 0
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "allenai/led-large-16384",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "allenai/led-large-16384"
-)
-
-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-# Place global attention on the first token
-global_attention_mask = torch.zeros_like(input_ids.input_ids).to("cuda")
-global_attention_mask[:, 0] = 1
-
-output = model.generate(**input_ids, global_attention_mask=global_attention_mask, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Notes
-
- [`LEDForConditionalGeneration`] is an extension of [`BartForConditionalGeneration`] exchanging the traditional self-attention layer with Longformer's chunked self-attention layer. [`LEDTokenizer`] is an alias of [`BartTokenizer`].
- LED pads the `input_ids` to be a multiple of `config.attention_window` if required. A small speedup is gained when [`LEDTokenizer`] is used with the `pad_to_multiple_of` argument.
- LED works best on long-range sequence-to-sequence tasks where the `input_ids` are significantly longer than 1024 tokens.
- LED uses global attention by means of the `global_attention_mask` (see [`LongformerModel`]). For summarization, it is advised to put global attention only on the first `<s>` token. For question answering, it is advised to put global attention on all tokens of the question.
- To fine-tune LED on all 16384 parameters, gradient checkpointing can be enabled in case training leads to out-of-memory (OOM) errors. Enable gradient checkpointing by adding `model.gradient_checkpointing_enable()` and setting `use_cache=False` to disable the caching mechanism to save memory.
- Inputs should be padded on the right because LED uses absolute position embeddings.
+This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).

 ## Resources

- Read the [LED on Arxiv notebook](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing) to see how LED can achieve state-of-the-art performance on Arxiv article summarization.
- Read the [Fine-tune LED notebook](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing) to learn how to fine-tune LED on PubMed articles.
+- [A notebook showing how to evaluate LED](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing).
+- [A notebook showing how to fine-tune LED](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing).
+- [Text classification task guide](../tasks/sequence_classification)
+- [Question answering task guide](../tasks/question_answering)
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

 ## LEDConfig

--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@ -21,7 +21,6 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
        ">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama3.md
+++ b/docs/source/en/model_doc/llama3.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ```py3
--- a/docs/source/en/model_doc/llama4.md
+++ b/docs/source/en/model_doc/llama4.md
@ -21,7 +21,6 @@ rendered properly in your Markdown viewer.
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -22,7 +22,6 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/mobilenet_v2.md
+++ b/docs/source/en/model_doc/mobilenet_v2.md
@ -114,7 +114,6 @@ print(f"The predicted class label is: {predicted_class_label}")

 [[autodoc]] MobileNetV2ImageProcessor
    - preprocess
-    - post_process_semantic_segmentation

 ## MobileNetV2ImageProcessorFast

--- a/docs/source/en/model_doc/mobilevit.md
+++ b/docs/source/en/model_doc/mobilevit.md
@ -95,12 +95,6 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

-## MobileViTImageProcessorFast
-
-[[autodoc]] MobileViTImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/nougat.md
+++ b/docs/source/en/model_doc/nougat.md
@ -107,11 +107,6 @@ The model is identical to [Donut](donut) in terms of architecture.
 [[autodoc]] NougatImageProcessor
    - preprocess

-## NougatImageProcessorFast
-
-[[autodoc]] NougatImageProcessorFast
-    - preprocess
-
 ## NougatTokenizerFast

 [[autodoc]] NougatTokenizerFast
--- a/docs/source/en/model_doc/olmo.md
+++ b/docs/source/en/model_doc/olmo.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/pegasus_x.md
+++ b/docs/source/en/model_doc/pegasus_x.md
@ -14,115 +14,35 @@ rendered properly in your Markdown viewer.

 -->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-    </div>
-</div>
-
 # PEGASUS-X

-[PEGASUS-X](https://huggingface.co/papers/2208.04347) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [Pegasus](./pegasus) model with staggered block-local attention, global encoder tokens, and additional pretraining on long text sequences, enabling it to handle inputs of up to 16,000 tokens. PEGASUS-X matches the performance of much larger models while using fewer parameters.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+</div>

-You can find all the original PEGASUS-X checkpoints under the [Google](https://huggingface.co/google/models?search=pegasus-x) organization.
+## Overview

-> [!TIP]
-> This model was contributed by [zphang](https://huggingface.co/zphang).
->
-> Click on the PEGASUS-X models in the right sidebar for more examples of how to apply PEGASUS-X to different language tasks.
+The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://huggingface.co/papers/2208.04347)  by Jason Phang, Yao Zhao and Peter J. Liu.

-The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.
+PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder.

-<hfoptions id="usage">
-<hfoption id="Pipeline">
+The abstract from the paper is the following:

-```py
-import torch
-from transformers import pipeline
+*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.*

-pipeline = pipeline(
-    task="summarization",
-    model="google/pegasus-x-large",
-    torch_dtype=torch.bfloat16,
-    device=0
-)
-pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
-```
-</hfoption>
-<hfoption id="AutoModel">
+This model was contributed by [zphang](https://huggingface.co/zphang). The original code can be found [here](https://github.com/google-research/pegasus).

-```py
-import torch
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+## Documentation resources

-tokenizer = AutoTokenizer.from_pretrained(
-    "google/pegasus-x-large"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/pegasus-x-large",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-)
+- [Translation task guide](../tasks/translation)
+- [Summarization task guide](../tasks/summarization)

-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+<Tip>

-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-<hfoption id="transformers-cli">
+PEGASUS-X uses the same tokenizer as [PEGASUS](pegasus).

-```bash
-echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/pegasus-x-large --device 0
-```
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
-
-```py
-import torch
-from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4"
-)
-model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/pegasus-x-large",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained(
-    "google/pegasus-x-large"
-)
-
-input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
-Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
-These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
-This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-output = model.generate(**input_ids, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-
-## Notes
-
- PEGASUS-X also uses the [`PegasusTokenizer`].
+</Tip>

 ## PegasusXConfig

--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/phi3.md
+++ b/docs/source/en/model_doc/phi3.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Qwen2MoE
--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/smollm3.md
+++ b/docs/source/en/model_doc/smollm3.md
@ -1,173 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
-
-# SmolLM3
-
-SmolLM3 is a fully open, compact language model designed for efficient deployment while maintaining strong performance. It uses a Transformer decoder architecture with Grouped Query Attention (GQA) to reduce the kv cache, and no RoPE, enabling improved performance on long-context tasks. It is trained using a multi-stage training approach on high-quality public datasets across web, code, and math domains. The model is multilingual and supports very large context lengths. The instruct variant is optimized for reasoning and tool use.
-
-> [!TIP]
-> Click on the SmolLM3 models in the right sidebar for more examples of how to apply SmolLM3 to different language tasks.
-
-The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line using the instruction-tuned models.
-
-<hfoptions id="usage">
-<hfoption id="Pipeline">
-
-```python
-import torch
-from transformers import pipeline
-
-pipe = pipeline(
-    task="text-generation",
-    model="HuggingFaceTB/SmolLM3-3B",
-    torch_dtype=torch.bfloat16,
-    device_map=0
-)
-
-messages = [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": "Tell me about yourself."},
-]
-outputs = pipe(messages, max_new_tokens=256, do_sample=True, temperature=0.7, top_k=50, top_p=0.95)
-print(outputs[0]["generated_text"][-1]['content'])
-```
-
-</hfoption>
-<hfoption id="AutoModel">
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM3-3B",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    attn_implementation="sdpa"
-)
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
-
-prompt = "Give me a short introduction to large language models."
-messages = [
-    {"role": "system", "content": "You are a helpful assistant."},
-    {"role": "user", "content": prompt}
-]
-text = tokenizer.apply_chat_template(
-    messages,
-    tokenize=False,
-    add_generation_prompt=True
-)
-model_inputs = tokenizer([text], return_tensors="pt").to("cuda")
-
-generated_ids = model.generate(
-    model_inputs.input_ids,
-    cache_implementation="static",
-    max_new_tokens=512,
-    do_sample=True,
-    temperature=0.7,
-    top_k=50,
-    top_p=0.95
-)
-generated_ids = [
-    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
-]
-
-response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
-print(response)
-```
-
-</hfoption>
-<hfoption id="transformers CLI">
-
-```bash
-# pip install -U flash-attn --no-build-isolation
-transformers chat HuggingFaceTB/SmolLM3-3B --torch_dtype auto --attn_implementation flash_attention_2 --device 0
-```
-
-</hfoption>
-</hfoptions>
-
-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
-The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize the weights to 4-bits.
-
-```python
-# pip install -U flash-attn --no-build-isolation
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
-)
-
-tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")
-model = AutoModelForCausalLM.from_pretrained(
-    "HuggingFaceTB/SmolLM3-3B",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2"
-)
-
-inputs = tokenizer("Gravity is the force", return_tensors="pt").to("cuda")
-outputs = model.generate(**inputs, max_new_tokens=100)
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-```
-
-
-## Notes
-
- Ensure your Transformers library version is up-to-date. SmolLM3 requires Transformers>=4.53.0 for full support.
-
-## SmolLM3Config
-
-[[autodoc]] SmolLM3Config
-
-## SmolLM3Model
-
-[[autodoc]] SmolLM3Model
-    - forward
-
-## SmolLM3ForCausalLM
-
-[[autodoc]] SmolLM3ForCausalLM
-    - forward
-
-## SmolLM3ForSequenceClassification
-
-[[autodoc]] SmolLM3ForSequenceClassification
-    - forward
-
-## SmolLM3ForTokenClassification
-
-[[autodoc]] SmolLM3ForTokenClassification
-    - forward
-
-## SmolLM3ForQuestionAnswering
-
-[[autodoc]] SmolLM3ForQuestionAnswering
-    - forward
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@ -61,16 +61,19 @@ predicted token ids.
 - Step-by-step Speech Translation

 ```python
+>>> import torch
 >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
 >>> from datasets import load_dataset
+>>> import soundfile as sf

 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")


->>> def map_to_array(example):
-...     example["speech"] = example["audio"]["array"]
-...     return example
+>>> def map_to_array(batch):
+...     speech, _ = sf.read(batch["file"])
+...     batch["speech"] = speech
+...     return batch


 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
--- a/docs/source/en/model_doc/starcoder2.md
+++ b/docs/source/en/model_doc/starcoder2.md
@ -20,7 +20,6 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/kyutai_speech_to_text.md
+++ b/docs/source/en/model_doc/kyutai_speech_to_text.md
@ -36,10 +36,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi

 # 1. load the model and the processor
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "kyutai/stt-2.6b-en-trfs"
+model_id = "kyutai/stt-2.6b-en"

 processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
-model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

 # 2. load audio samples
 ds = load_dataset(
@ -69,10 +69,10 @@ from transformers import KyutaiSpeechToTextProcessor, KyutaiSpeechToTextForCondi

 # 1. load the model and the processor
 torch_device = "cuda" if torch.cuda.is_available() else "cpu"
-model_id = "kyutai/stt-2.6b-en-trfs"
+model_id = "kyutai/stt-2.6b-en"

 processor = KyutaiSpeechToTextProcessor.from_pretrained(model_id)
-model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device, torch_dtype="auto")
+model = KyutaiSpeechToTextForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)

 # 2. load audio samples
 ds = load_dataset(
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@ -10,35 +10,48 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-->

-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
-    </div>
-</div>
+-->

 # SuperPoint

-[SuperPoint](https://huggingface.co/papers/1712.07629) is the result of self-supervised training of a fully-convolutional network for interest point detection and description. The model is able to detect interest points that are repeatable under homographic transformations and provide a descriptor for each point. Usage on it's own is limited, but it can be used as a feature extractor for other tasks such as homography estimation and image matching.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+The SuperPoint model was proposed
+in [SuperPoint: Self-Supervised Interest Point Detection and Description](https://huggingface.co/papers/1712.07629) by Daniel
+DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
+
+This model is the result of a self-supervised training of a fully-convolutional network for interest point detection and
+description. The model is able to detect interest points that are repeatable under homographic transformations and
+provide a descriptor for each point. The use of the model in its own is limited, but it can be used as a feature
+extractor for other tasks such as homography estimation, image matching, etc.
+
+The abstract from the paper is the following:
+
+*This paper presents a self-supervised framework for training interest point detectors and descriptors suitable for a
+large number of multiple-view geometry problems in computer vision. As opposed to patch-based neural networks, our
+fully-convolutional model operates on full-sized images and jointly computes pixel-level interest point locations and
+associated descriptors in one forward pass. We introduce Homographic Adaptation, a multi-scale, multi-homography
+approach for boosting interest point detection repeatability and performing cross-domain adaptation (e.g.,
+synthetic-to-real). Our model, when trained on the MS-COCO generic image dataset using Homographic Adaptation, is able
+to repeatedly detect a much richer set of interest points than the initial pre-adapted deep model and any other
+traditional corner detector. The final system gives rise to state-of-the-art homography estimation results on HPatches
+when compared to LIFT, SIFT and ORB.*

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/superpoint_architecture.png"
 alt="drawing" width="500"/>

-You can find all the original SuperPoint checkpoints under the [Magic Leap Community](https://huggingface.co/magic-leap-community) organization.
+<small> SuperPoint overview. Taken from the <a href="https://huggingface.co/papers/1712.07629v4">original paper.</a> </small>

-> [!TIP]
-> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
->
-> Click on the SuperPoint models in the right sidebar for more examples of how to apply SuperPoint to different computer vision tasks.
+## Usage tips

+Here is a quick example of using the model to detect interest points in an image:

-
-The example below demonstrates how to detect interest points in an image with the [`AutoModel`] class.
-<hfoptions id="usage">
-<hfoption id="AutoModel">
-
-```py
+```python
 from transformers import AutoImageProcessor, SuperPointForKeypointDetection
 import torch
 from PIL import Image
@ -51,76 +64,67 @@ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint"
 model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

 inputs = processor(image, return_tensors="pt")
-with torch.no_grad():
-    outputs = model(**inputs)
-
-# Post-process to get keypoints, scores, and descriptors
-image_size = (image.height, image.width)
-processed_outputs = processor.post_process_keypoint_detection(outputs, [image_size])
+outputs = model(**inputs)
 ```

-</hfoption>
-</hfoptions>
+The outputs contain the list of keypoint coordinates with their respective score and description (a 256-long vector).

-## Notes
+You can also feed multiple images to the model. Due to the nature of SuperPoint, to output a dynamic number of keypoints,
+you will need to use the mask attribute to retrieve the respective information :

- SuperPoint outputs a dynamic number of keypoints per image, which makes it suitable for tasks requiring variable-length feature representations.
+```python
+from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+import torch
+from PIL import Image
+import requests

-    ```py
-    from transformers import AutoImageProcessor, SuperPointForKeypointDetection
-    import torch
-    from PIL import Image
-    import requests
-    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
-    model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
-    url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
-    image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
-    url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
-    image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
-    images = [image_1, image_2]
-    inputs = processor(images, return_tensors="pt")
-    # Example of handling dynamic keypoint output
-    outputs = model(**inputs)
-    keypoints = outputs.keypoints  # Shape varies per image
-    scores = outputs.scores        # Confidence scores for each keypoint
-    descriptors = outputs.descriptors  # 256-dimensional descriptors
-    mask = outputs.mask # Value of 1 corresponds to a keypoint detection
-    ```
+url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
+image_2 = Image.open(requests.get(url_image_2, stream=True).raw)

- The model provides both keypoint coordinates and their corresponding descriptors (256-dimensional vectors) in a single forward pass.
- For batch processing with multiple images, you need to use the mask attribute to retrieve the respective information for each image. You can use the `post_process_keypoint_detection` from the `SuperPointImageProcessor` to retrieve the each image information.
+images = [image_1, image_2]

-    ```py
-    # Batch processing example
-    images = [image1, image2, image3]
-    inputs = processor(images, return_tensors="pt")
-    outputs = model(**inputs)
-    image_sizes = [(img.height, img.width) for img in images]
-    processed_outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
-    ```
+processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

- You can then print the keypoints on the image of your choice to visualize the result:
-    ```py
-    import matplotlib.pyplot as plt
-    plt.axis("off")
-    plt.imshow(image_1)
-    plt.scatter(
-        outputs[0]["keypoints"][:, 0],
-        outputs[0]["keypoints"][:, 1],
-        c=outputs[0]["scores"] * 100,
-        s=outputs[0]["scores"] * 50,
-        alpha=0.8
-    )
-    plt.savefig(f"output_image.png")
-    ```
+inputs = processor(images, return_tensors="pt")
+outputs = model(**inputs)
+image_sizes = [(image.height, image.width) for image in images]
+outputs = processor.post_process_keypoint_detection(outputs, image_sizes)

-<div class="flex justify-center">
-    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png">
-</div>
+for output in outputs:
+    for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]):
+        print(f"Keypoints: {keypoints}")
+        print(f"Scores: {scores}")
+        print(f"Descriptors: {descriptors}")
+```
+
+You can then print the keypoints on the image of your choice to visualize the result:
+```python
+import matplotlib.pyplot as plt
+
+plt.axis("off")
+plt.imshow(image_1)
+plt.scatter(
+    outputs[0]["keypoints"][:, 0],
+    outputs[0]["keypoints"][:, 1],
+    c=outputs[0]["scores"] * 100,
+    s=outputs[0]["scores"] * 50,
+    alpha=0.8
+)
+plt.savefig(f"output_image.png")
+```
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png)
+
+This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).

 ## Resources

- Refer to this [noteboook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb) for an inference and visualization example.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SuperPoint. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A notebook showcasing inference and visualization with SuperPoint can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb). 🌎

 ## SuperPointConfig

@ -133,12 +137,8 @@ processed_outputs = processor.post_process_keypoint_detection(outputs, [image_si
 - preprocess
 - post_process_keypoint_detection

-<frameworkcontent>
-<pt>
 ## SuperPointForKeypointDetection

 [[autodoc]] SuperPointForKeypointDetection

 - forward
-
-</pt>
--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@ -14,13 +14,7 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-<div style="float: right;">
-    <div class="flex flex-wrap space-x-1">
-        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-    </div>
-</div>
+

 # T5Gemma

@ -30,9 +24,6 @@ T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/

 The pretrained varaints are trained with two objectives: prefix language modeling with knowledge distillation (PrefixLM) and UL2, separately. We release both variants for each model size. The instruction-turned varaints was post-trained with supervised fine-tuning and reinforcement learning.

-> [!TIP]
-> Click on the T5Gemma models in the right sidebar for more examples of how to apply T5Gemma to different language tasks.
-
 The example below demonstrates how to chat with the model with [`Pipeline`] or the [`AutoModel`] class, and from the command line.

 <hfoptions id="usage">
@ -44,52 +35,43 @@ import torch
 from transformers import pipeline

 pipe = pipeline(
-    "text2text-generation",
-    model="google/t5gemma-2b-2b-prefixlm-it",
+    task="text2text-generation",
+    model="google/t5gemma-placeholder",
    torch_dtype=torch.bfloat16,
-    device="cuda",  # replace with "mps" to run on a Mac device
+    device="cuda",
 )

-messages = [
-    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
-]
-prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-pipe(prompt, max_new_tokens=32)
+pipe("Question: Why is the sky blue?\nAnswer:", max_new_tokens=50)
 ```

 </hfoption>
 <hfoption id="AutoModel">

 ```python
-# pip install accelerate
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

-tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
+tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-placeholder")
 model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/t5gemma-2b-2b-prefixlm-it",
-    device_map="auto",
+    "google/t5gemma-placeholder",
    torch_dtype=torch.bfloat16,
+    device_map="auto"
 )

-messages = [
-    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
-]
-input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True).to("cuda")
+input_text = "Question: Why is the sky blue?\nAnswer:"
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

 outputs = model.generate(**input_ids, max_new_tokens=32)
-print(tokenizer.decode(outputs[0]))
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
 ```

 </hfoption>
 <hfoption id="transformers CLI">

 ```
-echo -e "Write me a poem about Machine Learning. Answer:" | transformers run --task text2text-generation --model google/t5gemma-2b-2b-prefixlm --device 0
+echo -e "Question: Why is the sky blue? Answer:" | transformers run --task text2text-generation --model google/t5gemma-placeholder --device 0
 ```
-</hfoption>
-</hfoptions>

 ## T5GemmaConfig

--- a/docs/source/en/model_doc/vitpose.md
+++ b/docs/source/en/model_doc/vitpose.md
@ -10,39 +10,52 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-<div style="float: right;">
-  <div class="flex flex-wrap space-x-1">
-    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-  </div>
-</div>
-
 # ViTPose

-[ViTPose](https://huggingface.co/papers/2204.12484) is a vision transformer-based model for keypoint (pose) estimation. It uses a simple, non-hierarchical [ViT](./vit) backbone and a lightweight decoder head. This architecture simplifies model design, takes advantage of transformer scalability, and can be adapted to different training strategies.
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>

-[ViTPose++](https://huggingface.co/papers/2212.04246) improves on ViTPose by incorporating a mixture-of-experts (MoE) module in the backbone and using more diverse pretraining data.
+## Overview
+
+The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://huggingface.co/papers/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The model was further improved in [ViTPose++: Vision Transformer for Generic Body Pose Estimation](https://huggingface.co/papers/2212.04246) where the authors employ
+a mixture-of-experts (MoE) module in the ViT backbone along with pre-training on more data, which further enhances the performance.
+
+The abstract from the paper is the following:
+
+*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.*

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png"
 alt="drawing" width="600"/>

-You can find all ViTPose and ViTPose++ checkpoints under the [ViTPose collection](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335).
+<small> ViTPose architecture. Taken from the <a href="https://huggingface.co/papers/2204.12484">original paper.</a> </small>

-The example below demonstrates pose estimation with the [`VitPoseForPoseEstimation`] class.
+This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi).
+The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose).
+
+## Usage Tips
+
+ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints for each of them.

 ```py
 import torch
 import requests
 import numpy as np
-import supervision as sv
+
 from PIL import Image
+
 from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation

 device = "cuda" if torch.cuda.is_available() else "cpu"

-url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
+url = "http://images.cocodataset.org/val2017/000000000139.jpg"
 image = Image.open(requests.get(url, stream=True).raw)

-# Detect humans in the image
+# ------------------------------------------------------------------------
+# Stage 1. Detect humans on the image
+# ------------------------------------------------------------------------
+
+# You can choose any detector of your choice
 person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
 person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)

@ -54,7 +67,7 @@ with torch.no_grad():
 results = person_image_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
 )
-result = results[0]
+result = results[0]  # take first image results

 # Human label refers 0 index in COCO dataset
 person_boxes = result["boxes"][result["labels"] == 0]
@ -64,7 +77,10 @@ person_boxes = person_boxes.cpu().numpy()
 person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
 person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

-# Detect keypoints for each person found
+# ------------------------------------------------------------------------
+# Stage 2. Detect keypoints for each person found
+# ------------------------------------------------------------------------
+
 image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
 model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)

@ -74,7 +90,54 @@ with torch.no_grad():
    outputs = model(**inputs)

 pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-image_pose_result = pose_results[0]
+image_pose_result = pose_results[0]  # results for first image
+```
+
+### ViTPose++ models
+
+The best [checkpoints](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335) are those of the [ViTPose++ paper](https://huggingface.co/papers/2212.04246). ViTPose++ models employ a so-called [Mixture-of-Experts (MoE)](https://huggingface.co/blog/moe) architecture for the ViT backbone, resulting in better performance.
+
+The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
+An overview of the various dataset indices is provided below:
+
+- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
+- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
+- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
+- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
+- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
+- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
+
+Pass the `dataset_index` argument in the forward of the model to indicate which experts to use for each example in the batch. Example usage is shown below:
+
+```python
+image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
+model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
+
+inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
+
+dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
+
+with torch.no_grad():
+    outputs = model(**inputs, dataset_index=dataset_index)
+```
+
+The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
+An overview of the various dataset indices is provided below:
+
+- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
+- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
+- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
+- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
+- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
+- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
+
+
+### Visualization
+
+To visualize the various keypoints, one can either leverage the `supervision` [library](https://github.com/roboflow/supervision (requires `pip install supervision`):
+
+```python
+import supervision as sv

 xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy()
 scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy()
@ -99,192 +162,119 @@ annotated_frame = vertex_annotator.annotate(
    scene=annotated_frame,
    key_points=key_points
 )
-annotated_frame
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose.png"/>
-</div>
+Alternatively, one can also visualize the keypoints using [OpenCV](https://opencv.org/) (requires `pip install opencv-python`):

-Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+```python
+import math
+import cv2

-The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight):
+    if pose_keypoint_color is not None:
+        assert len(pose_keypoint_color) == len(keypoints)
+    for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
+        x_coord, y_coord = int(kpt[0]), int(kpt[1])
+        if kpt_score > keypoint_score_threshold:
+            color = tuple(int(c) for c in pose_keypoint_color[kid])
+            if show_keypoint_weight:
+                cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+                transparency = max(0, min(1, kpt_score))
+                cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
+            else:
+                cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)

-```py
-# pip install torchao
-import torch
-import requests
-import numpy as np
-from PIL import Image
-from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation, TorchAoConfig
-
-url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
-image = Image.open(requests.get(url, stream=True).raw)
-
-person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
-person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)
-
-inputs = person_image_processor(images=image, return_tensors="pt").to(device)
-
-with torch.no_grad():
-    outputs = person_model(**inputs)
-
-results = person_image_processor.post_process_object_detection(
-    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
-)
-result = results[0]
-
-person_boxes = result["boxes"][result["labels"] == 0]
-person_boxes = person_boxes.cpu().numpy()
-
-person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
-person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
-
-quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
-
-image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-huge")
-model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-huge", device_map=device, quantization_config=quantization_config)
-
-inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-
-with torch.no_grad():
-    outputs = model(**inputs)
-
-pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-image_pose_result = pose_results[0]
-```
-
-## Notes
-
- Use [`AutoProcessor`] to automatically prepare bounding box and image inputs.
- ViTPose is a top-down pose estimator. It uses a object detector to detect individuals first before keypoint prediction.
- ViTPose++ has 6 different MoE expert heads (COCO validation `0`, AiC `1`, MPII `2`, AP-10K `3`, APT-36K `4`, COCO-WholeBody `5`) which supports 6 different datasets. Pass a specific value corresponding to the dataset to the `dataset_index` to indicate which expert to use.
-
-    ```py
-    from transformers import AutoProcessor, VitPoseForPoseEstimation
-
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-
-    image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
-    model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
-
-    inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-    dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
-
-    with torch.no_grad():
-        outputs = model(**inputs, dataset_index=dataset_index)
-    ```
-
- [OpenCV](https://opencv.org/) is an alternative option for visualizing the estimated pose.
-
-    ```py
-    # pip install opencv-python
-    import math
-    import cv2
-
-    def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight):
-        if pose_keypoint_color is not None:
-            assert len(pose_keypoint_color) == len(keypoints)
-        for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
-            x_coord, y_coord = int(kpt[0]), int(kpt[1])
-            if kpt_score > keypoint_score_threshold:
-                color = tuple(int(c) for c in pose_keypoint_color[kid])
+def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2):
+    height, width, _ = image.shape
+    if keypoint_edges is not None and link_colors is not None:
+        assert len(link_colors) == len(keypoint_edges)
+        for sk_id, sk in enumerate(keypoint_edges):
+            x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]])
+            x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]])
+            if (
+                x1 > 0
+                and x1 < width
+                and y1 > 0
+                and y1 < height
+                and x2 > 0
+                and x2 < width
+                and y2 > 0
+                and y2 < height
+                and score1 > keypoint_score_threshold
+                and score2 > keypoint_score_threshold
+            ):
+                color = tuple(int(c) for c in link_colors[sk_id])
                if show_keypoint_weight:
-                    cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
-                    transparency = max(0, min(1, kpt_score))
+                    X = (x1, x2)
+                    Y = (y1, y2)
+                    mean_x = np.mean(X)
+                    mean_y = np.mean(Y)
+                    length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
+                    angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
+                    polygon = cv2.ellipse2Poly(
+                        (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1
+                    )
+                    cv2.fillConvexPoly(image, polygon, color)
+                    transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2])))
                    cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
                else:
-                    cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+                    cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)

-    def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2):
-        height, width, _ = image.shape
-        if keypoint_edges is not None and link_colors is not None:
-            assert len(link_colors) == len(keypoint_edges)
-            for sk_id, sk in enumerate(keypoint_edges):
-                x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]])
-                x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]])
-                if (
-                    x1 > 0
-                    and x1 < width
-                    and y1 > 0
-                    and y1 < height
-                    and x2 > 0
-                    and x2 < width
-                    and y2 > 0
-                    and y2 < height
-                    and score1 > keypoint_score_threshold
-                    and score2 > keypoint_score_threshold
-                ):
-                    color = tuple(int(c) for c in link_colors[sk_id])
-                    if show_keypoint_weight:
-                        X = (x1, x2)
-                        Y = (y1, y2)
-                        mean_x = np.mean(X)
-                        mean_y = np.mean(Y)
-                        length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
-                        angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
-                        polygon = cv2.ellipse2Poly(
-                            (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1
-                        )
-                        cv2.fillConvexPoly(image, polygon, color)
-                        transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2])))
-                        cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
-                    else:
-                        cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)

-    # Note: keypoint_edges and color palette are dataset-specific
-    keypoint_edges = model.config.edges
+# Note: keypoint_edges and color palette are dataset-specific
+keypoint_edges = model.config.edges

-    palette = np.array(
-        [
-            [255, 128, 0],
-            [255, 153, 51],
-            [255, 178, 102],
-            [230, 230, 0],
-            [255, 153, 255],
-            [153, 204, 255],
-            [255, 102, 255],
-            [255, 51, 255],
-            [102, 178, 255],
-            [51, 153, 255],
-            [255, 153, 153],
-            [255, 102, 102],
-            [255, 51, 51],
-            [153, 255, 153],
-            [102, 255, 102],
-            [51, 255, 51],
-            [0, 255, 0],
-            [0, 0, 255],
-            [255, 0, 0],
-            [255, 255, 255],
-        ]
-    )
+palette = np.array(
+    [
+        [255, 128, 0],
+        [255, 153, 51],
+        [255, 178, 102],
+        [230, 230, 0],
+        [255, 153, 255],
+        [153, 204, 255],
+        [255, 102, 255],
+        [255, 51, 255],
+        [102, 178, 255],
+        [51, 153, 255],
+        [255, 153, 153],
+        [255, 102, 102],
+        [255, 51, 51],
+        [153, 255, 153],
+        [102, 255, 102],
+        [51, 255, 51],
+        [0, 255, 0],
+        [0, 0, 255],
+        [255, 0, 0],
+        [255, 255, 255],
+    ]
+)

-    link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
-    keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]]
+link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
+keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]]

-    numpy_image = np.array(image)
+numpy_image = np.array(image)

-    for pose_result in image_pose_result:
-        scores = np.array(pose_result["scores"])
-        keypoints = np.array(pose_result["keypoints"])
+for pose_result in image_pose_result:
+    scores = np.array(pose_result["scores"])
+    keypoints = np.array(pose_result["keypoints"])

-        # draw each point on image
-        draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False)
+    # draw each point on image
+    draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False)

-        # draw links
-        draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False)
+    # draw links
+    draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False)

-    pose_image = Image.fromarray(numpy_image)
-    pose_image
-    ```
+pose_image = Image.fromarray(numpy_image)
+pose_image
+```
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-coco.jpg" alt="drawing" width="600"/>

 ## Resources

-Refer to resources below to learn more about using ViTPose.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTPose. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.

- This [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_body_pose_estimation.ipynb) demonstrates inference and visualization.
- This [Space](https://huggingface.co/spaces/hysts/ViTPose-transformers) demonstrates ViTPose on images and video.
+- A demo of ViTPose on images and video can be found [here](https://huggingface.co/spaces/hysts/ViTPose-transformers).
+- A notebook illustrating inference and visualization can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_human_pose_estimation.ipynb).

 ## VitPoseImageProcessor

--- a/docs/source/en/model_doc/wav2vec2.md
+++ b/docs/source/en/model_doc/wav2vec2.md
@ -172,9 +172,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower
 >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))


->>> def map_to_array(example):
-...     example["speech"] = example["audio"]["array"]
-...     return example
+>>> def map_to_array(batch):
+...     batch["speech"] = batch["audio"]["array"]
+...     return batch


 >>> # prepare speech data for batch inference
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -1,4 +1,4 @@
-# Contributing a new model to Transformers
+# Modular Transformers

 Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance.

@ -540,9 +540,6 @@ This makes it very easy to switch decorators and makes it explicit that the only

 ## Docstring variables

-> [!TIP]
-> Refer to the [Documeting a model](./auto_docstring) guide for more information about how you can use the `@auto_docstring` decorator to help automatically generate consistent docstring arguments.
-
 If an object defined in both the modular and modeling file from which it inherits, the modular definition has precedence unless for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable can use the definition found in the source file without copying the whole docstring, by simply setting the variable to `None` in the modular file.

 This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2).
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@ -13,19 +13,21 @@ rendered properly in your Markdown viewer.

 -->

-# Distributed inference
+# Tensor parallelism in transformers

-When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple accelerators (CUDA GPU, Intel XPU, etc.) and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each accelerator can process a tensor slice.
-
-However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple accelerators to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case.
+[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice.
+This document assumes that you are already familiar with the basics of tensor parallelism. If you are not, please refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism.

 > [!TIP]
-> Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism to learn more.
+> Tensor parallelism is very communication intensive, therefore it is reccomended to use it on a single machine with multiple GPUs, utilizing fast intra-node communication. For multi-node training, methods as pipeline or data parallelism are more efficient (depending on your use case).

-Check the list below for models that natively support tensor parallelism. Open a GitHub issue or pull request to add support for a model.
+Tensor parallelism requires slight changes to the model parameters, therefore in transformers, we support some of the popular models out of the box.
+
+> [!TIP]
+> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below.

 <details>
-<summary>Show supported models</summary>
+<summary>Supported models</summary>

 * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2)
 * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2)
@ -41,74 +43,19 @@ Check the list below for models that natively support tensor parallelism. Open a

 </details>

-This guide shows how to enable tensor parallelism with Transformers and different partitioning strategies.
+## Using 🤗 transformers

-## Partitioning a model
+Transformers provides a simple interface to use for tensor parallelism. We provide multiple classes implementing different partitioning
+strategies and a simple entrypoint to parallelize `nn.Module` instance. You won't have to interact with this interface directly, everything is done in `PretrainedModel.from_pretrained` method for you. This section will first talk about the partitioning strategies
+we support, then the user interface you will be interacting with, and finally it will teach you how to extend it with your own partitioning
+strategies.

-Transformers supports tensor parallelism if a model has a `tp_plan`. There are two plans to partition a model.
+### Partitioning strategies

- The `auto` tensor parallelism plan partitions a model (see the supported models above) based on a predefined configuration.
- You can also manually specify your own partitioning plan and pass it to the `tp_plan` parameter in [`~PreTrainedModel.from_pretrained`].
+In transformers, partitioning strategies reside in a class `ParallelInterface` which works like a mapping from string to the strategy implementation.

-<hfoptions id="sharding">
-<hfoption id="auto plan">

-```py
-import os
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
-model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # better for smaller number of GPUs
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
-print(model._tp_plan)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-prompt = "Can I help"
-inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
-
-# distributed run
-outputs = model(inputs)
-```
-
-Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU.
-
-```bash
-torchrun --nproc-per-node 4 demo.py
-```
-
-</hfoption>
-<hfoption id="manual plan">
-
-Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses a combination of column and row partitioning. Refer to the [Partitioning strategies](#partitioning-strategies) section to learn about other supported partitioning strategies.
-
-> [!WARNING]
-> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about the partitioning strategies, the resulting model can be very slow, even failing or incorrect. Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) to learn more.
-
-```py
-from transformers import AutoModelForCausalLM
-
-tp_plan = {
-    "model.layers.*.self_attn.q_proj": "colwise",
-    "model.layers.*.self_attn.k_proj": "colwise",
-    "model.layers.*.self_attn.v_proj": "colwise",
-    "model.layers.*.self_attn.o_proj": "rowwise",
-    ...
-}
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-print(model._tp_plan)
-```
-
-</hfoption>
-</hfoptions>
-
-## Partitioning strategies
-
-All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available.
-
-```py
+```python
 class ParallelInterface(MutableMapping):
    """
    Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
@ -130,32 +77,66 @@ class ParallelInterface(MutableMapping):
    }
 ```

-Refer to the table below to learn more about each strategy.
+We support the following strategies:

-| Strategy | Description |
-|---|---|
-| `ColwiseParallel` | Column-wise partitioning of weights and biases. |
-| `RowwiseParallel` | Row-wise partitioning of weights and biases. Also supports partitioning `nn.Embedding` modules. |
-| `SequenceParallel` | Sequence parallel implementation to support `LayerNorm` and `Dropout` layers. Also supports Python implementation of [RMSNorm](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34). |
-| `PackedColwiseParallel` | Variant of `ColwiseParallel` to support packed weights (for example, packing `up_proj` and `gate_proj` together). Refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details. |
-| `PackedRowwiseParallel` | Variant of `RowwiseParallel` to support packed weights (refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details). |
-| `GatherParallel` | Gather outputs of the module across devices. |
-| `IsolatedParallel` | Used for Experts in Mixture-of-Experts (MoE) layers to isolates module from other devices. |
-| `ReplicateParallel` | Replicate modules across all devices to prevent `torch.distributed` APIs from breaking due to a partially sharded model. |
+- `ColwiseParallel` - A simple column-wise partitioning, being able to handle both weights and biases, does exactly what we've discussed before.
+- `RowwiseParallel` - Again, row-wise partitioning as dicussed before, supports weights and biases, on top of that it also supports `nn.Embedding` modules.
+- `SequenceParallel` - Sequence parallel implementation, for support of `LayerNorm` and `Dropout` layers. Also supports Python implementation of `RMSNorm` (see [this](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34))
+- `PackedColwiseParallel` - A variant of column-wise partitioning, however it works on packed weights (i.e. `up_proj` and `gate_proj` being packed together). For more details, see [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)
+- `PackedRowwiseParallel` - A variant of row-wise partitioning, works on packed weights, for more details check the comment linked above.
+- `GatherParallel` - A very simple class, that only makes the outputs of the module to be gathered across devices.
+- `IsolatedParallel` - This is a special case, where we want to *isolate* the module from the rest of the devices (world). This is used for Experts in MoE layers, basically creating Expert parallelism of sorts.
+- `ReplicateParallel` - Many `torch.distributed` APIs break if model is partially sharded, so this class is used to replicate the module across all devices.

-### Packed strategies
+### Sharding a model

-Weight packing packs multiple linear layers into a single, bigger layer. Packed strategies, `PackedColwiseParallel` and `PackedRowwiseParallel`, are used to shard packed weights. The more basic `ColwiseParallel` or `RowwiseParallel` will incorrectly shard the packed weights.
+We provide two ways to shard a model, first one is to use `auto` tensor parallelism plan, which will automatically shard the model based on our predefined configuration. This requires the model to have predefined tensor parallel plan in transformers.

-The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` module and requires the `PackedRowwiseParallel` strategy to shard `gate_up_proj`.
+```python
+from transformers import AutoModelForCausalLM

+# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
+model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
+
+print(model._tp_plan)
+```
+
+> [!TIP]
+> For a list of models that support tensor parallelism, see the [Supported models](#supported-models) section above.
+
+The second way is to manually specify your own partitioning plan.
+
+```python
+from transformers import AutoModelForCausalLM
+
+tp_plan = {
+    "model.layers.*.self_attn.q_proj": "colwise",
+    "model.layers.*.self_attn.k_proj": "colwise",
+    "model.layers.*.self_attn.v_proj": "colwise",
+    "model.layers.*.self_attn.o_proj": "rowwise",
+    ...
+}
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+
+print(model._tp_plan)
+```
+
+You might have noticed that there are some special cases in the `ParallelInterface` mapping, let's now talk about them. This will help you understand their purpose and help with extending to other strategies.
+
+### PackedRowwiseParallel
+This class is a special case of `RowwiseParallel`, it's used to shard packed weights. Weight packing is a common technique used in models. It's a technique where we pack multiple linear layers into a single, bigger one.
+
+For example in `Llama4` model, we pack `up_proj` and `gate_proj` into a single `gate_up_proj` module.
 ```python
 class Llama4TextExperts(nn.Module):
    ...
    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

-Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.
+Then in forward, we can use batch matrix multiplication to compute the output of the `gate_up_proj` module.

 ```python
 def forward(self, hidden_states):
@ -164,148 +145,185 @@ def forward(self, hidden_states):
    gate, up = gate_up.chunk(2, dim=-1) # Split the output into gate and up
 ```

+In this case, we need to use the `PackedRowwiseParallel` strategy to shard the `gate_up_proj` module, as using a simple `RowwiseParallel` will shard the layers wrongly.
+
 > [!TIP]
-> Refer to [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an visual representation of why `Packed*` needs to be used.
+> If this is a bit difficult to wrap your head around, check out [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an amazing visual representation of why `Packed*` needs to be used.

-### Local strategies

-Local strategies (`local_colwise`, `local_rowwise`, `local_packed_rowwise`) don't use [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) because it isn't supported for some operations such as [torch.chunk](https://docs.pytorch.org/docs/stable/generated/torch.chunk.html). Instead, local strategies use the basic [torch.Tensor](https://docs.pytorch.org/docs/stable/tensors.html) and performs some of the distributed logic manually.
+### `local*` strategies

-<!--
+You could have noticed that there are `local*` strategies, which use the same layers as `*` strategy, but don't use `DTensor` at all.
+This is because `DTensor` is not supported for some of the operations: such as `torch.chunk`. Therefore, sometimes we need to use the `local*` strategies, which use vanilla `torch.Tensor` and do some of the distributed logic manually.
+
+<!---
 Readd this when I get the exact error message
 > [!TIP]
 > If you are using a custom partitioning strategy, and it's not working with `... is not supported` error, try using the `local*` strategies to see if they work better.
 -->

-## Custom partitioning strategies
+> [!WARNING]
+> Manually specifying your own partitiong plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about this, the resulting model can be very slow, even failing or incorrect. Again, refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) which can teach you everything required.

-A custom partitioning strategy should inherit from [`TensorParallelLayer`](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`.
+### Extending the interface with your own partitioning strategies

-Then it needs to be registered in the `ParallelInterface` mapping so the dispatching logic can find it when specified in `tp_plan`.
+This is a very advanced topic, which requires a good understanding of distributed collectives and the model architecture.
+Your custom partitioning strategy should inherit from `TensorParallelLayer` defined in [integrations/tensor_parallel.py](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement: `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. Then it should be registered in the `ParallelInterface` mapping, so our dispatching logic can find it when specified in the `tp_plan`.

-The example below shows how to implement `ColwiseParallel` with this workflow.
+Let's go through this workflow step by step, on an already existing example: `ColwiseParallel`.

-1. Inherit from `TensorParallelLayer`. In the `__init__` method, define `input_layouts` and `output_layouts` to describe how the input and output tensors should be placed on devices. The `desired_input_layouts` attribute is used to specify how the input *should* be placed on devices.
+1. Inherit from `TensorParallelLayer` and initialization

-    ```python
-    class ColwiseParallel(TensorParallelLayer):
-        def __init__(
-            self,
-            *,
-            input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
-            output_layouts: Optional[Placement] = None, # The output layout we want to achieve
-            use_local_output: bool = True, # Whether to use local output or not
-            use_dtensor=True, # Whether to use DTensor or not
-        ):
-            self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
-            self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
-            self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
-            self.use_local_output = use_local_output
-            self.use_dtensor = use_dtensor
-    ```
-
-2. Implement the `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn` methods.
-
-    The `partition_tensor` method partitions the tensor and fills `empty_param` with the partitioned tensor. Use the utility function `get_tensor_shard` to help you get the correct shard of the original parameter for a given rank and `get_packed_weights` to help with packed weights.
-
-    ```python
-    def partition_tensor(
+```python
+class ColwiseParallel(TensorParallelLayer):
+    def __init__(
        self,
-        param, # Full tensor of the parameter
-        empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
-        param_type, # Type of the parameter, `bias` or `weight`
-        param_casting_dtype, # The type to cast the parameter to
-        to_contiguous, # Whether to convert the tensor to a contiguous memory layout
-        rank, # The rank of the current device
-        device_mesh, # The device mesh
-    ) -> nn.Parameter: # Return the partitioned parameter
-        ...
-    ```
+        *,
+        input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
+        output_layouts: Optional[Placement] = None, # The output layout we want to achieve
+        use_local_output: bool = True, # Whether to use local output or not
+        use_dtensor=True, # Whether to use DTensor or not
+    ):
+        self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
+        self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
+        self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
+        self.use_local_output = use_local_output
+        self.use_dtensor = use_dtensor
+```

-    The `_prepare_input_fn` and `_prepare_output_fn` methods are used in the [pre-forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks. They redistribute the inputs and outputs to the desired layout as specified in the `__init__`.
+In the `__init__` method, we define these attributes, where `input_layouts` and `output_layouts` describing, how the input and output tensors should be placed on the devices. `desired_input_layouts` is used to specify, how the input *SHOULD* be placed on the devices.

-    ```python
-    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
-        ...
-        # Do some custom logic, cast to DTensor etc.
-        ...
-        return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
-    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-        ...
-        # Do some custom logic, cast to DTensor etc.
-        ...
-        return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
-    ```
+2a. Implement `partition_tensor` method

-3. Register the strategy to [`ParallelInterface`] to enable it for use with `tp_plan`.
+```python
+def partition_tensor(
+    self,
+    param, # Full tensor of the parameter
+    empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
+    param_type, # Type of the parameter, `bias` or `weight`
+    param_casting_dtype, # The type to cast the parameter to
+    to_contiguous, # Whether to convert the tensor to a contiguous memory layout
+    rank, # The rank of the current device
+    device_mesh, # The device mesh
+) -> nn.Parameter: # Return the partitioned parameter
+    ...
+```

-    ```python
-    from transformers.integrations.tensor_parallel import ParallelInterface
+This method is used to partition the tensor, and fill the `empty_param` with the partitioned tensor.
+We provide some utility functions to help you with this, such as `get_tensor_shard` which will get you the correct shard of the original parameter for this rank or `get_packed_weights` to help with packed weights.

-    ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
-    tp_plan = {
-        "model.layers.*.self_attn.q_proj": "colwise_custom",
-        ...
-    }
-    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-    ```
+2b. Implement `_prepare_input_fn` and `_prepare_output_fn` methods

-## Benchmarks
+These methods are used as [`pre-forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [`forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks respectively. Their purpose is to re-distribute the inputs and outputs to the desired layout, passed in the `__init__` method.

-Tensor parallelism can considerably speedup inference, especially for inputs with large batch sizes or long sequences.
+```python
+def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+    ...
+    # Do some custom logic, cast to DTensor etc.
+    ...
+    return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)

-Refer to the chart below for the expected speedup for a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512.
+def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+    ...
+    # Do some custom logic, cast to DTensor etc.
+    ...
+    return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
+```
+
+3. Register the strategy
+Congratulations! You've implemented your own partitioning strategy. Now, to use it with your own `tp_plan`, you need to register it in the `ParallelInterface` mapping.
+
+```python
+from transformers.integrations.tensor_parallel import ParallelInterface
+
+ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
+```
+
+And now you can use it in your `tp_plan` as such:
+
+```python
+tp_plan = {
+    "model.layers.*.self_attn.q_proj": "colwise_custom",
+    ...
+}
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+```
+
+
+## Full example
+
+Let's go through a full example of inference with tensor parallelism.
+```python
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+# enable tensor parallelism
+model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Meta-Llama-3-8B-Instruct",
+    tp_plan="auto",
+)
+
+# prepare input tokens
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+prompt = "Can I help"
+inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+# distributed run
+outputs = model(inputs)
+```
+
+Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU.
+
+```bash
+torchrun --nproc-per-node 4 demo.py
+```
+
+You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences.
+
+For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups.

 <div style="text-align: center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
 </div>

-## Design implementation
-
-The Transformers tensor parallelism implementation is framework-agnostic, but for specific implementations, we rely on [DeviceMesh](https://docs.pytorch.org/tutorials/recipes/distributed_device_mesh.html) and [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) from [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) to provide a simple and extensible interface.
+## Tensor parallelism in-depth
+Our implementation of tensor parallelism is framework-agnostic in design, but the specific implementations we've developed rely on the torch.distributed package. We heavily utilize abstractions such as `DeviceMesh` or `DTensor` to provide a simple and extensible interface to the user.

 ### DeviceMesh
-
-Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, so you can create a `DeviceMesh` with multiple sub-meshes.
-
+Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, therefore we can create a `DeviceMesh` with multiple submeshes:
 ```python
 from torch.distributed.device_mesh import init_device_mesh

 # Create a 1D mesh of 4 GPUs
 device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])
 ```
-
-Most of the `torch.distributed` defined parallelization strategies can be applied to the mesh itself, or its sub-mesh, and it automatically handles the communication patterns.
+Then, most of the `torch.distributed` defined parallelization strategies can be applied to a mesh itself, or its submesh, automatically handling the communication patterns.

 ### DTensor

-`DTensor` (Distributed Tensor) is a tensor subclass that handles the distributed logic on top of the usual tensor operations. Most of the model weights in tensor parallelism are stored as `DTensor`s.
+Abbreviation for Distributed Tensor, `DTensor` is a tensor subclass that handles the distributed logic on-top of the usual tensor operations. Most of the model weights in case of tensor parallelism are stored as `DTensor`s (with some exceptions, more on that later).
+The most important part of DTensor, that is crucial to understand, is the `placement` attribute. It's an attribute that tells PyTorch how is the tensor placed on the devices of the `DeviceMesh`.

-The most important part of DTensor is the `placement` attribute because it tells PyTorch how a tensor is placed on the devices in `DeviceMesh`. The `placement` attribute can take the following values.
+It can have the following values:

- `Shard(dimension)` - Indicates how a `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. The example below demonstrates how to shard weights over different dimensions for column-wise partitioning.
+- `Shard(dimension)` - Annotates that this `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. For example, if we would like to shard weights for column-wise partitioning, we would do:
+```python
+weight = ...
+weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
+bias = ...
+bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
+```

-    ```python
-    weight = ...
-    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
-    bias = ...
-    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
-    ```
+To give another example, for row-wise partitioning, we would do:
+```python
+weight = ...
+weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
+bias = ...
+bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
+```

-    This example demonstrates how to shard weights over different dimensions for row-wise partitioning.
-
-    ```python
-    weight = ...
-    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
-    bias = ...
-    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
-    ```
-
- `Replicate()` - Indicates a `DTensor` is replicated across the `DeviceMesh`. It only creates a full copy of the tensor on each device.
-
-    ```py
-    bias = ...
-    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
-    ```
-
- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
+- `Replicate()` - Annotates that this `DTensor` is replicated across the `DeviceMesh`. Very straight-forward, only creates a full copy of the tensor on each device.
+- `Partial()` - This placement is mostly of no interest to us, it's used to annotate that this tensor is pending a reduction operation.
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@ -91,8 +91,6 @@ Tensor parallelism distributes large tensor computations across multiple GPUs. T

 Tensor parallelism is effective for training large models that don't fit into the memory of a single GPU. It is also faster and more efficient because each GPU can process its tensor slice in parallel, and it can be combined with other parallelism methods. Like other parallelism methods though, tensor parallelism adds communication overhead between GPUs.

-Refer to the [Tensor parallelism](./perf_infer_gpu_multi) guide to learn how to use it for inference.
-
 ## Hybrid parallelism

 Parallelism methods can be combined to achieve even greater memory savings and more efficiently train models with billions of parameters.
--- a/docs/source/en/perf_train_tpu_tf.md
+++ b/docs/source/en/perf_train_tpu_tf.md
@ -0,0 +1,355 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# TPU
+
+TPU (Tensor Processing Unit) is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu).
+
+This guide focuses on training a Keras model for sequence classification on a TPU from Google Colab. Make sure the TPU runtime is enabled by going to **Runtime > Change runtime type** and selecting a TPU.
+
+Run the command below to install the latest version of Transformers and [Datasets](https://huggingface.co/docs/datasets).
+
+```py
+!pip install --U transformers datasets
+```
+
+Create an instance of [tf.distribute.cluster_resolver.TPUClusterResolver](https://www.tensorflow.org/api_docs/python/tf/distribute/cluster_resolver/TPUClusterResolver), and then connect to the remote cluster and initialize the TPUs.
+
+```py
+import tensorflow as tf
+
+resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
+tf.config.experimental_connect_to_cluster(resolver)
+tf.tpu.experimental.initialize_tpu_system(resolver)
+```
+
+There are various distribution strategies for running your model on multiple TPUs. The [tpu.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) offers synchronized distributed training.
+
+```py
+strategy = tf.distribute.TPUStrategy(resolver)
+```
+
+Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid [XLA compilation issues](#xla).
+
+```py
+from transformers import AutoTokenizer
+from datasets import load_dataset
+import numpy as np
+
+dataset = load_dataset("glue", "cola")["train"]
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+
+train_data = tokenizer(
+    dataset["sentence"],
+    padding="max_length",
+    truncation=True,
+    max_length=128,
+    return_tensors="np",
+)
+train_data = dict(train_data)
+train_labels = np.array(dataset["label"])
+```
+
+The model **must** be created inside [Strategy.scope](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy#scope) in order to replicate the model layers on each TPU device.
+
+```py
+from transformers import TFAutoModelForSequenceClassification
+
+with strategy.scope():
+    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    model.compile(optimizer="adam")
+```
+
+TPUs only accept [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) inputs unlike the Keras [fit](https://keras.io/api/models/model_training_apis/#fit-method) method which accepts a broader range of inputs.
+
+```py
+BATCH_SIZE = 8 * strategy.num_replicas_in_sync
+
+tf_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
+tf_dataset = tf_dataset.shuffle(len(tf_dataset))
+tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
+```
+
+Finally, call [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start training.
+
+```py
+model.fit(tf_dataset)
+```
+
+## Large datasets
+
+The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) or stream the data.
+
+### tf.TFRecord
+
+[tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) is the standard [tf.data](https://www.tensorflow.org/guide/data) format for storing training data. For very large training jobs, it's worth preprocessing your data and storing it in the `tf.TFRecord` format and building a `tf.data` pipeline on top. Refer to the table below to help you decide whether `tf.TFRecord` is helpful for you.
+
+| pros | cons |
+|---|---|
+| works on all TPU instances | costs associated with cloud storage |
+| supports huge datasets and massive throughput | some data types (images) can take a lot of space to store |
+| suitable for training on entire TPU pods |  |
+| preprocessing is done in advance, maximizing training speed |  |
+
+Preprocess and tokenize the dataset before writing it to a `tf.TFRecord` to avoid writing every time the data is loaded.
+
+An exception is made for *train-time augmentations*, because augmentations applied after writing to a `tf.TFRecord` results in the same augmentation for each epoch. Instead, apply augmentations in the `tf.data` pipeline that loads the data.
+
+> [!TIP]
+> In practice, you probably won't be able to load the entire dataset in memory. Load a chunk of the dataset at a time and convert it to `TFRecord`, and repeat until the entire dataset is in the `TFRecord` format. Then you can use a list of all the files to create a `TFRecordDataset`. The example below demonstrates a single file for simplicity.
+
+```py
+tokenized_data = tokenizer(
+    dataset["sentence"],
+    padding="max_length",
+    truncation=True,
+    max_length=128,
+    return_tensors="np",
+)
+labels = dataset["label"]
+
+with tf.io.TFRecordWriter("dataset.tfrecords") as file_writer:
+    for i in range(len(labels)):
+        features = {
+            "input_ids": tf.train.Feature(
+                int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i])
+            ),
+            "attention_mask": tf.train.Feature(
+                int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i])
+            ),
+            "labels": tf.train.Feature(
+                int64_list=tf.train.Int64List(value=[labels[i]])
+            ),
+        }
+        features = tf.train.Features(feature=features)
+        example = tf.train.Example(features=features)
+        record_bytes = example.SerializeToString()
+        file_writer.write(record_bytes)
+```
+
+Build a [TFRecordDataset](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset) using the saved filename to load it.
+
+```py
+def decode_fn(sample):
+    features = {
+        "input_ids": tf.io.FixedLenFeature((128,), dtype=tf.int64),
+        "attention_mask": tf.io.FixedLenFeature((128,), dtype=tf.int64),
+        "labels": tf.io.FixedLenFeature((1,), dtype=tf.int64),
+    }
+    return tf.io.parse_example(sample, features)
+
+# TFRecordDataset can handle gs:// paths
+tf_dataset = tf.data.TFRecordDataset(["gs://matt-tf-tpu-tutorial-datasets/cola/dataset.tfrecords"])
+tf_dataset = tf_dataset.map(decode_fn)
+tf_dataset = tf_dataset.shuffle(len(dataset)).batch(BATCH_SIZE, drop_remainder=True)
+tf_dataset = tf_dataset.apply(
+    tf.data.experimental.assert_cardinality(len(labels) // BATCH_SIZE)
+)
+```
+
+The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
+
+```py
+model.fit(tf_dataset)
+```
+
+### Stream from raw data
+
+Data can be stored in its native format and preprocessed in a [tf.data](https://www.tensorflow.org/guide/data) pipeline as the data is loaded. This approach isn't supported for many models with complex tokenization schemes, but some models like BERT are supported because their tokenization can be compiled. Refer to the table below to help you decide whether this approach is helpful for you.
+
+| pros | cons |
+|---|---|
+| suitable for highly compressed big data in native format (images, audio) | requires writing a full preprocessing pipeline |
+| convenient if raw data is available in a public cloud bucket | complex preprocessing on-the-fly can hurt throughput |
+| works on all TPU instances if data is stored in Google Cloud | must place data in cloud storage if not already there |
+|  | not as suitable for text data because writing a tokenization pipeline is hard (use `TFRecord` for text) |
+
+The example below demonstrates streaming data for an image model.
+
+Load an image dataset and get a list of the underlying image file paths and labels.
+
+```py
+from datasets import load_dataset
+
+image_dataset = load_dataset("beans", split="train")
+filenames = image_dataset["image_file_path"]
+labels = image_dataset["labels"]
+```
+
+Convert the local filenames in the dataset into `gs://` paths in Google Cloud Storage.
+
+```py
+# strip everything but the category directory and filenames
+base_filenames = ['/'.join(filename.split('/')[-2:]) for filename in filenames]
+# prepend the Google Cloud base path to everything instead
+gs_paths = ["gs://matt-tf-tpu-tutorial-datasets/beans/"+filename for filename in base_filenames]
+
+# create tf_dataset
+tf_dataset = tf.data.Dataset.from_tensor_slices(
+    {"filename": gs_paths, "labels": labels}
+)
+tf_dataset = tf_dataset.shuffle(len(tf_dataset))
+```
+
+Transformers preprocessing classes like [`AutoImageProcessor`] are framework-agnostic and can't be compiled into a pipeline by `tf.data`. To get around this, get the normalization values (`mean` and `std`) from the [`AutoImageProcessor`] and use them in the `tf.data` pipeline.
+
+```py
+from transformers import AutoImageProcessor
+
+processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+image_size = (processor.size["height"], processor.size["width"])
+image_mean = processor.image_mean
+image_std = processor.image_std
+```
+
+Use these normalization values to create a function to load and preprocess the images.
+
+```py
+BATCH_SIZE = 8 * strategy.num_replicas_in_sync
+
+def decode_fn(sample):
+    image_data = tf.io.read_file(sample["filename"])
+    image = tf.io.decode_jpeg(image_data, channels=3)
+    image = tf.image.resize(image, image_size)
+    array = tf.cast(image, tf.float32)
+    array /= 255.0
+    array = (array - image_mean) / image_std
+    array = tf.transpose(array, perm=[2, 0, 1])
+    return {"pixel_values": array, "labels": sample["labels"]}
+
+tf_dataset = tf_dataset.map(decode_fn)
+tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
+print(tf_dataset.element_spec)
+```
+
+The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
+
+```py
+from transformers import TFAutoModelForImageClassification
+
+with strategy.scope():
+    model = TFAutoModelForImageClassification.from_pretrained(image_model_checkpoint)
+    model.compile(optimizer="adam")
+
+model.fit(tf_dataset)
+```
+
+### Stream with prepare_tf_dataset
+
+[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline streams data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you.
+
+| pros | cons |
+|---|---|
+| simple code | only works on TPU VM |
+| same approach on TPU/GPU | data must be available as a Hugging Face Dataset |
+| dataset doesn't have to fit in memory | data must fit on local storage |
+| supports variable padding | data loading may be a bottleneck on a big TPU pod slice |
+
+[`~TFPreTrainedModel.prepare_tf_dataset`] only works on [TPU VM](#tpu-types). Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline.
+
+```py
+def tokenize_function(examples):
+    return tokenizer(
+        examples["sentence"], padding="max_length", truncation=True, max_length=128
+    )
+# add the tokenizer output to the dataset as new columns
+dataset = dataset.map(tokenize_function)
+
+# prepare_tf_dataset() chooses columns that match the models input names
+tf_dataset = model.prepare_tf_dataset(
+    dataset, batch_size=BATCH_SIZE, shuffle=True, tokenizer=tokenizer
+)
+```
+
+The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
+
+```py
+from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+with strategy.scope():
+    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
+    model.compile(optimizer="adam")
+
+model.fit(tf_dataset)
+```
+
+## TPU types
+
+There are two types of TPUs, a TPU Node and a TPU VM.
+
+A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline, and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it.
+
+TPU VM are connected directly to the machine the TPU is located on, and they are generally easier to work with, especially when it comes to your data pipeline.
+
+> [!TIP]
+> We recommend avoiding TPU Nodes if possible because it is more difficult to debug than TPU VMs. TPU Nodes may also be unsupported in the future and become a legacy access method.
+
+A single TPU (v2-8, v3-8, v4-8) runs 8 replicas. TPUs can exist in **pods** which run hundreds or even thousands of replicas simultaneously. When you only use a portion of a pod, it is referred to as a **pod slice**. On Google Colab, you'll typically get a single v2-8 TPU.
+
+## XLA
+
+[XLA](https://openxla.org/xla) is a linear algebra compiler for high-performance execution and it is used by default to improve performance on TPUs.
+
+Before executing your code on a TPU, it's a good idea to try it first on a CPU or GPU because it is easier to debug. You can train for a few steps to make sure the model and data pipeline work as expected. Set `jit_compile=True` in the [compile](https://keras.io/api/models/model_training_apis/#compile-method) method to enable XLA compilation (but remember to remove this line of code before running on a TPU).
+
+The section below outlines three rules for making your code XLA-compatible. Transformers enforce the first two rules for models and loss functions by default, but don't forget about them if you're writing your own models and loss functions.
+
+### Data dependent conditionals
+
+Any `if` statements cannot depend on values inside a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor). The code below can't be compiled by XLA.
+
+```py
+if tf.reduce_sum(tensor) > 10:
+    tensor = tensor / 2.0
+```
+
+To compile with XLA, use [tf.cond](https://www.tensorflow.org/api_docs/python/tf/cond) or remove the conditional and use indicator variables instead as shown below.
+
+```py
+sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32)
+tensor = tensor / (1.0 + sum_over_10)
+```
+
+### Data dependent shapes
+
+The shape of a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) cannot depend on their values. For example, [tf.unique](https://www.tensorflow.org/api_docs/python/tf/unique) can't be compiled because it returns a tensor containing an instance of each unique value in the input. The shape of this output depends on how repetitive the input [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) is.
+
+This is an issue during **label masking**, where labels are set to a negative value to indicate they should be ignored when computing the loss. The code below can't be compiled by XLA because the shape of `masked_outputs` and `masked_labels` depend on how many positions are masked.
+
+```py
+label_mask = labels >= 0
+masked_outputs = outputs[label_mask]
+masked_labels = labels[label_mask]
+loss = compute_loss(masked_outputs, masked_labels)
+mean_loss = torch.mean(loss)
+```
+
+To compile with XLA, avoid the data-dependent shapes by computing the loss for every position and zeroing out the masked positions in both the numerator and denominator when calculating the mean. Convert `tf.bool` to `tf.float32` as an indicator variable to make your code XLA-compatible.
+
+```py
+label_mask = tf.cast(labels >= 0, tf.float32)
+loss = compute_loss(outputs, labels)
+loss = loss * label_mask
+mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask)
+```
+
+### Recompile different input shapes
+
+XLA recompiles your model if input shapes are variable which create huge performance problems. It is especially common in text models because input texts have variable lengths after tokenization.
+
+> [!WARNING]
+> Execessive padding can also severely slow down training because requires more compute and memory to process.
+
+To avoid different shapes, use padding to pad all your inputs to the same length and use an `attention_mask`. Try padding batches of samples to a multiple of 32 or 64 tokens. Use the parameters `padding="max_length"`, `padding="longest"`, or `pad_to_multiple_of` to help with padding. This often increases the number of tokens by a small amount, but it significantly reduces the number of unique input shapes because every input shape is a multiple of 32 or 64. Fewer unique input shapes requires fewer recompilation.
--- a/docs/source/en/quantization/finegrained_fp8.md
+++ b/docs/source/en/quantization/finegrained_fp8.md
@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="

 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

 output = quantized_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/quantization/hqq.md
+++ b/docs/source/en/quantization/hqq.md
@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.

 HQQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training.

-Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels if you are using a cuda device. It also support Intel XPU with pure pytorch implementation.
+Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels.

 ```bash
 pip install hqq
@ -34,14 +34,13 @@ You can choose to either replace all the linear layers in a model with the same
 Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `group_size` to replace for all the linear layers ([torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) of the model.

 ``` py
-import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

 quant_config = HqqConfig(nbits=8, group_size=64)
 model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", 
    torch_dtype=torch.float16, 
-    device_map="auto", 
+    device_map="cuda", 
    quantization_config=quant_config
 )
 ```
@ -68,7 +67,7 @@ quant_config  = HqqConfig(dynamic_config={
 model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", 
    torch_dtype=torch.float16, 
-    device_map="auto", 
+    device_map="cuda", 
    quantization_config=quant_config
 )
 ```
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -49,7 +49,6 @@ Check the table below to see if your hardware is compatible.
 | Component | Compatibility |
 |----------|----------------|
 | CUDA Versions | ✅ cu118, cu126, cu128 |
-| XPU Versions | ✅ pytorch2.8 |
 | CPU | ✅ change `device_map="cpu"` (see examples below) |


@ -279,71 +278,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 </hfoption>
 </hfoptions>

-### Intel XPU
-<hfoptions id="examples-Intel-XPU">
-<hfoption id="int8-dynamic-and-weight-only">
-    
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
-
-quant_config = Int8DynamicActivationInt8WeightConfig()
-# or int8 weight only quantization
-# quant_config = Int8WeightOnlyConfig()
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-
-<hfoption id="int4-weight-only">
-
-```py
-import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int4WeightOnlyConfig
-from torchao.dtypes import Int4XPULayout
-from torchao.quantization.quant_primitives import ZeroPointDomain
-
-
-quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
-quantization_config = TorchAoConfig(quant_type=quant_config)
-
-# Load and quantize the model
-quantized_model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Llama-3.1-8B-Instruct",
-    torch_dtype="auto",
-    device_map="auto",
-    quantization_config=quantization_config
-)
-
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
-input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
-
-# auto-compile the quantized model with `cache_implementation="static"` to get speed up
-output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
-print(tokenizer.decode(output[0], skip_special_tokens=True))
-```
-</hfoption>
-</hfoptions>
-
-
 ### CPU
 <hfoptions id="examples-CPU">
 <hfoption id="int8-dynamic-and-weight-only">
@ -429,7 +363,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Manual Testing
 prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
 generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
 output_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@ -500,7 +434,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -540,7 +474,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")

 ## Loading quantized models

-Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
+Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -557,7 +491,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
    quantization_config=quantization_config
 )
 # save the quantized model
-output_dir = "llama-3.1-8b-torchao-int8"
+output_dir = "llama-3.1-8b-torchao-int8-cuda"
 quantized_model.save_pretrained(output_dir, safe_serialization=False)

 # reload the quantized model
@ -568,7 +502,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

 output = reloaded_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -16,9 +16,7 @@ rendered properly in your Markdown viewer.

 # Serving

-Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users.
-
-You can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.
+Transformer models can be served for inference with specialized libraries such as Text Generation Inference (TGI) and vLLM. These libraries are specifically designed to optimize performance with LLMs and include many unique optimization features that may not be included in Transformers.

 ## TGI

@ -63,165 +61,4 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
    --trust-remote-code
-```
-
-## Serve CLI
-
-> [!WARNING]
-> This section is experimental and subject to change in future versions
-
-<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
-You can serve LLMs supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers a chat Completions API compatible with the OpenAI SDK, which is the _de facto_ standard for LLM conversations. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)).
-
-To launch a server, simply use the `transformers serve` CLI command:
-
-```shell
-transformers serve
-```
-
-The simplest way to interact with the server is through our `transformers chat` CLI
-
-```shell
-transformers chat localhost:8000 --model-name-or-path Qwen/Qwen3-4B
-```
-
-or by sending an HTTP request with `cURL`, e.g.
-
-```shell
-curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
-```
-
-from which you'll receive multiple chunks in the Completions API format
-
-```shell
-data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
-
-data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
-
-(...)
-```
-
-The server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools.
-
-> [!TIP]
-> At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.
-
-<!-- TODO: example with a minimal python example, and explain that it is possible to pass a full generation config in the request -->
-
-
-### Usage example 1: apps with local requests (feat. Jan)
-
-This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
-
-To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
-
-```shell
-http://[host]:[port]/v1
-```
-
-where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section, hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
-</h3>
-
-You are now ready to chat!
-
-> [!TIP]
-> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
-
-To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
-
-```
-ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
-```
-
-Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.
-
-
-### Usage example 2: apps with external requests (feat. Cursor)
-
-This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. Unlike in the previous example, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
-
-To launch our server with CORS enabled, run
-
-```shell
-transformers serve --enable-cors
-```
-
-We'll also need to expose our server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
-
-```shell
-ngrok http [port]
-```
-
-where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
-</h3>
-
-We're now ready to set things up on the app side! In Cursor, while we can't set a new provider, we can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set our `transformers serve` endpoint, follow this order:
-1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
-2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
-3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
-4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
-5. Hit "Verify".
-
-After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
-</h3>
-
-You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
-
-<h3 align="center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
-</h3>
-
-
-### Usage example 3: `tiny-agents` CLI and MCP Tools
-
-To showcase the use of MCP tools, let's see how to integrate the `transformers serve` server with the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI.
-
-> [!TIP]
-> Many Hugging Face Spaces can be used as MCP servers, as in this example. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
-
-The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
-
-```json
-{
-    "model": "Menlo/Jan-nano",
-    "endpointUrl": "http://localhost:8000",
-    "servers": [
-        {
-            "type": "sse",
-            "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
-        }
-    ]
-}
-```
-
-You can then launch your `tiny-agents` chat interface with the following command.
-
-```bash
-tiny-agents run path/to/your/config.json
-```
-
-If you have `transformers serve` running in the background, you're ready to use MCP tools from a local model! For instance, here's the example of a chat session with `tiny-agents`:
-
-```bash
-Agent loaded with 1 tools:
- • flux1_schnell_infer
-»  Generate an image of a cat on the moon
-<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
-
-Tool req_0_tool_call
-[Binary Content: Image image/webp, 57732 bytes]
-The task is complete and the content accessible to the User
-Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
-380576952
-
-I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
-```
+```
--- a/docs/source/en/testing.md
+++ b/docs/source/en/testing.md
@ -474,6 +474,13 @@ For example, here is a test that must be run only when there are 2 or more GPUs
 def test_example_with_multi_gpu():
 ```

+If a test requires `tensorflow` use the `require_tf` decorator. For example:
+
+```python no-style
+@require_tf
+def test_tf_thing_with_tensorflow():
+```
+
 These decorators can be stacked. For example, if a test is slow and requires at least one GPU under pytorch, here is
 how to set it up:

@ -1219,6 +1226,11 @@ if torch.cuda.is_available():
 import numpy as np

 np.random.seed(seed)
+
+# tf RNG
+import tensorflow as tf 
+
+tf.random.set_seed(seed)
 ```

 ### Debugging tests
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@ -0,0 +1,129 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# XLA
+
+[[open-in-colab]]
+
+[Accelerated Linear Algebra (XLA)](https://openxla.org/xla) is a linear algebra compiler that optimizes model runtime across different hardware and frameworks.
+
+This guide will look specifically at how to accelerate *TensorFlow* models with XLA.
+
+## TensorFlow
+
+XLA can potentially accelerate a TensorFlow model without making any source code changes. It is already packaged with the TensorFlow library, and it is triggered with `jit_compile` in any graph creating function such as [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
+
+If you're using Keras methods like [fit](https://keras.io/api/models/model_training_apis/#fit-method) and [predict](https://keras.io/api/models/model_training_apis/#predict-method), enable XLA by passing `jit_compile=True` to [compile](https://keras.io/api/models/model_training_apis/#compile-method).
+
+```py
+model.compile(jit_compile=True)
+```
+
+XLA can be used to accelerate any arbitrary [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
+
+Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The speed up depends on a model, but in general, TensorFlow models in Transformers get a ~100x speed up.
+
+### Functions
+
+A typical forward pass in a TensorFlow model is shown below. To run a forward pass with XLA, wrap the model with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function) and set `jit_compile=True`.
+
+```diff
+import tensorflow as tf
+
+model = tf.keras.Sequential(
+    [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
+)
+# Generate random inputs for the model.
+batch_size = 16
+input_vector_dim = 10
+random_inputs = tf.random.normal((batch_size, input_vector_dim))
+
+# Run a forward pass.
+- _ = model(random_inputs)
+ xla_fn = tf.function(model, jit_compile=True)
+ _ = xla_fn(random_inputs)
+```
+
+The default `call` function of the model is used to compile the XLA graph. But if there's any other model function you want to compile with XLA, wrap them with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
+
+```py
+my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
+```
+
+### Text generation
+
+You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping [`~TFGenerationMixin.generate`] with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
+
+```py
+import tensorflow as tf
+from transformers import AutoTokenizer, TFAutoModelForCausalLM
+# Will error if the minimal version of Transformers is not installed.
+from transformers.utils import check_min_version
+
+check_min_version("4.21.0")
+
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+input_string = ["TensorFlow is"]
+
+xla_generate = tf.function(model.generate, jit_compile=True)
+
+tokenized_input = tokenizer(input_string, return_tensors="tf")
+generated_tokens = xla_generate(**tokenized_input, num_beams=2)
+
+decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+print(f"Generated -- {decoded_text}")
+"Generated -- TensorFlow is an open-source, open-source, distributed-source application framework for the"
+```
+
+## Tracing
+
+When executing an XLA-enabled function for the first time, it tries to infer the computation graph in a process known as *tracing*. This is a time-consuming step, but any consecutive calls to the function will be much faster because it won't have to trace the computation graph again.
+
+To ensure a function is only traced once, the inputs must have the same shape as when the graph was built. This usually isn't an issue for fixed input shapes like images, but it can be an issue for inputs with variable shapes like text.
+
+One way to handle this is to pad your text so it always has the same shape. Configure padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer.
+
+```py
+import tensorflow as tf
+from transformers import AutoTokenizer, TFAutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
+model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
+input_string = ["TensorFlow is"]
+
+xla_generate = tf.function(model.generate, jit_compile=True)
+
+# Call tokenizer with padding options.
+tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
+
+generated_tokens = xla_generate(**tokenized_input, num_beams=2)
+decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
+print(f"Generated -- {decoded_text}")
+```
+
+In addition to the input shape, any changes to the generation options at any point also triggers tracing.
+
+## Resources
+
+Learn more about XLA with the following resources.
+
+- A [notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) demonstrating XLA-compatible encoder-decoder and decoder-only text generation models.
+- The [Faster Text Generation with TensorFlow and XLA](https://hf.co/blog/tf-xla-generate) blog post compares benchmarks for XLA-compatible models and provides a friendly introduction to XLA in TensorFlow.
+- The [How Hugging Face improved Text Generation performance with XLA](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) blog post discusses the design philosophy behind adding XLA to TensorFlow models in Transformers.
+- The [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) guide.
+- The [Better performance with tf.function](https://www.tensorflow.org/guide/function) guide.
+- The [XLA](https://openxla.org/xla) documentation.
--- a/docs/source/en/tools.md
+++ b/docs/source/en/tools.md
@ -14,9 +14,5 @@ rendered properly in your Markdown viewer.

 -->

-# Tools
-
-(deprecated)
-
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/it/perf_train_special.md
+++ b/docs/source/it/perf_train_special.md
@ -7,7 +7,6 @@ http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.

 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
--- a/docs/source/ja/testing.md
+++ b/docs/source/ja/testing.md
@ -445,6 +445,13 @@ CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
 def test_example_with_multi_gpu():
 ```

+テストに `tensorflow` が必要な場合は、`require_tf` デコレータを使用します。例えば：
+
+```python no-style
+@require_tf
+def test_tf_thing_with_tensorflow():
+```
+
 これらのデコレータは積み重ねることができます。たとえば、テストが遅く、pytorch で少なくとも 1 つの GPU が必要な場合は、次のようになります。
 設定方法:

@ -1128,6 +1135,9 @@ if torch.cuda.is_available():
 import numpy as np

 np.random.seed(seed)
+
+# tf RNG
+tf.random.set_seed(seed)
 ```


--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -225,7 +225,7 @@
 - sections:
  - local: philosophy
    title: 이념과 목표
-  - local: glossary
+  - local: in_translation
    title: (번역중) Glossary
  - local: task_summary
    title: 🤗 Transformers로 할 수 있는 작업
--- a/docs/source/ko/glossary.md
+++ b/docs/source/ko/glossary.md
@ -1,454 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# 용어집(Glossary)
-
-이 용어집은 전반적인 머신러닝 및 🤗 Transformers 관련 용어를 정의하여 문서를 더 잘 이해하는 데 도움을 줍니다.
-
-## A
-
-### 어텐션 마스크 (attention mask)
-
-어텐션 마스크(attention mask)는 여러 시퀀스를 배치(batch)로 처리할 때 사용되는 선택적 인자입니다.
-
-<Youtube id="M6adb1j2jPI"/>
-
-이 인자는 모델에게 어떤 토큰에 주의를 기울여야 하는지, 그리고 어떤 토큰은 무시해야 하는지를 알려줍니다.
-
-예를 들어, 다음 두 개의 시퀀스가 있다고 가정해 봅시다:
-
-```python
->>> from transformers import BertTokenizer
-
->>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
-
->>> sequence_a = "This is a short sequence."
->>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
-
->>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
->>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
-```
-
-인코딩된 버전들의 길이가 다릅니다:
-
-```python
->>> len(encoded_sequence_a), len(encoded_sequence_b)
-(8, 19)
-```
-
-따라서 이 두 시퀀스를 그대로 하나의 텐서에 넣을 수는 없습니다. 첫 번째 시퀀스를 두 번째 길이에 맞춰 패딩 하거나, 반대로 두 번째 시퀀스를 첫 번째 길이에 맞춰 잘라내야 합니다.
-
-첫 번째 경우에는 ID 목록이 패딩 인덱스로 확장됩니다. 이렇게 패딩을 적용하려면 토크나이저에 리스트를 전달하고 다음과 같이 요청할 수 있습니다:
-
-```python
->>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
-```
-
-첫 번째 문장 오른쪽에 0이 추가되어 두 번째 문장과 길이가 같아진 것을 볼 수 있습니다:
-
-```python
->>> padded_sequences["input_ids"]
-[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
-```
-
-이것은 PyTorch나 TensorFlow의 텐서로 변환될 수 있습니다. 어텐션 마스크는 모델이 패딩 된 인덱스를 참조하지 않도록 해당 위치를 나타내는 이진 텐서입니다. [`BertTokenizer`]의 경우, `1`은 어텐션이 필요한 값을 나타내고, `0`은 패딩 된 값을 나타냅니다. 이 어텐션 마스크는 토크나이저가 반환되는 딕셔너리의 "attention_mask" 키 아래에 포함되어 있습니다:
-
-```python
->>> padded_sequences["attention_mask"]
-[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
-```
-
-### 오토인코딩 모델 (autoencoding models)
-
-[인코더 모델](#encoder-models)과 [마스킹된 언어 모델링](#masked-language-modeling-mlm)을 참고하세요.
-
-### 자기회귀 모델 (autoregressive models)
-
-[인과적 언어 모델링](#causal-language-modeling)과 [디코더 모델](#decoder-models)을 참고하세요.
-
-## B
-
-### 백본 (backbone)
-
-백본(backbone)은 원시(hidden) 은닉 상태(hidden state) 또는 특징(feature)을 출력하는 네트워크(임베딩과 레이어)입니다. 일반적으로 이 백본은 해당 특징을 입력으로 받아 예측을 수행하는 [헤드](#head)와 연결됩니다. 예를 들어, [`ViTModel`]은 특정 헤드가 없는 백본입니다. 다른 모델들도[`VitModel`]을 백본으로 사용할 수 있으며, [DPT](model_doc/dpt)등이 그 예시입니다.
-
-## C
-
-### 인과적 언어 모델링 (causal language modeling)
-
-모델이 텍스트를 순서대로 읽으며 다음 단어를 예측해야 하는 사전 학습(pretraining) 작업입니다. 일반적으로 문장을 전체로 읽되, 모델 내부에서 특징 시점 이후의 토큰을 마스킹(masking)하여 다음 단어를 예측하게 됩니다.
-
-### 채널 (channel)
-
-컬러 이미지는 빨간색(R), 초록색(G), 파란색(B)의 세 채널 값을 조합하여 구성되며, 흑백 이미지는 단일 채널만을 가집니다. 🤗 Transformers에서는 이미지 텐서의 채널이 첫 번째 또는 마지막 차원에 위치할 수 있습니다:[`n_channels`, `height`, `width`] 또는 [`height`, `width`, `n_channels`]와 같은 형식입니다.
-
-### 연결 시간분류(connectionist temporal classification, CTC) 
-
-입력과 출력의 정렬 상태를 정확히 몰라도 모델이 학습할 수 있도록 돕는 알고리즘입니다. CTC는 주어진 입력에 대해 가능한 모든 출력의 확률 분포를 계산하고, 그중 가장 가능성이 높은 출력을 선택합니다. CTC는 말하는 속도의 차이 등 여러 이유로 음성과 텍스트가 항상 정확하게 일치하지 않기 때문에 음성 인식 작업에서 자주 사용됩니다.
-
-### 컨볼루션 (convolution)
-
-신경망에서 사용되는 레이어의 한 종류로, 입력 행렬에 대해 더 작은 행렬(커널 또는 필터)을 원소별로 곱한 뒤 그 값을 합산해 새로운 행렬을 만드는 연산입니다. 이 연산을 컨볼루션 연산이라고 하며, 입력 행렬 전체에 걸쳐 반복적으로 수행됩니다. 각 연산은 입력 행렬의 서로 다른 구간에 적용됩니다. 컨볼루션 신경망(CNN)은 컴퓨터 비전 분야에서 널리 사용됩니다.
-
-## D
-
-### 데이터 병렬화 (DataParallel)
-
-여러 개의 GPU에서 훈련을 수행할 때 사용하는 병렬화 기법으로, 동일한 모델 구성이 여러 번 복제되며 각 인스턴스는 서로 다른 데이터 조각을 받습니다. 모든 인스턴스는 병렬로 처리를 수행하며, 각 훈련 단계가 끝난 후 결과를 동기화합니다.
-
-DataParallel 방식에 대해 더 알아보려면 [여기](perf_train_gpu_many#dataparallel-vs-distributeddataparallel)를 참고하세요.
-
-### 디코더 입력 ID (decoder input IDs)
-
-이 입력은 인코더-디코더 모델에 특화된 것으로, 디코더에 전달될 input ID 들을 포함합니다. 이러한 입력은 번역이나 요약과 같은 시퀀스-투-시퀀스(sequence-to-sequence) 작업에 사용되며, 일반적으로 모델마다 고유한 방식으로 구성됩니다.
-
-대부분의 인코더-디코더 모델(BART, T5 등)은 `labels`로부터 자동으로 `decoder_input_ids`를 생성합니다. 이러한 모델에서는 학습 시 `labels`를 전달하는 것이 일반적으로 권장됩니다.
-
-시퀀스-투-시퀀스 학습에서 각 모델이 이러한 input ID를 어떻게 처리하는지는 모델 문서를 참고하시기를 바랍니다.
-
-### 디코더 모델 (decoder models)
-
-자기회귀 모델(Autoregressive models)이라고도 불리는 디코더 모델은 인과 언어 모델링(causal language modeling)이라 불리는 사전 학습 작업을 수행합니다. 이 작업에서는 모델이 텍스트를 순서대로 읽고 다음 단어를 예측해야 합니다. 일반적으로 문장의 전체를 읽되, 특정 시점 이후의 토큰은 마스크로 가려 예측하게 합니다.
-
-<Youtube id="d_ixlCubqQw"/>
-
-### 딥러닝 (deep learning)
-
-여러 층의 신경망(neural network)을 사용하는 머신러닝 알고리즘입니다.
-
-## E
-
-### 인코더 모델 (encoder models)
-
-자동 인코딩 모델(Autoencoding models)이라고도 불리는 인코더 모델은 텍스트나 이미지와 같은 입력을 받아 임베딩이라 불리는 압축된 수치 표현으로 반환합니다. 일반적으로 인코더 모델은 입력 시퀀스의 일부를 마스킹하고 더 의미 있는 표현을 생성하도록 학습하는 [masked language modeling](#masked-language-modeling-mlm)과 같은 기술을 사용하여 사전 학습됩니다.
-
-<Youtube id="H39Z_720T5s"/>
-
-## F
-
-### 특징 추출 (feature extraction)
-
-머신러닝 알고리즘이 더 효과적으로 학습할 수 있도록, 원시 데이터를 선택하고 변환하여 더 유용한 특징(feature) 집합으로 만드는 과정입니다. 예를 들어, 원시 텍스트를 워드 임베딩으로 변환하거나 이미지나 비디오 데이터에서 윤곽선이나 형태와 같은 중요한 특징을 추출하는 것이 있습니다.
-
-### 피드 포워드 청킹 (feed forward chunking)
-
-트랜스포머의 각 residual attention Block에서는 self-Attention Layer 다음에 보통 두 개의 Feed Forward Layer가 이어집니다. 이 Feed Forward Layers의 중간 임베딩 크기는 종종 모델의 히든 사이즈(hidden size)보다 큽니다(예:
-`google-bert/bert-base-uncased` 모델의 경우).
-
-입력 크기가 `[batch_size, sequence_length]`일 경우, 중간 Feed Forward 임베딩
-`[batch_size, sequence_length, config.intermediate_size]`을 저장하는 데 필요한 메모리는 전체 메모리 사용량의 큰 부분을 차지할 수 있습니다.
-[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 논문의 저자들은 이 연산이 `sequence_length` 차원에 대해 독립적이기 때문에,토큰마다 Feed Forward Layer의 출력 임베딩을 각 토큰별로 `[batch_size, config.hidden_size]`을 개별적으로 계산한 뒤, 이를 이어 붙여 `[batch_size, sequence_length, config.hidden_size]` 형태로 만들 수 있습니다.`n = sequence_length`. 이 방식은 계산 시간은 늘어나지만, 메모리 사용량은 줄어들게 됩니다.
-
-[`apply_chunking_to_forward`] 함수를 사용하는 모델의 경우, `chunk_size`는 병렬로 계산되는 출력 임베딩의 개수를 정의하며, 이는 메모리 사용량과 계산 시간 간의 트레이드오프를 결정합니다.
-`chunk_size`가 0으로 설정되면, 피드 포워드 청킹(Feed Forward Chunking)은 수행되지 않습니다.
-
-### 파인튜닝 모델 (finetuned models)
-
-파인튜닝(Finetuning)은 전이 학습(transfer learning)의 한 형태로, 사전 학습된 (pretrained) 모델을 사용하여 가중치를 고정(freeze)하고, 출력층을 새롭게 추가된 [모델 헤드](#head)로 교체한 뒤, 해당 모델 헤드를 목표 데이터셋에 맞게 학습시키는 방식입니다.
-
-자세한 내용은 [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/training) 튜토리얼을 참고하시고, 🤗 Transformers를 사용해 모델을 파인 튜닝하는 방법도 함께 확인해 보세요.
-
-## H
-
-### 헤드 (head)
-
-모델 헤드(model head)란 신경망의 마지막 층을 의미하며, 이 층은 이전 층에서 나온 히든 상태(hidden states)를 받아 다른 차원으로 변환합니다. 각 작업(task)에 따라 서로 다른 모델 헤드가 사용됩니다. 예를 들어:
-
-  * [`GPT2ForSequenceClassification`]은 기본 [`GPT2Model`] 위에 시퀀스 분류를 위한 선형계층(linear layer)을 추가한 모델 헤드입니다.
-  * [`ViTForImageClassification`]은 이미지 분류를 위한 모델 헤드로, 기본 [`ViTModel`] 위에 `CLS` 토큰의 마지막 히든 상태에 선형 계층(linear layer)을 추가한 구조입니다.
-  * [`Wav2Vec2ForCTC`]는 기본 [`Wav2Vec2Model`] 위에 [CTC](#connectionist-temporal-classification-ctc)를 적용한 언어 모델링 헤드입니다.
-
-## I
-
-### 이미지 패치 (image patch)
-
-비전 기반 Transformer 모델은 이미지를 작은 패치로 분할한 후, 각 패치를 선형 임베딩하여 시퀀스로 모델에 입력합니다. 모델의 구성 파일에서 `patch_size`(또는 해상도)를 확인할 수 있습니다.
-
-### 인퍼런스 (inference)
-
-인퍼런스는 학습이 완료된 모델에 새로운 데이터를 입력하여 예측을 수행하는 과정입니다. 🤗 Transformer에서 인퍼런스를 수행하는 방법은 [Pipeline for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) 튜토리얼을 참고하세요.
-
-### 입력 ID (input IDs)
-
-입력 ID는 종종 모델에 입력으로 전달해야 하는 유일한 필수 파라미터입니다. 이들은 토큰의 인덱스로, 모델이 입력으로 사용할 시퀀스를 구성하는 토큰들의 숫자 표현입니다.
-
-<Youtube id="VFp38yj8h3A"/>
-
-토크나이저마다 작동 방식은 다르지만, 기본 메커니즘은 동일합니다. 다음은 [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) 토크나이저인 BERT 토크나이저를 사용한 예시입니다:
-
-```python
->>> from transformers import BertTokenizer
-
->>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
-
->>> sequence = "A Titan RTX has 24GB of VRAM"
-```
-
-토크나이저는 시퀀스를 토크나이저의 토큰 목록에 있는 항목으로 분리합니다.
-
-```python
->>> tokenized_sequence = tokenizer.tokenize(sequence)
-```
-
-토큰은 단어이거나 서브 워드(subword)입니다. 예를 들어, "VRAM"은 모델의 어휘 사전에 없는 단어이기 때문에 "V", "RA", "M"으로 나뉘었습니다. 이 토큰들이 개별 단어가 아니라 같은 단어의 일부임을 나타내기 위해 "RA"와 "M" 앞에 더블 해시(`##`)가 추가 됩니다.
-
-```python
->>> print(tokenized_sequence)
-['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
-```
-
-이러한 토큰들은 모델이 이해할 수 있는 ID로 변환될 수 있습니다. 이 과정은 문장을 바로 토크나이저에 입력함으로써 수행되며, 성능 최적화를 위해 [🤗 Tokenizers](https://github.com/huggingface/tokenizers)의 Rust 구현을 활용합니다.
-
-```python
->>> inputs = tokenizer(sequence)
-```
-
-토크나이저는 해당 모델이 올바르게 작동하는 데 필요한 모든 인자를 포함한 딕셔너리를 반환합니다. 토큰 인덱스는 `input_ids`라는 키에 저장됩니다.
-
-```python
->>> encoded_sequence = inputs["input_ids"]
->>> print(encoded_sequence)
-[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
-```
-
-토크나이저는 (연결된 모델이 이를 사용하는 경우) 자동으로 "특수 토큰"을 추가합니다. 이들은 모델이 특정 상황에서 사용하는 특별한 ID입니다.
-
-이전의 ID 시퀀스를 디코딩하면,
-
-```python
->>> decoded_sequence = tokenizer.decode(encoded_sequence)
-```
-
-우리는 다음과 같은 결과를 보게 될 것입니다.
-
-```python
->>> print(decoded_sequence)
-[CLS] A Titan RTX has 24GB of VRAM [SEP]
-```
-
-이는 [`BertModel`]이 입력값을 기대하는 방식이기 때문입니다.
-
-## L
-
-### 레이블 (labels)
-
-레이블은 모델이 손실(loss)을 직접 계산할 수 있도록 전달되는 선택적 인자입니다. 이 레이블은 모델이 예측해야 할 정답 값을 의미하며, 모델은 예측값과 이 정답(label) 사이의 차이를 표준 손실 함수를 이용해 계산하게 됩니다.
-
-이 레이블(label)의 형태는 모델 헤드(model head)의 종류에 따라 달라집니다. 예를 들어:
-
- 시퀀스 분류 모델([`BertForSequenceClassification`] 등)의 경우, 모델은
-  `(batch_size)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 전체 시퀀스에 대한 예상 레이블을 나타냅니다.
- 토큰 분류 모델([`BertForTokenClassification`] 등)의 경우, 모델은 `(batch_size, seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다.
- 마스킹 언어 모델([`BertForMaskedLM`])의 경우, 모델은 `(batch_size,seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다. 레이블은 마스킹 된 토큰의 토큰 ID이며, 나머지 토큰에 대해서는 무시할 값을 사용합니다(일반적으로 -100).
- 시퀀스 투 시퀀스 작업([`BartForConditionalGeneration`], [`MBartForConditionalGeneration`]등)의 경우, 모델은 `(batch_size, tgt_seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 입력 시퀀스에 대응하는 타겟 시퀀스를 나타냅니다. 학습 중에는 BART와 T5가 적절한 `decoder_input_ids`와 디코더 attention 마스크를 내부적으로 생성하므로, 일반적으로 따로 제공할 필요가 없습니다. 단, 이는 Encoder-Decoder 프레임워크를 직접 활용하는 모델에는 적용되지 않습니다.
- 이미지 분류 모델([`ViTForImageClassification`] 등)의 경우, 모델은 `(batch_size)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 개별 이미지에 대한 예상 레이블을 나타냅니다.
- 시멘틱 세그멘테이션 모델([`SegformerForSemanticSegmentation`] 등)의 경우, 모델은 `(batch_size, height, width)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 개별 픽셀에 대한 예상 레이블을 나타냅니다.
- 객체 탐지 모델([`DetrForObjectDetection`] 등)의 경우, 모델은 `class_labels`와 `boxes` 키를 포함하는 딕셔너리들의 리스트를 입력으로 받습니다. 배치의 각 값은 개별 이미지에 대한 예상 클래스 레이블과 바운딩 박스 정보를 나타냅니다.
- 자동 음성 인식 모델([`Wav2Vec2ForCTC`] 등)의 경우 모델은 `(batch_size,target_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다.
-  
-<Tip>
-
-모델마다 요구하는 레이블 형식이 다를 수 있으므로, 각 모델의 문서를 확인하여 해당 모델에 맞는 레이블 형식을 반드시 확인하세요!
-
-</Tip>
-
-기본 모델([`BertModel`] 등)은 레이블을 입력으로 받지 않습니다. 이러한 모델은 단순히 특징(feature)을 출력하는 기본 트랜스포머 모델이기 때문입니다.
-
-### 대규모 언어 모델 (LLM)
-
-대규모 데이터로 학습된 트랜스포머 언어 모델(GPT-3, BLOOM, OPT 등)을 지칭하는 일반적인 용어입니다. 이러한 모델은 학습할 수 있는 파라미터(parameter)의 수가 매우 많으며, 예를 들어 GPT-3는 약 1,750억 개의 파라미터를 가지고 있습니다.
-
-## M
-
-### 마스킹된 언어 모델링 (MLM)
-
-사전 학습 단계 중 하나로, 모델은 일부 토큰이 무작위로 마스킹 된 손상된 문장을 입력받고, 원래의 문장을 예측해야 합니다.
-
-### 멀티모달 (multimodal)
-
-텍스트와 이미지와 같은 다른 형태의 입력을 함께 사용하는 작업입니다.
-
-## N
-
-### 자연어 생성 (NLG)
-
-텍스트를 생성하는 모든 작업을 의미합니다. (예: [Write With Transformers](https://transformer.huggingface.co/), 번역 등).
-
-### 자연어 처리 (NLP)
-
-텍스트를 다루는 작업 전반을 지칭하는 일반적인 용어입니다.
-
-### 자연어 이해 (NLU)
-
-텍스트에 담긴 의미를 이해하는 모든 작업을 포함합니다. (예: 전체 문서 분류, 개별 단어 분류 등).
-
-## P
-
-### 파이프라인 (pipeline)
-
-🤗 Transformers에서 파이프라인은 데이터를 전처리하고 변환한 후, 모델을 통해 예측값을 반환하는 일련의 단계를 순차적으로 수행하는 추상화된 개념입니다. 파이프라인에 포함될 수 있는 단계로는 데이터 전처리, 특징 추출(feature extraction), 정규화(normalization) 등이 있습니다.   
-
-자세한 내용은 [Pipelines for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) 문서를 참고하세요.
-
-### 파이프라인 병렬화 (PP)
-
-모델을 수직 방향(레이어 단위)으로 여러 GPU에 분할하여 병렬로 처리하는 병렬화 기법입니다. 각 GPU는 모델의 하나 또는 여러 개의 레이어만을 담당하며, 전체 파이프라인의 서로 다른 단계를 병렬로 처리하게 됩니다. 또한 각 GPU는 배치(batch)의 일부 작은 조각만 처리합니다. Pipeline Parallel 방식에 대해 더 알아보려면 [이 문서](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism)를 참고하세요.
-
-### 픽셀 값 (pixel values)
-
-이미지를 수치상으로 표현한 텐서로, 모델에 입력으로 전달됩니다. 이 텐서는 이미지 프로세서를 통해 생성되면, 값은 [`batch_size`, `num_channels`, `height`, `width`] 형태의 차원을 가집니다.
-
-### 풀링 (pooling)
-
-행렬의 특정 차원에서 최댓값이나 평균값을 취하여 더 작은 행렬로 줄이는 연산입니다. 풀링 계층은 주로 합성곱 계층 사이에 위치하여 특징 표현을 다운샘플링 하는 데 사용됩니다.
-
-### 포지션 ID (position IDs)
-
-RNN 모델과 달리 트랜스포머는 각 토큰의 위치 정보를 내부적으로 가지고 있지 않습니다. 따라서 모델은 `position_ids`를 사용하여 각 토큰이 시퀀스 내에서 어느 위치에 있는지를 인식합니다. 이 값은 선택적인 파라미터입니다. 모델에 `position_ids`를 전달하지 않으면, 절대 위치 임베딩 방식으로 자동 생성됩니다. 절대 위치 임베딩은 `[0, config.max_position_embeddings - 1]` 범위 내에서 선택됩니다. 일부 모델은 사인파 형태의 위치 임베딩(sinusoidal position embeddings) 또는 상대 위치 임베딩(relative position embeddings)과 같은 다른 유형의 위치 임베딩을 사용하기도 합니다.
-
-### 전처리 (preprocessing)
-
-머신러닝 모델이 쉽게 처리할 수 있도록 가공되지 않은 데이터를 정제하는 작업입니다. 예를 들어, 텍스트는 일반적으로 토큰화(tokenization) 과정을 거칩니다. 다른 입력 유형에 대한 전처리 방식이 궁금하다면 [Preprocess](https://huggingface.co/docs/transformers/preprocessing) 튜토리얼을 참고해 보세요.
-
-### 사전 학습된 모델 (pretrained model)
-
-일부 데이터(예: 위키피디아 전체)로 사전 학습(pretraining)된 모델입니다. 사전 학습은 자기 지도 학습(self-supervised learning)의 목표를 포함하며, 예를 들어 문장을 읽고 다음 단어를 예측하거나 ([causal language modeling](#causal-language-modeling)) 참고, 일부 단어를 마스킹하고 이를 예측하는 방식([masked language modeling](#masked-language-modeling-mlm))이 있습니다.
-
-음성 및 비전 모델은 고유의 사전 학습 목표를 가지고 있습니다. 예를 들어, Wav2Vec2는 음성 표현 중 "진짜"를 "가짜" 중에서 구분하는 대조 학습(contrastive learning) 방식으로 사전 학습된 음성 모델입니다. 반면, BEiT는 이미지 패치 중 일부를 마스킹하고 이를 예측하는 마스킹 이미지 모델링 방식으로 사전 학습된 비전 모델입니다. 이는 마스킹 언어 모델링과 유사한 방식입니다.
-
-## R
-
-### 순환 신경망 (RNN)
-
-텍스트와 같은 시퀀스 데이터를 처리하기 위해 레이어에 반복 구조(루프)를 사용하는 신경망 모델의 한 종류입니다.
-
-### 표현학습 (representation learning)
-
-머신러닝의 하위 분야로, 원시 데이터로부터 의미 있는 표현을 학습하는 데 중점을 둡니다. 대표적인 기법으로는 단어 임베딩, 오토인코더(autoencoder), 생성적 적대 신경망(GAN) 등이 있습니다.
-
-## S
-
-### 샘플링 속도 (sampling rate)
-
-샘플링 속도는 1초에 추출하는 (오디오 신호) 샘플의 개수를 헤르츠(Hz) 단위로 나타낸 측정값입니다. 이는 음성처럼 연속적인 신호를 디지털화하여 이산적인 형태로 만드는 결과입니다.
-
-### 셀프 어텐션 (self-attention)
-
-입력의 각 요소가 다른 어떤 요소에 주목해야 하는지를 스스로 판단하는 메커니즘입니다. 이는 모델이 문장에서 특정 단어만을 보는 것이 아니라, 다른 단어들과의 관계를 고려하여 어떤 정보에 더 집중해야 할지를 학습하게 합니다.
-
-### 자기지도 학습 (self-supervised learning) 
-
-레이블이 없는 데이터로부터 모델이 스스로 학습 목표를 정의하여 학습하는 머신러닝 기법의 한 종류입니다. [비지도 학습](#unsupervised-learning)이나 [지도 학습](#supervised-learning)과 달리, 학습 과정 자체는 감독 방식 되지만, 라벨이 명시적으로 주어지는 것은 아닙니다.
-
-예시로는 [마스크 언어 모델링](#masked-language-modeling-mlm)이 있으며, 이는 문장의 일부 토큰을 제거한 상태로 모델에 입력하고, 모델이 해당 토큰을 예측하도록 학습하는 방식입니다.
-
-### 준지도 학습 (semi-supervised learning)
-
-소량의 라벨이 달린 데이터와 대량의 라벨이 없는 데이터를 함께 사용하여 모델의 정확도를 높이는 머신러닝 훈련 기법의 넓은 범주입니다. 이는 [지도 학습](#supervised-learning)이나 [비지도 학습](#unsupervised-learning)과는 다른 방식입니다.
-
-준지도 학습 기법의 예로는 "자기 학습(self-training)"이 있습니다. 이 방식은 먼저 라벨이 있는 데이터로 모델을 학습시키고, 그 모델을 사용해 라벨이 없는 데이터에 대한 예측을 수행합니다. 모델이 가장 높은 확신을 가지고 예측한 라벨이 없는 데이터 일부를 라벨이 있는 데이터로 추가하고, 이를 통해 모델을 다시 학습시킵니다.
-
-### 시퀀스 투 시퀀스 (seq2seq)
-
-입력으로부터 새로운 시퀀스를 생성하는 모델입니다. 예를 들어 번역 모델이나 요약 모델이 이에 해당하며, 대표적인 예로는 [Bart](model_doc/bart)나[T5](model_doc/t5) 모델이 있습니다.
-
-### 분할 DDP (Sharded DDP)
-
-[ZeRO](#zero-redundancy-optimizer-zero) 개념을 기반으로 다양한 구현에서 사용되는 다른 이름으로 불립니다.
-
-### 스트라이드 (stride)
-
-[convolution](#convolution) 또는 [pooling](#pooling)에서 스트라이드(stride)는 커널이 행렬 위를 이동하는 간격을 의미합니다. 스트라이드가 1이면 커널이 한 픽셀씩 이동하고, 2이면 두 픽셀씩 이동합니다.
-
-### 지도학습 (supervised learning)
-
-정답이 포함된 라벨링된 데이터를 직접 사용하여 모델의 성능을 개선하는 학습 방식입니다. 학습 중인 모델에 데이터를 입력하고, 예측 결과를 정답과 비교하여 오차를 계산합니다. 모델은 이 오차를 기반으로 가중치를 업데이트하며, 이러한 과정을 반복하여 성능을 최적화합니다.
-
-## T
-
-### 텐서 병렬화 (TP)
-
-여러 GPU에서 훈련하기 위한 병렬화 기법으로, 각 텐서를 여러 덩어리(chunk)로 나눕니다. 따라서 전체 텐서가 단일 GPU에 상주하는 대신, 텐서의 각 조각(shard)이 지정된 GPU에 상주하게 됩니다. 이 조각들은 각각 다른 GPU에서 개별적으로 병렬 처리되며, 처리 단계가 끝날 때 결과가 동기화됩니다. 이러한 분할이 수평 방향으로 일어나기 때문에, 이는 때때로 수평적 병렬화라고 불립니다. Tensor Parallelism에 대해 더 알아보려면 [여기](perf_train_gpu_many#tensor-parallelism)를 참고하세요.
-
-### 토큰 (token)
-
-일반적인 단어 단위이지만, 때에 따라 서브 워드(자주 사용되지 않는 단어는 서브 워드로 분리됨)나 문장 부호도 포함될 수 있는 문장의 구성 요소입니다.
-
-### 토큰 타입 ID (token type IDs)
-
-일부 모델은 문장 쌍 분류나 질의 응답 작업을 수행하는 데 사용됩니다.
-
-<Youtube id="0u3ioSwev3s"/>
-
-이러한 작업에서는 두 개의 서로 다른 시퀀스를 하나의 "input_ids" 항목으로 결합해야 하며, 일반적으로 `[CLS]` 분류용 및 `[SEP]` 구분용과 같은 특수 토큰을 사용하여 처리합니다. 예를 들어, BERT 모델은 두 개의 시퀀스를 다음과 같은 방식으로 구성합니다:
-
-```python
->>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
-```
-
-두 개의 시퀀스를 `tokenizer`에 리스트가 아닌 개별 인자로 전달하면, 토크나이저가 자동으로 이러한 문장을 생성해 줍니다. 예시는 다음과 같습니다:
-
-```python
->>> from transformers import BertTokenizer
-
->>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
->>> sequence_a = "HuggingFace is based in NYC"
->>> sequence_b = "Where is HuggingFace based?"
-
->>> encoded_dict = tokenizer(sequence_a, sequence_b)
->>> decoded = tokenizer.decode(encoded_dict["input_ids"])
-```
-
-결과는 아래와 같습니다:
-
-```python
->>> print(decoded)
-[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
-```
-
-이 코드는 일부 모델이 두 개의 시퀀스를 어떻게 구분하는지 이해하는 데 충분합니다. 그러나 BERT와 같은 다른 모델은 토큰 타입 ID(또는 세그먼트 ID)를 추가로 사용합니다. 이 ID는 0과 1로 구성된 이진 마스크로, 두 시퀀스를 구분하는 역할을 합니다.
-
-토크나이저는 이 마스크를 "token_type_id" 항목으로 반환합니다:
-
-```python
->>> encoded_dict["token_type_ids"]
-[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
-```
-
-질문에 사용되는 첫 번째 시퀀스인 "context"는 모든 토큰이 `0`으로 표시됩니다. 반면 두 번째 시퀀스인 "question"은 모든 토큰이 `1`로 표시됩니다.
-
-일부 모델(예: [`XLNetModel`])은 `2`로 표시되는 추가 토큰을 사용하기도 합니다.
-
-### 전이학습 (transfer learning)
-
-사전 학습된(pretrained) 모델을 가져와 특정 작업에 맞는 데이터셋에 대해 추가 학습하는 기술입니다. 모델을 처음부터 학습시키는 대신, 기존 모델이 학습한 지식을 출발점으로 삼아 더욱 빠르게 학습할 수 있습니다. 이를 통해 학습 속도를 높이고 필요한 데이터양도 줄일 수 있습니다.
-
-### 트랜스포머 (transformer)
-
-셀프 어텐션 메커니즘을 기반으로 한 딥러닝 모델 아키텍처입니다.
-
-## U
-
-### 비지도 학습 (unsupervised learning)
-
-정답(레이블)이 포함되지 않은 데이터를 이용해 모델을 학습시키는 방식입니다. 비지도 학습은 데이터 분포의 통계적 특성을 활용해 유용한 패턴을 찾아냅니다.
-
-## Z
-
-### Zero Redundancy Optimizer (ZeRO)
-
-[TensorParallel](#tensor-parallelism-tp)과 유사하게 텐서를 샤딩(sharding)하는 병렬 처리 기법이지만, 순전파(forward)나 역전파(backward) 계산 시점에 전체 텐서를 다시 복원한다는 점에서 차이가 있습니다. 따라서 모델 자체를 수정할 필요가 없습니다. 이 방법은 GPU 메모리가 부족할 경우 이를 보완하기 위한 다양한 오프로딩 (offloading) 기법도 지원합니다. 
-ZeRO에 대해 더 알아보려면 [이 문서](perf_train_gpu_many#zero-data-parallelism)를 참고하세요.
--- a/docs/source/ko/testing.md
+++ b/docs/source/ko/testing.md
@ -473,6 +473,13 @@ GPU 요구 사항을 표로 정리하면 아래와 같습니디ㅏ:
 def test_example_with_multi_gpu():
 ```

+`tensorflow`가 필요한 경우 `require_tf` 데코레이터를 사용합니다. 예를 들어 다음과 같습니다:
+
+```python no-style
+@require_tf
+def test_tf_thing_with_tensorflow():
+```
+
 이러한 데코레이터는 중첩될 수 있습니다.
 예를 들어, 느린 테스트로 진행되고 pytorch에서 적어도 하나의 GPU가 필요한 경우 다음과 같이 설정할 수 있습니다:

--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.54.0.dev0")
+check_min_version("4.53.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@ -59,7 +59,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
-check_min_version("4.54.0.dev0")
+check_min_version("4.53.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.54.0.dev0")
+check_min_version("4.53.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.54.0.dev0")
+check_min_version("4.53.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

--- a/examples/modular-transformers/configuration_my_new_model.py
+++ b/examples/modular-transformers/configuration_my_new_model.py
@ -125,6 +125,8 @@ class MyNewModelConfig(PretrainedConfig):
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
+        new_param (`int`, *optional*, defaults to `False`):
+            A fun new parameter
    """

    model_type = "my_new_model"
--- a/examples/modular-transformers/modeling_dummy.py
+++ b/examples/modular-transformers/modeling_dummy.py
@ -0,0 +1,446 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from examples/modular-transformers/modular_dummy.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dummy.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from .configuration_dummy import DummyConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DummyRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DummyRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class DummyRotaryEmbedding(nn.Module):
+    def __init__(self, config: DummyConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class DummyMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 4]
+    x2 = x[..., x.shape[-1] // 4 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class DummyAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DummyConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DummyDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DummyConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = DummyAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = DummyMLP(config)
+        self.input_layernorm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class DummyPreTrainedModel(PreTrainedModel):
+    config_class = DummyConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DummyDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, DummyRMSNorm):
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class DummyModel(DummyPreTrainedModel):
+    def __init__(self, config: DummyConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [DummyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = DummyRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **flash_attn_kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
--- a/examples/modular-transformers/modeling_global_indexing.py
+++ b/examples/modular-transformers/modeling_global_indexing.py
@ -1,169 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from examples/modular-transformers/modular_global_indexing.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_global_indexing.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional
-
-import torch
-from torch import nn
-
-from transformers.modeling_utils import AttentionInterface
-
-from ...cache_utils import Cache
-from ...processing_utils import Unpack
-from ...utils import TransformersKwargs
-from .configuration_global_indexing import GlobalIndexingConfig
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-        attn_weights = attn_weights + causal_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-def custom_flex(x, **kwargs):
-    """Dummy function."""
-    return x
-
-
-ALL_ATTENTION_FUNCTIONS = AttentionInterface()
-# This indexing statement and associated function should be exported correctly!
-ALL_ATTENTION_FUNCTIONS["flex_attention"] = custom_flex
-
-
-class GlobalIndexingAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: GlobalIndexingConfig, layer_idx: int):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
-        self.scaling = self.head_dim**-0.5
-        self.attention_dropout = config.attention_dropout
-        self.is_causal = True
-
-        self.q_proj = nn.Linear(
-            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.k_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.v_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.o_proj = nn.Linear(
-            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: Optional[torch.Tensor],
-        past_key_value: Optional[Cache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-
-        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-
-        cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-        if past_key_value is not None:
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            scaling=self.scaling,
-            **kwargs,
-        )
-
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
--- a/examples/modular-transformers/modeling_multimodal1.py
+++ b/examples/modular-transformers/modeling_multimodal1.py
@ -0,0 +1,446 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from examples/modular-transformers/modular_multimodal1.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_multimodal1.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, logging
+from .configuration_multimodal1 import Multimodal1TextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class Multimodal1TextRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Multimodal1TextRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class Multimodal1TextRotaryEmbedding(nn.Module):
+    def __init__(self, config: Multimodal1TextConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Multimodal1TextMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Multimodal1TextAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Multimodal1TextConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Multimodal1TextConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = Multimodal1TextAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = Multimodal1TextMLP(config)
+        self.input_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class Multimodal1TextPreTrainedModel(PreTrainedModel):
+    config_class = Multimodal1TextConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Multimodal1TextDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _supports_cache_class = True
+    _supports_quantized_cache = True
+    _supports_static_cache = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, Multimodal1TextRMSNorm):
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
+    def __init__(self, config: Multimodal1TextConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Multimodal1TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = Multimodal1TextRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
+    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
+        if not isinstance(past_key_values, (type(None), Cache)):
+            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache()
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+        )
+
+        hidden_states = inputs_embeds
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **flash_attn_kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
--- a/examples/modular-transformers/modeling_multimodal2.py
+++ b/examples/modular-transformers/modeling_multimodal2.py
@ -289,6 +289,7 @@ class Multimodal2VisionEncoder(nn.Module):
        self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

+    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
@ -454,6 +455,7 @@ class Multimodal2VisionTransformer(nn.Module):
        self.encoder = Multimodal2VisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@ -12,13 +12,13 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
-from ...utils.generic import check_model_inputs
+from ...utils import auto_docstring, can_return_tuple, logging
 from .configuration_my_new_model2 import MyNewModel2Config


@ -149,7 +149,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    **kwargs,
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -200,8 +200,8 @@ class MyNewModel2Attention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

@ -254,19 +254,22 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
+
        # Self Attention
-        hidden_states, _ = self.self_attn(
+        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
+            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
@ -279,7 +282,12 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
-        return hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs


@auto_docstring
@ -296,10 +304,6 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": MyNewModel2DecoderLayer,
-        "attentions": MyNewModel2Attention,
-    }

    def _init_weights(self, module):
        std = self.config.initializer_range
@ -339,7 +343,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

-    @check_model_inputs
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
@ -349,12 +353,26 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

@ -376,7 +394,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
-            position_ids=position_ids,
        )

        # embed positions
@ -391,21 +408,42 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            hidden_states = decoder_layer(
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
+                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
        )


@ -450,7 +488,8 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@ -466,7 +505,8 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@ -118,8 +118,6 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
 )
 class NewTaskModelModel(NewTaskModelPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
-    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
-    accepts_loss_kwargs = False

    def __init__(self, config: NewTaskModelConfig):
        super().__init__(config)
@ -315,11 +313,9 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
-                special_image_mask = special_image_mask.all(-1)
            else:
-                special_image_mask = input_ids == self.config.image_token_id
-
-            special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
+                special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
@ -437,6 +433,32 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
        num_logits_to_keep: int = 0,
    ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, NewTaskModelForNewTask
+
+        >>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```
        Returns:
        """
        vlm_outputs = super().forward(
--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@ -14,12 +14,12 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...integrations import use_kernel_forward_from_hub
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring
-from ...utils.generic import check_model_inputs
+from ...utils import auto_docstring, can_return_tuple
 from .configuration_super import SuperConfig


@ -148,7 +148,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    **kwargs,
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -199,8 +199,8 @@ class SuperAttention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

@ -253,19 +253,22 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
+
        # Self Attention
-        hidden_states, _ = self.self_attn(
+        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
+            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
@ -278,7 +281,12 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
-        return hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs


@auto_docstring
@ -295,10 +303,6 @@ class SuperPreTrainedModel(PreTrainedModel):
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": SuperDecoderLayer,
-        "attentions": SuperAttention,
-    }

    def _init_weights(self, module):
        std = self.config.initializer_range
@ -338,7 +342,7 @@ class SuperModel(SuperPreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

-    @check_model_inputs
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
--- a/examples/modular-transformers/modeling_switch_function.py
+++ b/examples/modular-transformers/modeling_switch_function.py
@ -11,9 +11,9 @@ import torch
 from torch import nn

 from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs
 from .configuration_switch_function import SwitchFunctionConfig


@ -72,7 +72,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    **kwargs,
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -123,8 +123,8 @@ class SwitchFunctionAttention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> tuple[torch.Tensor, torch.Tensor]:
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
ydshieh	914b2a9a23	fix	2025-06-25 17:28:16 +02:00
ydshieh	e36afb4399	fix	2025-06-25 17:06:35 +02:00
ydshieh	cc40cac02e	try 1	2025-06-25 17:04:02 +02:00