set seed

revert unrelated
merge
2025-11-12 01:04:36 +08:00 · 2025-07-04 14:24:19 +02:00 · 2025-07-04 14:23:48 +02:00 · 2025-07-04 14:22:16 +02:00 · 2025-07-04 13:35:53 +02:00 · 2025-07-04 12:48:10 +02:00
444 changed files with 14627 additions and 7724 deletions
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,7 +41,7 @@ jobs:
  check_new_failures:
    name: " "
    runs-on:
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g5-4xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -0,0 +1,157 @@
+name: Get PR commit SHA
+on:
+  workflow_call:
+    inputs:
+      pr_number:
+        required: true
+        type: string
+    outputs:
+      PR_HEAD_REPO_FULL_NAME:
+        description: "The full name of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
+      PR_BASE_REPO_FULL_NAME:
+        description: "The full name of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
+      PR_HEAD_REPO_OWNER:
+        description: "The owner of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
+      PR_BASE_REPO_OWNER:
+        description: "The owner of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
+      PR_HEAD_REPO_NAME:
+        description: "The name of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
+      PR_BASE_REPO_NAME:
+        description: "The name of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
+      PR_HEAD_REF:
+        description: "The branch name of the pull request in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
+      PR_BASE_REF:
+        description: "The branch name in the base repository (to merge into)"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
+      PR_HEAD_SHA:
+        description: "The head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
+      PR_BASE_SHA:
+        description: "The head sha of the target branch in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
+      PR_MERGE_COMMIT_SHA:
+        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
+      PR_HEAD_COMMIT_DATE:
+        description: "The date of the head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
+      PR_MERGE_COMMIT_DATE:
+        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_HEAD_COMMIT_TIMESTAMP:
+        description: "The timestamp of the head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
+      PR_MERGE_COMMIT_TIMESTAMP:
+        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+      PR:
+        description: "The PR"
+        value: ${{ jobs.get-pr-info.outputs.PR }}
+      PR_FILES:
+        description: "The files touched in the PR"
+        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
+
+
+jobs:
+  get-pr-info:
+    runs-on: ubuntu-22.04
+    name: Get PR commit SHA better
+    outputs:
+      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
+      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
+      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
+      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
+      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
+      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
+      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
+      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
+      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
+      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
+      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
+      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
+      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
+      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
+      PR: ${{ steps.pr_info.outputs.pr }}
+      PR_FILES: ${{ steps.pr_info.outputs.files }}
+    if: ${{ inputs.pr_number != '' }}
+    steps:
+      # Query the REST API for the pull request, its head and merge commits and
+      # its changed files, then publish the individual fields as step outputs.
+      - name: Extract PR details
+        id: pr_info
+        uses: actions/github-script@v7
+        with:
+          script: |
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: ${{ inputs.pr_number }}
+            });
+
+            // Resolve the head commit by SHA rather than by branch name so the
+            // reported date always belongs to the commit exposed as `head_sha`,
+            // even if the branch receives a new push between the two API calls.
+            const { data: head_commit } = await github.rest.repos.getCommit({
+              owner: pr.head.repo.owner.login,
+              repo: pr.head.repo.name,
+              ref: pr.head.sha
+            });
+
+            // Merge commit of the pull request, looked up in the base repository.
+            const { data: merge_commit } = await github.rest.repos.getCommit({
+              owner: pr.base.repo.owner.login,
+              repo: pr.base.repo.name,
+              ref: pr.merge_commit_sha,
+            });
+
+            // NOTE(review): this call is not paginated, so only the first page of
+            // changed files is returned - confirm that is acceptable for callers.
+            const { data: files } = await github.rest.pulls.listFiles({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: ${{ inputs.pr_number }}
+            });
+
+            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
+            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
+            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
+            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
+            core.setOutput('head_repo_name', pr.head.repo.name);
+            core.setOutput('base_repo_name', pr.base.repo.name);
+            core.setOutput('head_ref', pr.head.ref);
+            core.setOutput('base_ref', pr.base.ref);
+            core.setOutput('head_sha', pr.head.sha);
+            core.setOutput('base_sha', pr.base.sha);
+            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
+            core.setOutput('pr', pr);
+
+            core.setOutput('head_commit_date', head_commit.commit.committer.date);
+            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
+
+            core.setOutput('files', files);
+
+            // Log both commits to make the exported dates traceable in the run log.
+            console.log('PR head commit:', {
+              head_commit: head_commit,
+              commit: head_commit.commit,
+              date: head_commit.commit.committer.date
+            });
+
+            console.log('PR merge commit:', {
+              merge_commit: merge_commit,
+              commit: merge_commit.commit,
+              date: merge_commit.commit.committer.date
+            });
+
+
+      # Convert the commit dates produced by the `pr_info` step into epoch
+      # seconds (`date +%s`) and publish them as step outputs.
+      - name: Convert dates to timestamps
+        id: get_timestamps
+        env:
+          HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
+          MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
+        run: |
+          # The dates arrive through the environment and every expansion is
+          # quoted, so the values are never re-parsed as shell syntax.
+          echo "$HEAD_COMMIT_DATE"
+          echo "$MERGE_COMMIT_DATE"
+          head_commit_timestamp=$(date -d "$HEAD_COMMIT_DATE" +%s)
+          merge_commit_timestamp=$(date -d "$MERGE_COMMIT_DATE" +%s)
+          echo "$head_commit_timestamp"
+          echo "$merge_commit_timestamp"
+          echo "head_commit_timestamp=$head_commit_timestamp" >> "$GITHUB_OUTPUT"
+          echo "merge_commit_timestamp=$merge_commit_timestamp" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -0,0 +1,36 @@
+name: Get PR number
+on:
+  workflow_call:
+    outputs:
+      PR_NUMBER:
+        description: "The extracted PR number"
+        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
+
+jobs:
+  get-pr-number:
+    name: Get PR number
+    runs-on: ubuntu-22.04
+    outputs:
+      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
+    steps:
+      # Pick the PR number from whichever event payload field is populated and
+      # store it in the job environment; it stays empty when none applies.
+      - name: Get PR number
+        shell: bash
+        run: |
+          pr_number=""
+          if [[ -n "${{ github.event.issue.number }}" && -n "${{ github.event.issue.pull_request }}" ]]; then
+            # Issue payload whose issue is a pull request.
+            pr_number="${{ github.event.issue.number }}"
+          elif [[ -n "${{ github.event.pull_request.number }}" ]]; then
+            pr_number="${{ github.event.pull_request.number }}"
+          elif [[ -n "${{ github.event.pull_request }}" ]]; then
+            pr_number="${{ github.event.number }}"
+          fi
+          echo "PR_NUMBER=${pr_number}" >> $GITHUB_ENV
+
+      # Print the resolved value so it is visible in the run log.
+      - name: Check PR number
+        shell: bash
+        run: |
+          echo "${{ env.PR_NUMBER }}"
+
+      # Re-export the environment value as a step output for the job output.
+      - name: Set PR number
+        id: set_pr_number
+        run: |
+          echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -107,9 +107,9 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ inputs.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ inputs.machine_type }}
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@ -0,0 +1,163 @@
+name: PR slow CI
+on:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+jobs:
+  get-pr-number:
+    name: Get PR number
+    uses: ./.github/workflows/get-pr-number.yml
+
+  get-pr-info:
+    name: Get PR commit SHA
+    needs: get-pr-number
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
+    uses: ./.github/workflows/get-pr-info.yml
+    with:
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+
+  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
+  # Guard job: aborts when the PR's merge commit is not older than the comment
+  # that triggered the run.
+  # NOTE(review): the job id keeps its original spelling ("verity") in case it is
+  # referenced elsewhere; only the display name is corrected.
+  verity_pr_commit:
+    name: Verify PR commit corresponds to a specific event by comparing timestamps
+    if: ${{ github.event.comment.created_at != '' }}
+    runs-on: ubuntu-22.04
+    needs: get-pr-info
+    env:
+      COMMENT_DATE: ${{ github.event.comment.created_at }}
+      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+    steps:
+      - run: |
+          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
+          echo "COMMENT_DATE: $COMMENT_DATE"
+          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
+          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
+          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
+          # Fail closed: with an empty operand `[ ... -le ... ]` errors out, which
+          # `if` treats as false, so the check below would silently pass.
+          if [ -z "$COMMENT_TIMESTAMP" ] || [ -z "$PR_MERGE_COMMIT_TIMESTAMP" ]; then
+            echo "Missing timestamp, cannot verify the pull request commit! Abort!"
+            exit 1
+          fi
+          if [ "$COMMENT_TIMESTAMP" -le "$PR_MERGE_COMMIT_TIMESTAMP" ]; then
+            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!"
+            exit 1
+          fi
+
+
+  get-jobs:
+    name: Get test files to run
+    runs-on: ubuntu-22.04
+    needs: [get-pr-number, get-pr-info]
+    outputs:
+      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
+    steps:
+      # Fetch the listings of `tests`, `tests/models` and `tests/quantization`
+      # from the PR head repository at the PR head SHA and expose them as outputs.
+      - name: Get repository content
+        id: repo_content
+        uses: actions/github-script@v7
+        with:
+          script: |
+            // Fetch the contents of `path` in the PR head repository at the head SHA.
+            const getContentAt = async (path) => {
+              const { data } = await github.rest.repos.getContent({
+                owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
+                repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
+                path,
+                ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
+              });
+              return data;
+            };
+
+            // Run every request before publishing anything, so a failed request
+            // leaves no partial outputs behind.
+            const tests_dir = await getContentAt('tests');
+            const tests_models_dir = await getContentAt('tests/models');
+            const tests_quantization_dir = await getContentAt('tests/quantization');
+
+            core.setOutput('tests_dir', tests_dir);
+            core.setOutput('tests_models_dir', tests_models_dir);
+            core.setOutput('tests_quantization_dir', tests_quantization_dir);
+
+      # This checkout to the main branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      # The four steps below dump upstream outputs into text files in the
+      # workspace. The quoted 'EOF' delimiter disables shell expansion inside
+      # the here-document, so the content is written verbatim.
+      # NOTE(review): presumably these files are read by
+      # utils/get_pr_run_slow_jobs.py - confirm against that script.
+      - name: Write pr_files file
+        run: |
+          cat > pr_files.txt << 'EOF'
+          ${{ needs.get-pr-info.outputs.PR_FILES }}
+          EOF
+
+      - name: Write tests_dir file
+        run: |
+          cat > tests_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_dir }}
+          EOF
+
+      - name: Write tests_models_dir file
+        run: |
+          cat > tests_models_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_models_dir }}
+          EOF
+
+      - name: Write tests_quantization_dir file
+        run: |
+          cat > tests_quantization_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_quantization_dir }}
+          EOF
+
+      # Run the selection script, keeping its full output in output.txt; only
+      # the last printed line is published as the `jobs_to_run` step output.
+      - name: Run script to get jobs to run
+        id: get_jobs
+        run: |
+          python utils/get_pr_run_slow_jobs.py | tee output.txt
+          echo "jobs_to_run: $(tail -n 1 output.txt)"
+          echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
+
+  # Replaces the bot's previous "suggested jobs" comment on the PR with a fresh
+  # one; skipped when the `get-jobs` job produced no job list.
+  send_comment:
+    name: Send a comment to suggest jobs to run
+    if: ${{ needs.get-jobs.outputs.jobs != '' }}
+    needs: [get-pr-number, get-jobs]
+    permissions:
+      pull-requests: write
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Delete existing comment and send new one
+        uses: actions/github-script@v7
+        env:
+          BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
+        with:
+          script: |
+            const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
+            const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
+
+            // Get all comments on the PR. The listing is paginated by the API, so
+            // walk every page; otherwise an earlier bot comment that is not on the
+            // first page is never found and duplicates accumulate.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              per_page: 100
+            });
+
+            // Find existing comment(s) from the bot that start with our prefix
+            const existingComments = comments.filter(comment =>
+              comment.user.login === 'github-actions[bot]' &&
+              comment.body.startsWith(commentPrefix)
+            );
+
+            // Delete existing comment(s)
+            for (const comment of existingComments) {
+              console.log(`Deleting existing comment #${comment.id}`);
+              await github.rest.issues.deleteComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: comment.id
+              });
+            }
+
+            // Create new comment; BODY is read from the environment instead of
+            // being interpolated into the script text.
+            const newBody = `${commentPrefix}${process.env.BODY}`;
+            await github.rest.issues.createComment({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber,
+              body: newBody
+            });
+
+            console.log('✅ Comment updated successfully');
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,9 +239,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,9 +338,9 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -31,7 +31,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -131,7 +131,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -169,9 +169,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -244,7 +244,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -282,9 +282,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -357,7 +357,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-2xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -395,9 +395,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -467,7 +467,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -505,9 +505,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -84,8 +84,6 @@ jobs:
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-      report_name_prefix: run_models_gpu
-
    secrets: inherit

  run_trainer_and_fsdp_gpu:
@ -104,11 +102,10 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu
-
    secrets: inherit

-  run_pipelines_gpu:
-    if: ${{ inputs.job == 'run_pipelines_gpu' }}
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
@ -161,20 +158,20 @@ jobs:

      - name: Run all pipeline tests on Intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
+          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
@ -248,8 +245,8 @@ jobs:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports

-  run_deepspeed_gpu:
-    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
+  run_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
@ -305,20 +302,20 @@ jobs:

      - name: Run all deepspeed tests on intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  send_results:
    name: Slack Report
@ -327,8 +324,8 @@ jobs:
        setup,
        run_models_gpu,
        run_examples_gpu,
-        run_pipelines_gpu,
-        run_deepspeed_gpu,
+        run_torch_cuda_extensions_gpu,
+        run_pipelines_torch_gpu,
        run_trainer_and_fsdp_gpu,
      ]
    if: ${{ always() }}
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@ -23,7 +23,7 @@ jobs:
    name: Pipeline CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_pipelines_gpu
+      job: run_pipelines_torch_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
@ -47,7 +47,7 @@ jobs:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_deepspeed_gpu
+      job: run_torch_cuda_extensions_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -50,7 +50,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -128,13 +128,14 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
@ -145,7 +146,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -179,9 +180,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -213,7 +214,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -247,9 +248,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -282,7 +283,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -344,9 +345,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
@ -381,7 +382,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -424,9 +425,9 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
            machine_type=single-gpu
-          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
            machine_type=multi-gpu
          else
            machine_type=${{ matrix.machine_type }}
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -288,7 +288,7 @@ Keywords: Music understanding, Music generation

 ## [dalle-flow](https://github.com/jina-ai/dalle-flow)

-DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. Itt leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
+DALL·E Flow is an interactive workflow for generating high-definition images from a text prompt. It leverages DALL·E-Mega, GLID-3 XL, and Stable Diffusion to generate image candidates, and then calls CLIP-as-service to rank the candidates w.r.t. the prompt.
 The preferred candidate is fed to GLID-3 XL for diffusion, which often enriches the texture and background. Finally, the candidate is upscaled to 1024x1024 via SwinIR.

 Keywords: High-definition image generation, Stable Diffusion, DALL-E Mega, GLID-3 XL, CLIP, SwinIR
@ -526,7 +526,7 @@ Keywords: Model deployment, CLoud, Mobile, Edge

 ## [underthesea](https://github.com/undertheseanlp/underthesea)

-[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules data sets and tutorials supporting research and development in Vietnamese Natural Language Processing. We provides extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.
+[underthesea](https://github.com/undertheseanlp/underthesea) is a Vietnamese NLP toolkit. Underthesea is a suite of open source Python modules, data sets, and tutorials supporting research and development in Vietnamese Natural Language Processing. We provide an extremely easy API to quickly apply pretrained NLP models to your Vietnamese text, such as word segmentation, part-of-speech tagging (PoS), named entity recognition (NER), text classification and dependency parsing.

 Keywords: Vietnamese, NLP

--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -93,6 +93,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
 # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
 RUN python3 -m pip uninstall -y kernels

+# Uninstall flash-attn (installed by autoawq) because it causes issues; see https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
+RUN python3 -m pip uninstall -y flash-attn
+
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -17,12 +17,12 @@
      title: Customizing model components
    - local: model_sharing
      title: Sharing
-    - local: add_new_model
-      title: Adding a new model to Transformers
    - local: modular_transformers
-      title: Modular Transformers
+      title: Contributing a new model to Transformers
+    - local: add_new_model
+      title: Legacy model contribution
    - local: auto_docstring
-      title: Document your models
+      title: Documenting a model
    - local: attention_interface
      title: Customizing attention function
    title: Models
@ -97,11 +97,9 @@
    - local: perf_infer_gpu_one
      title: GPU
    - local: perf_infer_gpu_multi
-      title: Distributed GPU inference
+      title: Distributed inference
    - local: perf_infer_cpu
      title: CPU
-    - local: tf_xla
-      title: XLA
    title: Optimization
  - local: agents
    title: Agents
@ -141,8 +139,6 @@
      title: GPU
    - local: perf_train_cpu
      title: CPU
-    - local: perf_train_tpu_tf
-      title: TPU
    - local: perf_train_special
      title: Apple Silicon
    - local: perf_train_gaudi
@ -737,6 +733,8 @@
        title: EfficientFormer
      - local: model_doc/efficientnet
        title: EfficientNet
+      - local: model_doc/eomt
+        title: EoMT
      - local: model_doc/focalnet
        title: FocalNet
      - local: model_doc/glpn
@ -1142,4 +1140,3 @@
      title: Environment Variables
    title: Reference
  title: API
-
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -13,7 +13,7 @@ rendered properly in your Markdown viewer.

 -->

-# Adding a new model to Transformers
+# Legacy model contribution

 > [!TIP]
 > Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -14,5 +14,9 @@ rendered properly in your Markdown viewer.

 -->

+# Agents
+
+(deprecated)
+
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -14,43 +14,26 @@ rendered properly in your Markdown viewer.

 -->

-# Utilizing the @auto_docstring Decorator
+# Documenting a model

-The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
+The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings and can focus only on documenting new arguments.

---
+This guide describes how to use the `@auto_docstring` decorator and how it works.

-## 📜 How it Works
+## @auto_docstring

-The `@auto_docstring` decorator constructs docstrings by:
-
-1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
-2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
-3.  **Overriding or Adding Arguments Descriptions:**
-    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
-    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-4.  **Adding Classes and Functions Introduction:**
-    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
-    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
-5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
-6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
-7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
-8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
-
-
---
-
-## 🚀 How to Use @auto_docstring
-
-### 1. Importing the Decorator
-Import the decorator into your modeling file:
+Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).

 ```python
 from ...utils import auto_docstring
 ```

-### 2. Applying to Classes
-Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
+Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
+
+<hfoptions id="type">
+<hfoption id="classes">
+
+Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.

 ```python
 from transformers.modeling_utils import PreTrainedModel
@ -73,9 +56,7 @@ class MyAwesomeModel(PreTrainedModel):
    # ... other methods
 ```

-#### Advanced Class Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control:
+Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the class and the `custom_args` parameter to describe the arguments.

 ```python
@auto_docstring(
@ -93,7 +74,7 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-Or:
+You can also choose to only use `custom_intro` and define the custom arguments directly in the class.

 ```python
@auto_docstring(
@ -111,8 +92,10 @@ class MySpecialModel(PreTrainedModel):
        # ...
 ```

-### 3. Applying to Functions (e.g., `forward` method)
-Apply the decorator above method definitions, such as the `forward` method.
+</hfoption>
+<hfoption id="functions">
+
+Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.

 ```python
    @auto_docstring
@ -131,9 +114,10 @@ Apply the decorator above method definitions, such as the `forward` method.
        # ...
 ```

-#### Advanced Function Decoration:
+Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to describe the function and the `custom_args` parameter to describe the arguments.
+
+The `Returns` and `Examples` parts of the docstring can also be manually specified.

-Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:

 ```python
 MODEL_COMMON_CUSTOM_ARGS = r"""
@ -180,100 +164,117 @@ class MyModel(PreTrainedModel):
        # ...
 ```

---
+</hfoption>
+</hfoptions>

-### ✍️ Documenting Arguments: Approach & Priority
+## Documenting arguments

-1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
-    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
+There are some rules for documenting different types of arguments and they're listed below.
+
+- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape are the same as an argument in `args_doc.py`.
+
+    If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
+
+
+- New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
+
+    ```py
+    argument_name (`type`, *optional*, defaults to `X`):
+        Description of the argument.
+        Explain its purpose, expected shape/type if complex, and default behavior.
+        This can span multiple lines.
+    ```

-2.  **New or Custom Arguments:**
-    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
-    * **Format:**
-        ```
-        argument_name (`type`, *optional*, defaults to `X`):
-            Description of the argument.
-            Explain its purpose, expected shape/type if complex, and default behavior.
-            This can span multiple lines.
-        ```
    * Include `type` in backticks.
-    * Add "*optional*" if the argument is not required (has a default value).
-    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
+    * Add *optional* if the argument is not required or has a default value.
+    * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.

-3.  **Overriding Standard Arguments:**
-    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
-    * The `labels` argument is often customized per model and typically requires a specific docstring.
+    These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.

-4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
-    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
+    ```py
+    class MyModel(PreTrainedModel):
+    # ...
+    @auto_docstring(
+        custom_intro="""
+        This is a custom introduction for the function.
+        """,
+        custom_args=r"""
+        common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
+            Description of common_arg_1
+        """
+    )
+    ```

---
+## Checking the docstrings

-### Usage with [modular files](./modular_transformers)
+Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.

-When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
+* Ensures `@auto_docstring` is applied to relevant model classes and public methods.
+* Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments and lack a local description are flagged.
+* Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
+* Ensures docstrings are formatted according to the expected docstring style.

- **For standalone models in modular files:**
-  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
-
- **For models inheriting from other library models:**
-  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
-  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
-
-  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
-
-
-**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
-
---
-
-## ✅ Checking Your Docstrings with `check_auto_docstrings`
-
-The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
-
-#### What it Checks:
-
-* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
-* **Argument Completeness & Consistency:**
-    * Flags arguments in the signature that are not known standard arguments and lack a local description.
-    * Ensures documented arguments exist in the signature. (TODO)
-    * Verifies that types and default values in the docstring match the signature. (TODO)
-* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
-* **Formatting:** Adherence to the expected docstring style.
-
-#### Running the Check Locally:
-
-Run this check locally before committing. The common command is:
+You can run this check locally - before committing - by running the following command.

 ```bash
 make fix-copies
 ```

-Alternatively, to only perform docstrings and auto-docstring checks, you can use:
+`make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.

 ```bash
 python utils/check_docstrings.py # to only check files included in the diff without fixing them
-# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
-# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
+# python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
+# python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
 ```

-#### Workflow with the Checker:
+## modular_model.py files

-1.  Add `@auto_docstring(...)` to the class or method.
-2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
-3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
-    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
-4.  Manually edit these placeholders with accurate types and descriptions.
-5.  Re-run the check to ensure all issues are resolved.
+When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.

---
+- For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
+- For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.

-## 🔑 Key Takeaways & Best Practices
+    If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.

-* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
-* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
-* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
+> [!WARNING]
+> When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
+
+## How it works
+
+The `@auto_docstring` decorator automatically generates docstrings by:
+
+1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
+2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`] and [`ImageProcessorArgs`] in the `args_doc.py` file.
+3. Adding argument descriptions in one of two ways as shown below.
+
+    | method | description | usage |
+    |---|---|---|
+    | `r""" """` | add custom docstring content directly below a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
+    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
+
+4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or that belong to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.
+
+    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
+
+5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
+
+6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
+
+7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
+
+    For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specify a custom `Returns` field in a function's docstring.
+
+8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the `TypedDict` and adds each parameter to the function's docstring.
+
+    Currently only supported for [`FastImageProcessorKwargs`].
+
+## Best practices
+
+Follow the best practices below to help maintain consistent and informative documentation for Transformers!
+
+* Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
+* For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
+* Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
 * Document new or custom arguments clearly.
 * Run `check_docstrings` locally and iteratively.
-
-By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/chat_templating_multimodal.md
+++ b/docs/source/en/chat_templating_multimodal.md
@ -56,7 +56,7 @@ Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models,
 import torch
 from transformers import pipeline

-pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda", torch_dtype=torch.float16)
+pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
 pipeline(text=messages, max_new_tokens=50, return_full_text=False)
 [{'input_text': [{'role': 'system',
    'content': [{'type': 'text',
@ -175,7 +175,7 @@ processed_chat = processor.apply_chat_template(
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
-    video_fps=32,
+    video_fps=16,
    video_load_backend="decord",
 )
 print(processed_chat.keys())
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,7 +25,7 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers CLI
+## chat CLI

 After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

@ -49,7 +49,8 @@ For a full list of options, run the command below.
 transformers chat -h
 ```

-The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
+The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).
+

 ## TextGenerationPipeline

--- a/docs/source/en/feature_extractors.md
+++ b/docs/source/en/feature_extractors.md
@ -26,6 +26,7 @@ Pass the audio signal, typically stored in `array`, to the feature extractor and
 from transformers import AutoFeatureExtractor

 feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
 processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
 processed_sample
 {'input_values': [array([ 9.4472744e-05,  3.0777880e-03, -2.8888427e-03, ...,
--- a/docs/source/en/model_doc/bigbird_pegasus.md
+++ b/docs/source/en/model_doc/bigbird_pegasus.md
@ -14,59 +14,123 @@ rendered properly in your Markdown viewer.

 -->

-# BigBirdPegasus
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# BigBirdPegasus

-The BigBird model was proposed in [Big Bird: Transformers for Longer Sequences](https://huggingface.co/papers/2007.14062) by
-Zaheer, Manzil and Guruganesh, Guru and Dubey, Kumar Avinava and Ainslie, Joshua and Alberti, Chris and Ontanon,
-Santiago and Pham, Philip and Ravula, Anirudh and Wang, Qifan and Yang, Li and others. BigBird, is a sparse-attention
-based transformer which extends Transformer based models, such as BERT to much longer sequences. In addition to sparse
-attention, BigBird also applies global attention as well as random attention to the input sequence. Theoretically, it
-has been shown that applying sparse, global, and random attention approximates full attention, while being
-computationally much more efficient for longer sequences. As a consequence of the capability to handle longer context,
-BigBird has shown improved performance on various long document NLP tasks, such as question answering and
-summarization, compared to BERT or RoBERTa.
+[BigBirdPegasus](https://huggingface.co/papers/2007.14062) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [BigBird](./big_bird) architecture with an additional pretraining objective borrowed from [Pegasus](./pegasus) called gap sequence generation (GSG). Whole sentences are masked and the model has to fill in the gaps in the document. BigBirdPegasus's ability to keep track of long contexts makes it effective at summarizing lengthy inputs, surpassing the performance of base Pegasus models.

-The abstract from the paper is the following:
+You can find all the original BigBirdPegasus checkpoints under the [Google](https://huggingface.co/google/models?search=bigbird-pegasus) organization.

-*Transformers-based models, such as BERT, have been one of the most successful deep learning models for NLP.
-Unfortunately, one of their core limitations is the quadratic dependency (mainly in terms of memory) on the sequence
-length due to their full attention mechanism. To remedy this, we propose, BigBird, a sparse attention mechanism that
-reduces this quadratic dependency to linear. We show that BigBird is a universal approximator of sequence functions and
-is Turing complete, thereby preserving these properties of the quadratic, full attention model. Along the way, our
-theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire
-sequence as part of the sparse attention mechanism. The proposed sparse attention can handle sequences of length up to
-8x of what was previously possible using similar hardware. As a consequence of the capability to handle longer context,
-BigBird drastically improves performance on various NLP tasks such as question answering and summarization. We also
-propose novel applications to genomics data.*
+> [!TIP]
+> This model was contributed by [vasudevgupta](https://huggingface.co/vasudevgupta).
+>
+> Click on the BigBirdPegasus models in the right sidebar for more examples of how to apply BigBirdPegasus to different language tasks.

-The original code can be found [here](https://github.com/google-research/bigbird).
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- For an in-detail explanation on how BigBird's attention works, see [this blog post](https://huggingface.co/blog/big-bird).
- BigBird comes with 2 implementations: **original_full** & **block_sparse**. For the sequence length < 1024, using
-  **original_full** is advised as there is no benefit in using **block_sparse** attention.
- The code currently uses window size of 3 blocks and 2 global blocks.
- Sequence length must be divisible by block size.
- Current implementation supports only **ITC**.
- Current implementation doesn't support **num_random_blocks = 0**.
- BigBirdPegasus uses the [PegasusTokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/pegasus/tokenization_pegasus.py).
- BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
-  the left.
+```py
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="summarization",
+    model="google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.float32,
+    device=0
+)
+pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
+```
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/bigbird-pegasus-large-arxiv --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/bigbird-pegasus-large-arxiv"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- BigBirdPegasus also uses the [`PegasusTokenizer`].
+- Inputs should be padded on the right because BigBird uses absolute position embeddings.
+- BigBirdPegasus supports `original_full` and `block_sparse` attention. If the input sequence length is less than 1024, it is recommended to use `original_full` since sparse patterns don't offer much benefit for smaller inputs.
+- The current implementation uses a window size of 3 blocks and 2 global blocks, only supports the ITC implementation, and doesn't support `num_random_blocks=0`.
+- The sequence length must be divisible by the block size.

 ## Resources

- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+Read the [Understanding BigBird's Block Sparse Attention](https://huggingface.co/blog/big-bird) blog post for more details about how BigBird's attention works.

 ## BigBirdPegasusConfig

--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@ -191,6 +191,11 @@ model = ChameleonForConditionalGeneration.from_pretrained(
 [[autodoc]] ChameleonImageProcessor
    - preprocess

+## ChameleonImageProcessorFast
+
+[[autodoc]] ChameleonImageProcessorFast
+    - preprocess
+
 ## ChameleonVQVAE

 [[autodoc]] ChameleonVQVAE
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -3,6 +3,7 @@
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/cohere2.md
+++ b/docs/source/en/model_doc/cohere2.md
@ -4,6 +4,7 @@
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/deberta-v2.md
+++ b/docs/source/en/model_doc/deberta-v2.md
@ -14,66 +14,111 @@ rendered properly in your Markdown viewer.

 -->

-# DeBERTa-v2
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    </div>
 </div>

-## Overview

-The DeBERTa model was proposed in [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://huggingface.co/papers/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen It is based on Google's
-BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
+# DeBERTa-v2

-It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
-RoBERTa.
+[DeBERTa-v2](https://huggingface.co/papers/2006.03654) improves on the original [DeBERTa](./deberta) architecture by using a SentencePiece-based tokenizer and a new vocabulary size of 128K. It also adds an additional convolutional layer within the first transformer layer to better learn local dependencies of input tokens. Finally, the position projection and content projection matrices are shared in the attention layer to reduce the number of parameters.

-The abstract from the paper is the following:
-
-*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
-language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
-disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
-disentangled attention mechanism, where each word is represented using two vectors that encode its content and
-position, respectively, and the attention weights among words are computed using disentangled matrices on their
-contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
-predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
-the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
-(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
-pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
+You can find all the original DeBERTa-v2 checkpoints under the [Microsoft](https://huggingface.co/microsoft?search_models=deberta-v2) organization.


-The following information is visible directly on the [original implementation
-repository](https://github.com/microsoft/DeBERTa). DeBERTa v2 is the second version of the DeBERTa model. It includes
-the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. You can
-find more details about this submission in the authors'
-[blog](https://www.microsoft.com/en-us/research/blog/microsoft-deberta-surpasses-human-performance-on-the-superglue-benchmark/)
+> [!TIP]
+> This model was contributed by [Pengcheng He](https://huggingface.co/DeBERTa).
+>
+> Click on the DeBERTa-v2 models in the right sidebar for more examples of how to apply DeBERTa-v2 to different language tasks.

-New in v2:
+The example below demonstrates how to classify text with [`Pipeline`], [`AutoModel`], and from the command line.

- **Vocabulary** In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data.
-  Instead of a GPT2-based tokenizer, the tokenizer is now
-  [sentencepiece-based](https://github.com/google/sentencepiece) tokenizer.
- **nGiE(nGram Induced Input Encoding)** The DeBERTa-v2 model uses an additional convolution layer aside with the first
-  transformer layer to better learn the local dependency of input tokens.
- **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
-  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
-  similar to T5.
- **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
-  performance of downstream tasks.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-This model was contributed by [DeBERTa](https://huggingface.co/DeBERTa). This model TF 2.0 implementation was
-contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/DeBERTa).
+```py
+import torch
+from transformers import pipeline

-## Resources
+pipeline = pipeline(
+    task="text-classification",
+    model="microsoft/deberta-v2-xlarge-mnli",
+    device=0,
+    torch_dtype=torch.float16
+)
+result = pipeline("DeBERTa-v2 is great at understanding context!")
+print(result)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "microsoft/deberta-v2-xlarge-mnli"
+)
+model = AutoModelForSequenceClassification.from_pretrained(
+    "microsoft/deberta-v2-xlarge-mnli",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+
+logits = outputs.logits
+predicted_class_id = logits.argmax().item()
+predicted_label = model.config.id2label[predicted_class_id]
+print(f"Predicted label: {predicted_label}")
+
+```
+
+</hfoption>
+
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "DeBERTa-v2 is great at understanding context!" | transformers-cli run --task text-classification --model microsoft/deberta-v2-xlarge-mnli --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes quantization](../quantization/bitsandbytes) to only quantize the weights to 4-bit.
+
+```py
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "microsoft/deberta-v2-xlarge-mnli"
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype="float16",
+    bnb_4bit_use_double_quant=True,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForSequenceClassification.from_pretrained(
+    model_id,
+    quantization_config=quantization_config,
+    torch_dtype="float16"
+)
+
+inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
+outputs = model(**inputs)
+logits = outputs.logits
+predicted_class_id = logits.argmax().item()
+predicted_label = model.config.id2label[predicted_class_id]
+print(f"Predicted label: {predicted_label}")
+
+```

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)

 ## DebertaV2Config

--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -44,7 +44,7 @@ tokens and decodes them back into audio.
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 text = ["[S1] Dia is an open weights text to dialogue model."]
 processor = AutoProcessor.from_pretrained(model_checkpoint)
@ -66,7 +66,7 @@ from datasets import load_dataset, Audio
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
 ds = ds.cast_column("audio", Audio(sampling_rate=44100))
@ -93,7 +93,7 @@ from datasets import load_dataset, Audio
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
 ds = ds.cast_column("audio", Audio(sampling_rate=44100))
--- a/docs/source/en/model_doc/eomt.md
+++ b/docs/source/en/model_doc/eomt.md
@ -0,0 +1,210 @@
+<!--Copyright 2025 Mobile Perception Systems Lab at TU/e and The HuggingFace Inc. team. All rights reserved.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+-->
+
+# EoMT
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+The Encoder-only Mask Transformer (EoMT) model was introduced in the CVPR 2025 Highlight Paper [Your ViT is Secretly an Image Segmentation Model](https://www.tue-mps.org/eomt) by Tommie Kerssies, Niccolò Cavagnero, Alexander Hermans, Narges Norouzi, Giuseppe Averta, Bastian Leibe, Gijs Dubbelman, and Daan de Geus.
+EoMT reveals Vision Transformers can perform image segmentation efficiently without task-specific components.
+
+The abstract from the paper is the following:
+
+*Vision Transformers (ViTs) have shown remarkable performance and scalability across various computer vision tasks. To apply single-scale ViTs to image segmentation, existing methods adopt a convolutional adapter to generate multi-scale features, a pixel decoder to fuse these features, and a Transformer decoder that uses the fused features to make predictions. In this paper, we show that the inductive biases introduced by these task-specific components can instead be learned by the ViT itself, given sufficiently large models and extensive pre-training. Based on these findings, we introduce the Encoder-only Mask Transformer (EoMT), which repurposes the plain ViT architecture to conduct image segmentation. With large-scale models and pre-training, EoMT obtains a segmentation accuracy similar to state-of-the-art models that use task-specific components. At the same time, EoMT is significantly faster than these methods due to its architectural simplicity, e.g., up to 4x faster with ViT-L. Across a range of model sizes, EoMT demonstrates an optimal balance between segmentation accuracy and prediction speed, suggesting that compute resources are better spent on scaling the ViT itself rather than adding architectural complexity.*
+
+This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
+The original code can be found [here](https://github.com/tue-mps/eomt).
+
+## Architecture Info
+
+The `EoMT` model uses a DINOv2-pretrained Vision Transformer with **register tokens** as its backbone. EoMT simplifies the segmentation pipeline by relying solely on the encoder, eliminating the need for task-specific decoders commonly used in prior approaches.
+
+Architecturally, EoMT introduces a small set of **learned queries** and a lightweight **mask prediction module**. These queries are injected into the final encoder blocks, enabling **joint attention** between image patches and object queries. During training, **masked attention** is applied to constrain each query to focus on its corresponding region—effectively mimicking cross-attention. This constraint is gradually phased out via a **mask annealing strategy**, allowing for **efficient, decoder-free inference** without compromising segmentation performance.
+
+<div style="text-align: center;">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/eomt_architecture.png"
+       alt="drawing" width="500"/>
+</div>
+
+
+The model supports semantic, instance, and panoptic segmentation using a unified architecture and task-specific post-processing.
+
+## Usage Examples
+
+Use the Hugging Face implementation of EoMT for inference with pre-trained models.
+
+### Semantic Segmentation
+
+The EoMT model performs semantic segmentation using sliding-window inference. The input image is resized such that the shorter side matches the target input size, then it is split into overlapping crops. Each crop is then passed through the model. After inference, the predicted logits from each crop are stitched back together and rescaled to the original image size to get the final segmentation mask.
+
+> **Note:**  
+> If you want to use a custom target size for **semantic segmentation**, specify it in the following format:  
+> `{"shortest_edge": 512}`  
+> Notice that `longest_edge` is not provided here — this is intentional. For semantic segmentation, images are typically **scaled so that the shortest edge is greater than or equal to the target size**, so `longest_edge` is not necessary.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/ade20k_semantic_eomt_large_512"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+target_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_semantic_segmentation(
+    outputs,
+    target_sizes=target_sizes,
+)
+
+# Visualize the segmentation mask
+plt.imshow(preds[0])
+plt.axis("off")
+plt.title("Semantic Segmentation")
+plt.show()
+```
+
+### Instance Segmentation
+
+The EoMT model performs instance segmentation using padded inference. The input image is resized so that the longer side matches the target input size, and the shorter side is zero-padded to form a square. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified instance segmentation map, along with segment metadata like segment id, class labels and confidence scores.
+
+> **Note:**  
+> To use a custom target size, specify the size as a dictionary in the following format:  
+> `{"shortest_edge": 512, "longest_edge": 512}`  
+> For both instance and panoptic segmentation, input images will be **scaled and padded** to this target size.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/coco_instance_eomt_large_640"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+target_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_instance_segmentation(
+    outputs,
+    target_sizes=target_sizes,
+)
+
+# Visualize the segmentation mask
+plt.imshow(preds[0]["segmentation"])
+plt.axis("off")
+plt.title("Instance Segmentation")
+plt.show()
+```
+
+### Panoptic Segmentation
+
+The EoMT model performs panoptic segmentation using the same padded inference strategy as in instance segmentation. After padding and normalization, the model predicts both thing (instances) and stuff (amorphous regions) classes. The resulting mask and class logits are combined through post-processing (adapted from Mask2Former) to produce a unified panoptic segmentation map, along with segment metadata like segment id, class labels and confidence scores.
+
+```python
+import matplotlib.pyplot as plt
+import requests
+import torch
+from PIL import Image
+
+from transformers import EomtForUniversalSegmentation, AutoImageProcessor
+
+
+model_id = "tue-mps/coco_panoptic_eomt_large_640"
+processor = AutoImageProcessor.from_pretrained(model_id)
+model = EomtForUniversalSegmentation.from_pretrained(model_id)
+
+image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+inputs = processor(
+    images=image,
+    return_tensors="pt",
+)
+
+with torch.inference_mode():
+    outputs = model(**inputs)
+
+# Prepare the original image size in the format (height, width)
+target_sizes = [(image.height, image.width)]
+
+# Post-process the model outputs to get final segmentation prediction
+preds = processor.post_process_panoptic_segmentation(
+    outputs,
+    target_sizes=target_sizes,
+)
+
+# Visualize the panoptic segmentation mask
+plt.imshow(preds[0]["segmentation"])
+plt.axis("off")
+plt.title("Panoptic Segmentation")
+plt.show()
+```
+
+## EomtImageProcessor
+
+[[autodoc]] EomtImageProcessor
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## EomtImageProcessorFast
+
+[[autodoc]] EomtImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## EomtConfig
+
+[[autodoc]] EomtConfig
+
+## EomtForUniversalSegmentation
+
+[[autodoc]] EomtForUniversalSegmentation
+    - forward
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -23,6 +23,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -22,6 +22,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -29,11 +29,11 @@ rendered properly in your Markdown viewer.
 Gemma3n is a multimodal model with pretrained and instruction-tuned variants, available in E4B and E2B sizes. While
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
-[MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses
+[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
 a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
-[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a
-[Universal Speech Model][usm] (USM) as the audio encoder.
+[MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
+trained audio encoder based on the [Universal Speech Model][usm] (USM) architecture.

 The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.

@ -201,4 +201,5 @@ echo -e "Plants create energy through a process known as" | transformers run --t
 [gemma3n-collection]: https://huggingface.co/collections/google/gemma-3n
 [laurel]: https://arxiv.org/abs/2411.07501
 [matformer]: https://arxiv.org/abs/2310.07707
+[spark-transformer]: https://arxiv.org/abs/2506.06644
 [usm]: https://arxiv.org/abs/2303.01037
--- a/docs/source/en/model_doc/glm.md
+++ b/docs/source/en/model_doc/glm.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/granite.md
+++ b/docs/source/en/model_doc/granite.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Granite
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@ -21,6 +21,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD
9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
        ">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/llama3.md
+++ b/docs/source/en/model_doc/llama3.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ```py3
--- a/docs/source/en/model_doc/llama4.md
+++ b/docs/source/en/model_doc/llama4.md
@ -21,6 +21,7 @@ rendered properly in your Markdown viewer.
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -22,6 +22,7 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/mobilevit.md
+++ b/docs/source/en/model_doc/mobilevit.md
@ -95,6 +95,12 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

+## MobileViTImageProcessorFast
+
+[[autodoc]] MobileViTImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/nougat.md
+++ b/docs/source/en/model_doc/nougat.md
@ -107,6 +107,11 @@ The model is identical to [Donut](donut) in terms of architecture.
 [[autodoc]] NougatImageProcessor
    - preprocess

+## NougatImageProcessorFast
+
+[[autodoc]] NougatImageProcessorFast
+    - preprocess
+
 ## NougatTokenizerFast

 [[autodoc]] NougatTokenizerFast
--- a/docs/source/en/model_doc/olmo.md
+++ b/docs/source/en/model_doc/olmo.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/pegasus_x.md
+++ b/docs/source/en/model_doc/pegasus_x.md
@ -14,35 +14,115 @@ rendered properly in your Markdown viewer.

 -->

-# PEGASUS-X
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    </div>
 </div>

-## Overview
+# PEGASUS-X

-The PEGASUS-X model was proposed in [Investigating Efficiently Extending Transformers for Long Input Summarization](https://huggingface.co/papers/2208.04347)  by Jason Phang, Yao Zhao and Peter J. Liu.
+[PEGASUS-X](https://huggingface.co/papers/2208.04347) is an encoder-decoder (sequence-to-sequence) transformer model for long-input summarization. It extends the [Pegasus](./pegasus) model with staggered block-local attention, global encoder tokens, and additional pretraining on long text sequences, enabling it to handle inputs of up to 16,000 tokens. PEGASUS-X matches the performance of much larger models while using fewer parameters.

-PEGASUS-X (PEGASUS eXtended) extends the PEGASUS models for long input summarization through additional long input pretraining and using staggered block-local attention with global tokens in the encoder.
+You can find all the original PEGASUS-X checkpoints under the [Google](https://huggingface.co/google/models?search=pegasus-x) organization.

-The abstract from the paper is the following:
+> [!TIP]
+> This model was contributed by [zphang](https://huggingface.co/zphang).
+>
+> Click on the PEGASUS-X models in the right sidebar for more examples of how to apply PEGASUS-X to different language tasks.

-*While large pretrained Transformer models have proven highly capable at tackling natural language tasks, handling long sequence inputs continues to be a significant challenge. One such task is long input summarization, where inputs are longer than the maximum input context of most pretrained models. Through an extensive set of experiments, we investigate what model architectural changes and pretraining paradigms can most efficiently adapt a pretrained Transformer for long input summarization. We find that a staggered, block-local Transformer with global encoder tokens strikes a good balance of performance and efficiency, and that an additional pretraining phase on long sequences meaningfully improves downstream summarization performance. Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train.*
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

-This model was contributed by [zphang](https://huggingface.co/zphang). The original code can be found [here](https://github.com/google-research/pegasus).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-## Documentation resources
+```py
+import torch
+from transformers import pipeline

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+pipeline = pipeline(
+    task="summarization",
+    model="google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
+```
+</hfoption>
+<hfoption id="AutoModel">

-<Tip>
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

-PEGASUS-X uses the same tokenizer as [PEGASUS](pegasus).
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/pegasus-x-large"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)

-</Tip>
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model google/pegasus-x-large --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to quantize only the weights to int4.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "google/pegasus-x-large",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "google/pegasus-x-large"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+output = model.generate(**input_ids, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- PEGASUS-X uses the same tokenizer as [PEGASUS](./pegasus), the [`PegasusTokenizer`].

 ## PegasusXConfig

--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/phi3.md
+++ b/docs/source/en/model_doc/phi3.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
    </div>
 </div>

--- a/docs/source/en/model_doc/qwen2_moe.md
+++ b/docs/source/en/model_doc/qwen2_moe.md
@ -18,6 +18,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 # Qwen2MoE
--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -19,6 +19,7 @@ rendered properly in your Markdown viewer.
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/starcoder2.md
+++ b/docs/source/en/model_doc/starcoder2.md
@ -20,6 +20,7 @@ rendered properly in your Markdown viewer.
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
 </div>

 ## Overview
--- a/docs/source/en/model_doc/superpoint.md
+++ b/docs/source/en/model_doc/superpoint.md
@ -10,48 +10,35 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
 # SuperPoint

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
-
-## Overview
-
-The SuperPoint model was proposed
-in [SuperPoint: Self-Supervised Interest Point Detection and Description](https://huggingface.co/papers/1712.07629) by Daniel
-DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
-
-This model is the result of a self-supervised training of a fully-convolutional network for interest point detection and
-description. The model is able to detect interest points that are repeatable under homographic transformations and
-provide a descriptor for each point. The use of the model in its own is limited, but it can be used as a feature
-extractor for other tasks such as homography estimation, image matching, etc.
-
-The abstract from the paper is the following:
-
-*This paper presents a self-supervised framework for training interest point detectors and descriptors suitable for a
-large number of multiple-view geometry problems in computer vision. As opposed to patch-based neural networks, our
-fully-convolutional model operates on full-sized images and jointly computes pixel-level interest point locations and
-associated descriptors in one forward pass. We introduce Homographic Adaptation, a multi-scale, multi-homography
-approach for boosting interest point detection repeatability and performing cross-domain adaptation (e.g.,
-synthetic-to-real). Our model, when trained on the MS-COCO generic image dataset using Homographic Adaptation, is able
-to repeatedly detect a much richer set of interest points than the initial pre-adapted deep model and any other
-traditional corner detector. The final system gives rise to state-of-the-art homography estimation results on HPatches
-when compared to LIFT, SIFT and ORB.*
+[SuperPoint](https://huggingface.co/papers/1712.07629) is the result of self-supervised training of a fully-convolutional network for interest point detection and description. The model is able to detect interest points that are repeatable under homographic transformations and provide a descriptor for each point. Usage on its own is limited, but it can be used as a feature extractor for other tasks such as homography estimation and image matching.

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/superpoint_architecture.png"
 alt="drawing" width="500"/>

-<small> SuperPoint overview. Taken from the <a href="https://huggingface.co/papers/1712.07629v4">original paper.</a> </small>
+You can find all the original SuperPoint checkpoints under the [Magic Leap Community](https://huggingface.co/magic-leap-community) organization.

-## Usage tips
+> [!TIP]
+> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+>
+> Click on the SuperPoint models in the right sidebar for more examples of how to apply SuperPoint to different computer vision tasks.

-Here is a quick example of using the model to detect interest points in an image:

-```python
+
+The example below demonstrates how to detect interest points in an image with the [`AutoModel`] class.
+<hfoptions id="usage">
+<hfoption id="AutoModel">
+
+```py
 from transformers import AutoImageProcessor, SuperPointForKeypointDetection
 import torch
 from PIL import Image
@ -64,67 +51,76 @@ processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint"
 model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

 inputs = processor(image, return_tensors="pt")
-outputs = model(**inputs)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+# Post-process to get keypoints, scores, and descriptors
+image_size = (image.height, image.width)
+processed_outputs = processor.post_process_keypoint_detection(outputs, [image_size])
 ```

-The outputs contain the list of keypoint coordinates with their respective score and description (a 256-long vector).
+</hfoption>
+</hfoptions>

-You can also feed multiple images to the model. Due to the nature of SuperPoint, to output a dynamic number of keypoints,
-you will need to use the mask attribute to retrieve the respective information :
+## Notes

-```python
-from transformers import AutoImageProcessor, SuperPointForKeypointDetection
-import torch
-from PIL import Image
-import requests
+- SuperPoint outputs a dynamic number of keypoints per image, which makes it suitable for tasks requiring variable-length feature representations.

-url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
-url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
-image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+    ```py
+    from transformers import AutoImageProcessor, SuperPointForKeypointDetection
+    import torch
+    from PIL import Image
+    import requests
+    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
+    model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+    url_image_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
+    url_image_2 = "http://images.cocodataset.org/test-stuff2017/000000000568.jpg"
+    image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
+    images = [image_1, image_2]
+    inputs = processor(images, return_tensors="pt")
+    # Example of handling dynamic keypoint output
+    outputs = model(**inputs)
+    keypoints = outputs.keypoints  # Shape varies per image
+    scores = outputs.scores        # Confidence scores for each keypoint
+    descriptors = outputs.descriptors  # 256-dimensional descriptors
+    mask = outputs.mask # Value of 1 corresponds to a keypoint detection
+    ```

-images = [image_1, image_2]
+- The model provides both keypoint coordinates and their corresponding descriptors (256-dimensional vectors) in a single forward pass.
+- For batch processing with multiple images, you need to use the mask attribute to retrieve the respective information for each image. You can use the `post_process_keypoint_detection` method from the `SuperPointImageProcessor` to retrieve each image's information.

-processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
-model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
+    ```py
+    # Batch processing example
+    images = [image1, image2, image3]
+    inputs = processor(images, return_tensors="pt")
+    outputs = model(**inputs)
+    image_sizes = [(img.height, img.width) for img in images]
+    processed_outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+    ```

-inputs = processor(images, return_tensors="pt")
-outputs = model(**inputs)
-image_sizes = [(image.height, image.width) for image in images]
-outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
+- You can then print the keypoints on the image of your choice to visualize the result:
+    ```py
+    import matplotlib.pyplot as plt
+    plt.axis("off")
+    plt.imshow(image_1)
+    plt.scatter(
+        outputs[0]["keypoints"][:, 0],
+        outputs[0]["keypoints"][:, 1],
+        c=outputs[0]["scores"] * 100,
+        s=outputs[0]["scores"] * 50,
+        alpha=0.8
+    )
+    plt.savefig(f"output_image.png")
+    ```

-for output in outputs:
-    for keypoints, scores, descriptors in zip(output["keypoints"], output["scores"], output["descriptors"]):
-        print(f"Keypoints: {keypoints}")
-        print(f"Scores: {scores}")
-        print(f"Descriptors: {descriptors}")
-```
-
-You can then print the keypoints on the image of your choice to visualize the result:
-```python
-import matplotlib.pyplot as plt
-
-plt.axis("off")
-plt.imshow(image_1)
-plt.scatter(
-    outputs[0]["keypoints"][:, 0],
-    outputs[0]["keypoints"][:, 1],
-    c=outputs[0]["scores"] * 100,
-    s=outputs[0]["scores"] * 50,
-    alpha=0.8
-)
-plt.savefig(f"output_image.png")
-```
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png)
-
-This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
-The original code can be found [here](https://github.com/magicleap/SuperPointPretrainedNetwork).
+<div class="flex justify-center">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/ZtFmphEhx8tcbEQqOolyE.png">
+</div>

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SuperPoint. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
- A notebook showcasing inference and visualization with SuperPoint can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb). 🌎
+- Refer to this [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/SuperPoint/Inference_with_SuperPoint_to_detect_interest_points_in_an_image.ipynb) for an inference and visualization example.

 ## SuperPointConfig

@ -137,8 +133,12 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 - preprocess
 - post_process_keypoint_detection

+<frameworkcontent>
+<pt>
 ## SuperPointForKeypointDetection

 [[autodoc]] SuperPointForKeypointDetection

 - forward
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/vitpose.md
+++ b/docs/source/en/model_doc/vitpose.md
@ -10,52 +10,39 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# ViTPose
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
 </div>

-## Overview
+# ViTPose

-The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://huggingface.co/papers/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The model was further improved in [ViTPose++: Vision Transformer for Generic Body Pose Estimation](https://huggingface.co/papers/2212.04246) where the authors employ
-a mixture-of-experts (MoE) module in the ViT backbone along with pre-training on more data, which further enhances the performance.
+[ViTPose](https://huggingface.co/papers/2204.12484) is a vision transformer-based model for keypoint (pose) estimation. It uses a simple, non-hierarchical [ViT](./vit) backbone and a lightweight decoder head. This architecture simplifies model design, takes advantage of transformer scalability, and can be adapted to different training strategies.

-The abstract from the paper is the following:
-
-*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.*
+[ViTPose++](https://huggingface.co/papers/2212.04246) improves on ViTPose by incorporating a mixture-of-experts (MoE) module in the backbone and using more diverse pretraining data.

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png"
 alt="drawing" width="600"/>

-<small> ViTPose architecture. Taken from the <a href="https://huggingface.co/papers/2204.12484">original paper.</a> </small>
+You can find all ViTPose and ViTPose++ checkpoints under the [ViTPose collection](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335).

-This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi).
-The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose).
-
-## Usage Tips
-
-ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints for each of them.
+The example below demonstrates pose estimation with the [`VitPoseForPoseEstimation`] class.

 ```py
 import torch
 import requests
 import numpy as np
-
+import supervision as sv
 from PIL import Image
-
 from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation

 device = "cuda" if torch.cuda.is_available() else "cpu"

-url = "http://images.cocodataset.org/val2017/000000000139.jpg"
+url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
 image = Image.open(requests.get(url, stream=True).raw)

-# ------------------------------------------------------------------------
-# Stage 1. Detect humans on the image
-# ------------------------------------------------------------------------
-
-# You can choose any detector of your choice
+# Detect humans in the image
 person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
 person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)

@ -67,7 +54,7 @@ with torch.no_grad():
 results = person_image_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
 )
-result = results[0]  # take first image results
+result = results[0]

 # Human label refers 0 index in COCO dataset
 person_boxes = result["boxes"][result["labels"] == 0]
@ -77,10 +64,7 @@ person_boxes = person_boxes.cpu().numpy()
 person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
 person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

-# ------------------------------------------------------------------------
-# Stage 2. Detect keypoints for each person found
-# ------------------------------------------------------------------------
-
+# Detect keypoints for each person found
 image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
 model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)

@ -90,54 +74,7 @@ with torch.no_grad():
    outputs = model(**inputs)

 pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-image_pose_result = pose_results[0]  # results for first image
-```
-
-### ViTPose++ models
-
-The best [checkpoints](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335) are those of the [ViTPose++ paper](https://huggingface.co/papers/2212.04246). ViTPose++ models employ a so-called [Mixture-of-Experts (MoE)](https://huggingface.co/blog/moe) architecture for the ViT backbone, resulting in better performance.
-
-The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
-An overview of the various dataset indices is provided below:
-
- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
-
-Pass the `dataset_index` argument in the forward of the model to indicate which experts to use for each example in the batch. Example usage is shown below:
-
-```python
-image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
-model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
-
-inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-
-dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
-
-with torch.no_grad():
-    outputs = model(**inputs, dataset_index=dataset_index)
-```
-
-The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
-An overview of the various dataset indices is provided below:
-
- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
-
-
-### Visualization
-
-To visualize the various keypoints, one can either leverage the `supervision` [library](https://github.com/roboflow/supervision (requires `pip install supervision`):
-
-```python
-import supervision as sv
+image_pose_result = pose_results[0]

 xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy()
 scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy()
@ -162,119 +99,192 @@ annotated_frame = vertex_annotator.annotate(
    scene=annotated_frame,
    key_points=key_points
 )
+annotated_frame
 ```

-Alternatively, one can also visualize the keypoints using [OpenCV](https://opencv.org/) (requires `pip install opencv-python`):
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose.png"/>
+</div>

-```python
-import math
-import cv2
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

-def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight):
-    if pose_keypoint_color is not None:
-        assert len(pose_keypoint_color) == len(keypoints)
-    for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
-        x_coord, y_coord = int(kpt[0]), int(kpt[1])
-        if kpt_score > keypoint_score_threshold:
-            color = tuple(int(c) for c in pose_keypoint_color[kid])
-            if show_keypoint_weight:
-                cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
-                transparency = max(0, min(1, kpt_score))
-                cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
-            else:
-                cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.

-def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2):
-    height, width, _ = image.shape
-    if keypoint_edges is not None and link_colors is not None:
-        assert len(link_colors) == len(keypoint_edges)
-        for sk_id, sk in enumerate(keypoint_edges):
-            x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]])
-            x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]])
-            if (
-                x1 > 0
-                and x1 < width
-                and y1 > 0
-                and y1 < height
-                and x2 > 0
-                and x2 < width
-                and y2 > 0
-                and y2 < height
-                and score1 > keypoint_score_threshold
-                and score2 > keypoint_score_threshold
-            ):
-                color = tuple(int(c) for c in link_colors[sk_id])
+```py
+# pip install torchao
+import torch
+import requests
+import numpy as np
+from PIL import Image
+from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation, TorchAoConfig
+
+url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
+image = Image.open(requests.get(url, stream=True).raw)
+
+person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
+person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)
+
+inputs = person_image_processor(images=image, return_tensors="pt").to(device)
+
+with torch.no_grad():
+    outputs = person_model(**inputs)
+
+results = person_image_processor.post_process_object_detection(
+    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
+)
+result = results[0]
+
+person_boxes = result["boxes"][result["labels"] == 0]
+person_boxes = person_boxes.cpu().numpy()
+
+person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
+person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
+
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+
+image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-huge")
+model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-huge", device_map=device, quantization_config=quantization_config)
+
+inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
+image_pose_result = pose_results[0]
+```
+
+## Notes
+
+- Use [`AutoProcessor`] to automatically prepare bounding box and image inputs.
+- ViTPose is a top-down pose estimator. It uses an object detector to detect individuals first before keypoint prediction.
+- ViTPose++ has 6 different MoE expert heads (COCO validation `0`, AiC `1`, MPII `2`, AP-10K `3`, APT-36K `4`, COCO-WholeBody `5`) which support 6 different datasets. Pass the index corresponding to the dataset to the `dataset_index` argument to indicate which expert to use.
+
+    ```py
+    from transformers import AutoProcessor, VitPoseForPoseEstimation
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
+    model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
+
+    inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
+    dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
+
+    with torch.no_grad():
+        outputs = model(**inputs, dataset_index=dataset_index)
+    ```
+
+- [OpenCV](https://opencv.org/) is an alternative option for visualizing the estimated pose.
+
+    ```py
+    # pip install opencv-python
+    import math
+    import cv2
+
+    def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight):
+        if pose_keypoint_color is not None:
+            assert len(pose_keypoint_color) == len(keypoints)
+        for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
+            x_coord, y_coord = int(kpt[0]), int(kpt[1])
+            if kpt_score > keypoint_score_threshold:
+                color = tuple(int(c) for c in pose_keypoint_color[kid])
                if show_keypoint_weight:
-                    X = (x1, x2)
-                    Y = (y1, y2)
-                    mean_x = np.mean(X)
-                    mean_y = np.mean(Y)
-                    length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
-                    angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
-                    polygon = cv2.ellipse2Poly(
-                        (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1
-                    )
-                    cv2.fillConvexPoly(image, polygon, color)
-                    transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2])))
+                    cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
+                    transparency = max(0, min(1, kpt_score))
                    cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
                else:
-                    cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)
+                    cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)

+    def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2):
+        height, width, _ = image.shape
+        if keypoint_edges is not None and link_colors is not None:
+            assert len(link_colors) == len(keypoint_edges)
+            for sk_id, sk in enumerate(keypoint_edges):
+                x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]])
+                x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]])
+                if (
+                    x1 > 0
+                    and x1 < width
+                    and y1 > 0
+                    and y1 < height
+                    and x2 > 0
+                    and x2 < width
+                    and y2 > 0
+                    and y2 < height
+                    and score1 > keypoint_score_threshold
+                    and score2 > keypoint_score_threshold
+                ):
+                    color = tuple(int(c) for c in link_colors[sk_id])
+                    if show_keypoint_weight:
+                        X = (x1, x2)
+                        Y = (y1, y2)
+                        mean_x = np.mean(X)
+                        mean_y = np.mean(Y)
+                        length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
+                        angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
+                        polygon = cv2.ellipse2Poly(
+                            (int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1
+                        )
+                        cv2.fillConvexPoly(image, polygon, color)
+                        transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2])))
+                        cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
+                    else:
+                        cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)

-# Note: keypoint_edges and color palette are dataset-specific
-keypoint_edges = model.config.edges
+    # Note: keypoint_edges and color palette are dataset-specific
+    keypoint_edges = model.config.edges

-palette = np.array(
-    [
-        [255, 128, 0],
-        [255, 153, 51],
-        [255, 178, 102],
-        [230, 230, 0],
-        [255, 153, 255],
-        [153, 204, 255],
-        [255, 102, 255],
-        [255, 51, 255],
-        [102, 178, 255],
-        [51, 153, 255],
-        [255, 153, 153],
-        [255, 102, 102],
-        [255, 51, 51],
-        [153, 255, 153],
-        [102, 255, 102],
-        [51, 255, 51],
-        [0, 255, 0],
-        [0, 0, 255],
-        [255, 0, 0],
-        [255, 255, 255],
-    ]
-)
+    palette = np.array(
+        [
+            [255, 128, 0],
+            [255, 153, 51],
+            [255, 178, 102],
+            [230, 230, 0],
+            [255, 153, 255],
+            [153, 204, 255],
+            [255, 102, 255],
+            [255, 51, 255],
+            [102, 178, 255],
+            [51, 153, 255],
+            [255, 153, 153],
+            [255, 102, 102],
+            [255, 51, 51],
+            [153, 255, 153],
+            [102, 255, 102],
+            [51, 255, 51],
+            [0, 255, 0],
+            [0, 0, 255],
+            [255, 0, 0],
+            [255, 255, 255],
+        ]
+    )

-link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
-keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]]
+    link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
+    keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]]

-numpy_image = np.array(image)
+    numpy_image = np.array(image)

-for pose_result in image_pose_result:
-    scores = np.array(pose_result["scores"])
-    keypoints = np.array(pose_result["keypoints"])
+    for pose_result in image_pose_result:
+        scores = np.array(pose_result["scores"])
+        keypoints = np.array(pose_result["keypoints"])

-    # draw each point on image
-    draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False)
+        # draw each point on image
+        draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False)

-    # draw links
-    draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False)
+        # draw links
+        draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False)

-pose_image = Image.fromarray(numpy_image)
-pose_image
-```
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-coco.jpg" alt="drawing" width="600"/>
+    pose_image = Image.fromarray(numpy_image)
+    pose_image
+    ```

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTPose. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+Refer to the resources below to learn more about using ViTPose.

- A demo of ViTPose on images and video can be found [here](https://huggingface.co/spaces/hysts/ViTPose-transformers).
- A notebook illustrating inference and visualization can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_human_pose_estimation.ipynb).
+- This [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_body_pose_estimation.ipynb) demonstrates inference and visualization.
+- This [Space](https://huggingface.co/spaces/hysts/ViTPose-transformers) demonstrates ViTPose on images and video.

 ## VitPoseImageProcessor

--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -1,4 +1,4 @@
-# Modular Transformers
+# Contributing a new model to Transformers

 Modular Transformers lowers the bar for contributing models and significantly reduces the code required to add a model by allowing imports and inheritance.

@ -540,6 +540,9 @@ This makes it very easy to switch decorators and makes it explicit that the only

 ## Docstring variables

+> [!TIP]
+> Refer to the [Documenting a model](./auto_docstring) guide for more information about how you can use the `@auto_docstring` decorator to help automatically generate consistent docstring arguments.
+
 If an object defined in both the modular and modeling file from which it inherits, the modular definition has precedence unless for assignments containing the pattern `DOCSTRING`. These variables are typically used in `MODEL_START_DOCSTRING` and `MODEL_INPUT_DOCSTRING` in the modeling files. They are big blocks of docstrings and the linter rewrites the names everywhere. For this reason, assignments containing the `DOCSTRING` variable can use the definition found in the source file without copying the whole docstring, by simply setting the variable to `None` in the modular file.

 This is very useful if you need the variable reference somewhere but you don't want to clutter the modular file with docstrings which are always the same. The example code below allows you to automatically use the same docstrings from [Mistral](./model_doc/mistral) in [Starcoder2](./model_doc/starcoder2).
--- a/docs/source/en/perf_infer_gpu_multi.md
+++ b/docs/source/en/perf_infer_gpu_multi.md
@ -13,21 +13,19 @@ rendered properly in your Markdown viewer.

 -->

-# Tensor parallelism in transformers
+# Distributed inference

-[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice.
-This document assumes that you are already familiar with the basics of tensor parallelism. If you are not, please refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism.
+When a model doesn't fit on a single GPU, distributed inference with [tensor parallelism](./perf_train_gpu_many#tensor-parallelism) can help. Tensor parallelism shards a model onto multiple accelerators (CUDA GPU, Intel XPU, etc.) and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each accelerator can process a tensor slice.
+
+However, tensor parallelism adds communication overhead and should be used on single machine setups with multiple accelerators to take advantage of fast intra-node communication. For multi-node training, it may be more efficient to use pipeline or data parallelism depending on your use case.

 > [!TIP]
-> Tensor parallelism is very communication intensive, therefore it is reccomended to use it on a single machine with multiple GPUs, utilizing fast intra-node communication. For multi-node training, methods as pipeline or data parallelism are more efficient (depending on your use case).
+> Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism to learn more.

-Tensor parallelism requires slight changes to the model parameters, therefore in transformers, we support some of the popular models out of the box.
-
-> [!TIP]
-> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently below.
+Check the list below for models that natively support tensor parallelism. Open a GitHub issue or pull request to add support for a model.

 <details>
-<summary>Supported models</summary>
+<summary>Show supported models</summary>

 * [Cohere](./model_doc/cohere) and [Cohere 2](./model_doc/cohere2)
 * [Gemma](./model_doc/gemma) and [Gemma 2](./model_doc/gemma2)
@ -43,19 +41,74 @@ Tensor parallelism requires slight changes to the model parameters, therefore in

 </details>

-## Using 🤗 transformers
+This guide shows how to enable tensor parallelism with Transformers and different partitioning strategies.

-Transformers provides a simple interface to use for tensor parallelism. We provide multiple classes implementing different partitioning
-strategies and a simple entrypoint to parallelize `nn.Module` instance. You won't have to interact with this interface directly, everything is done in `PretrainedModel.from_pretrained` method for you. This section will first talk about the partitioning strategies
-we support, then the user interface you will be interacting with, and finally it will teach you how to extend it with your own partitioning
-strategies.
+## Partitioning a model

-### Partitioning strategies
+Transformers supports tensor parallelism if a model has a `tp_plan`. There are two plans to partition a model.

-In transformers, partitioning strategies reside in a class `ParallelInterface` which works like a mapping from string to the strategy implementation.
+- The `auto` tensor parallelism plan partitions a model (see the supported models above) based on a predefined configuration.
+- You can also manually specify your own partitioning plan and pass it to the `tp_plan` parameter in [`~PreTrainedModel.from_pretrained`].

+<hfoptions id="sharding">
+<hfoption id="auto plan">

-```python
+```py
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # better for smaller number of GPUs
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
+print(model._tp_plan)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+prompt = "Can I help"
+inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+# distributed run
+outputs = model(inputs)
+```
+
+Launch the inference script above with [torchrun](https://pytorch.org/docs/stable/elastic/run.html) using 4 processes per node (one process for each GPU).
+
+```bash
+torchrun --nproc-per-node 4 demo.py
+```
+
+</hfoption>
+<hfoption id="manual plan">
+
+Define a tensor parallel plan for each layer in `tp_plan` and pass it to [`~PreTrainedModel.from_pretrained`]. The example below uses a combination of column and row partitioning. Refer to the [Partitioning strategies](#partitioning-strategies) section to learn about other supported partitioning strategies.
+
+> [!WARNING]
+> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about the partitioning strategies, the resulting model can be very slow, even failing or incorrect. Refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) to learn more.
+
+```py
+from transformers import AutoModelForCausalLM
+
+tp_plan = {
+    "model.layers.*.self_attn.q_proj": "colwise",
+    "model.layers.*.self_attn.k_proj": "colwise",
+    "model.layers.*.self_attn.v_proj": "colwise",
+    "model.layers.*.self_attn.o_proj": "rowwise",
+    ...
+}
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+print(model._tp_plan)
+```
+
+</hfoption>
+</hfoptions>
+
+## Partitioning strategies
+
+All partitioning strategies are defined in the [`ParallelInterface`] class which maps a string to the strategy implementation. You don't need to interact with this class directly since all the strategies are set with `tp_plan` in [`~PreTrainedModel.from_pretrained`], but it is useful for checking what strategies are available.
+
+```py
 class ParallelInterface(MutableMapping):
    """
    Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
@ -77,66 +130,32 @@ class ParallelInterface(MutableMapping):
    }
 ```

-We support the following strategies:
+Refer to the table below to learn more about each strategy.

- `ColwiseParallel` - A simple column-wise partitioning, being able to handle both weights and biases, does exactly what we've discussed before.
- `RowwiseParallel` - Again, row-wise partitioning as dicussed before, supports weights and biases, on top of that it also supports `nn.Embedding` modules.
- `SequenceParallel` - Sequence parallel implementation, for support of `LayerNorm` and `Dropout` layers. Also supports Python implementation of `RMSNorm` (see [this](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34))
- `PackedColwiseParallel` - A variant of column-wise partitioning, however it works on packed weights (i.e. `up_proj` and `gate_proj` being packed together). For more details, see [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)
- `PackedRowwiseParallel` - A variant of row-wise partitioning, works on packed weights, for more details check the comment linked above.
- `GatherParallel` - A very simple class, that only makes the outputs of the module to be gathered across devices.
- `IsolatedParallel` - This is a special case, where we want to *isolate* the module from the rest of the devices (world). This is used for Experts in MoE layers, basically creating Expert parallelism of sorts.
- `ReplicateParallel` - Many `torch.distributed` APIs break if model is partially sharded, so this class is used to replicate the module across all devices.
+| Strategy | Description |
+|---|---|
+| `ColwiseParallel` | Column-wise partitioning of weights and biases. |
+| `RowwiseParallel` | Row-wise partitioning of weights and biases. Also supports partitioning `nn.Embedding` modules. |
+| `SequenceParallel` | Sequence parallel implementation to support `LayerNorm` and `Dropout` layers. Also supports Python implementation of [RMSNorm](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34). |
+| `PackedColwiseParallel` | Variant of `ColwiseParallel` to support packed weights (for example, packing `up_proj` and `gate_proj` together). Refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details. |
+| `PackedRowwiseParallel` | Variant of `RowwiseParallel` to support packed weights (refer to the [code](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for more details). |
+| `GatherParallel` | Gather outputs of the module across devices. |
+| `IsolatedParallel` | Used for Experts in Mixture-of-Experts (MoE) layers to isolate the module from other devices. |
+| `ReplicateParallel` | Replicate modules across all devices to prevent `torch.distributed` APIs from breaking due to a partially sharded model. |

-### Sharding a model
+### Packed strategies

-We provide two ways to shard a model, first one is to use `auto` tensor parallelism plan, which will automatically shard the model based on our predefined configuration. This requires the model to have predefined tensor parallel plan in transformers.
+Weight packing packs multiple linear layers into a single, bigger layer. Packed strategies, `PackedColwiseParallel` and `PackedRowwiseParallel`, are used to shard packed weights. The more basic `ColwiseParallel` or `RowwiseParallel` will incorrectly shard the packed weights.

-```python
-from transformers import AutoModelForCausalLM
+The example below packs `up_proj` and `gate_proj` into a single `gate_up_proj` module and requires the `PackedRowwiseParallel` strategy to shard `gate_up_proj`.

-# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
-model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
-
-print(model._tp_plan)
-```
-
-> [!TIP]
-> For a list of models that support tensor parallelism, see the [Supported models](#supported-models) section above.
-
-The second way is to manually specify your own partitioning plan.
-
-```python
-from transformers import AutoModelForCausalLM
-
-tp_plan = {
-    "model.layers.*.self_attn.q_proj": "colwise",
-    "model.layers.*.self_attn.k_proj": "colwise",
-    "model.layers.*.self_attn.v_proj": "colwise",
-    "model.layers.*.self_attn.o_proj": "rowwise",
-    ...
-}
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-
-print(model._tp_plan)
-```
-
-You might have noticed that there are some special cases in the `ParallelInterface` mapping, let's now talk about them. This will help you understand their purpose and help with extending to other strategies.
-
-### PackedRowwiseParallel
-This class is a special case of `RowwiseParallel`, it's used to shard packed weights. Weight packing is a common technique used in models. It's a technique where we pack multiple linear layers into a single, bigger one.
-
-For example in `Llama4` model, we pack `up_proj` and `gate_proj` into a single `gate_up_proj` module.
 ```python
 class Llama4TextExperts(nn.Module):
    ...
    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
 ```

-Then in forward, we can use batch matrix multiplication to compute the output of the `gate_up_proj` module.
+Batch matrix multiplication can be used in the `forward` pass to compute the output of the `gate_up_proj` module.

 ```python
 def forward(self, hidden_states):
@ -145,185 +164,148 @@ def forward(self, hidden_states):
    gate, up = gate_up.chunk(2, dim=-1) # Split the output into gate and up
 ```

-In this case, we need to use the `PackedRowwiseParallel` strategy to shard the `gate_up_proj` module, as using a simple `RowwiseParallel` will shard the layers wrongly.
-
 > [!TIP]
-> If this is a bit difficult to wrap your head around, check out [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an amazing visual representation of why `Packed*` needs to be used.
+> Refer to [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for a visual representation of why `Packed*` needs to be used.

+### Local strategies

-### `local*` strategies
+Local strategies (`local_colwise`, `local_rowwise`, `local_packed_rowwise`) don't use [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) because it isn't supported for some operations such as [torch.chunk](https://docs.pytorch.org/docs/stable/generated/torch.chunk.html). Instead, local strategies use the basic [torch.Tensor](https://docs.pytorch.org/docs/stable/tensors.html) and perform some of the distributed logic manually.

-You could have noticed that there are `local*` strategies, which use the same layers as `*` strategy, but don't use `DTensor` at all.
-This is because `DTensor` is not supported for some of the operations: such as `torch.chunk`. Therefore, sometimes we need to use the `local*` strategies, which use vanilla `torch.Tensor` and do some of the distributed logic manually.
-
-<!---
+<!--
 Readd this when I get the exact error message
 > [!TIP]
 > If you are using a custom partitioning strategy, and it's not working with `... is not supported` error, try using the `local*` strategies to see if they work better.
 -->

-> [!WARNING]
-> Manually specifying your own partitiong plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about this, the resulting model can be very slow, even failing or incorrect. Again, refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) which can teach you everything required.
+## Custom partitioning strategies

-### Extending the interface with your own partitioning strategies
+A custom partitioning strategy should inherit from [`TensorParallelLayer`](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`.

-This is a very advanced topic, which requires a good understanding of distributed collectives and the model architecture.
-Your custom partitioning strategy should inherit from `TensorParallelLayer` defined in [integrations/tensor_parallel.py](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement: `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. Then it should be registered in the `ParallelInterface` mapping, so our dispatching logic can find it when specified in the `tp_plan`.
+Then it needs to be registered in the `ParallelInterface` mapping so the dispatching logic can find it when specified in `tp_plan`.

-Let's go through this workflow step by step, on an already existing example: `ColwiseParallel`.
+The example below shows how to implement `ColwiseParallel` with this workflow.

-1. Inherit from `TensorParallelLayer` and initialization
+1. Inherit from `TensorParallelLayer`. In the `__init__` method, define `input_layouts` and `output_layouts` to describe how the input and output tensors should be placed on devices. The `desired_input_layouts` attribute is used to specify how the input *should* be placed on devices.

-```python
-class ColwiseParallel(TensorParallelLayer):
-    def __init__(
+    ```python
+    class ColwiseParallel(TensorParallelLayer):
+        def __init__(
+            self,
+            *,
+            input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
+            output_layouts: Optional[Placement] = None, # The output layout we want to achieve
+            use_local_output: bool = True, # Whether to use local output or not
+            use_dtensor=True, # Whether to use DTensor or not
+        ):
+            self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
+            self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
+            self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
+            self.use_local_output = use_local_output
+            self.use_dtensor = use_dtensor
+    ```
+
+2. Implement the `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn` methods.
+
+    The `partition_tensor` method partitions the tensor and fills `empty_param` with the partitioned tensor. Use the utility function `get_tensor_shard` to help you get the correct shard of the original parameter for a given rank and `get_packed_weights` to help with packed weights.
+
+    ```python
+    def partition_tensor(
        self,
-        *,
-        input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
-        output_layouts: Optional[Placement] = None, # The output layout we want to achieve
-        use_local_output: bool = True, # Whether to use local output or not
-        use_dtensor=True, # Whether to use DTensor or not
-    ):
-        self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
-        self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
-        self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
-        self.use_local_output = use_local_output
-        self.use_dtensor = use_dtensor
-```
+        param, # Full tensor of the parameter
+        empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
+        param_type, # Type of the parameter, `bias` or `weight`
+        param_casting_dtype, # The type to cast the parameter to
+        to_contiguous, # Whether to convert the tensor to a contiguous memory layout
+        rank, # The rank of the current device
+        device_mesh, # The device mesh
+    ) -> nn.Parameter: # Return the partitioned parameter
+        ...
+    ```

-In the `__init__` method, we define these attributes, where `input_layouts` and `output_layouts` describing, how the input and output tensors should be placed on the devices. `desired_input_layouts` is used to specify, how the input *SHOULD* be placed on the devices.
+    The `_prepare_input_fn` and `_prepare_output_fn` methods are used in the [pre-forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [forward](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks. They redistribute the inputs and outputs to the desired layout as specified in the `__init__`.

-2a. Implement `partition_tensor` method
+    ```python
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        ...
+        # Do some custom logic, cast to DTensor etc.
+        ...
+        return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        ...
+        # Do some custom logic, cast to DTensor etc.
+        ...
+        return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
+    ```

-```python
-def partition_tensor(
-    self,
-    param, # Full tensor of the parameter
-    empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
-    param_type, # Type of the parameter, `bias` or `weight`
-    param_casting_dtype, # The type to cast the parameter to
-    to_contiguous, # Whether to convert the tensor to a contiguous memory layout
-    rank, # The rank of the current device
-    device_mesh, # The device mesh
-) -> nn.Parameter: # Return the partitioned parameter
-    ...
-```
+3. Register the strategy to [`ParallelInterface`] to enable it for use with `tp_plan`.

-This method is used to partition the tensor, and fill the `empty_param` with the partitioned tensor.
-We provide some utility functions to help you with this, such as `get_tensor_shard` which will get you the correct shard of the original parameter for this rank or `get_packed_weights` to help with packed weights.
+    ```python
+    from transformers.integrations.tensor_parallel import ParallelInterface

-2b. Implement `_prepare_input_fn` and `_prepare_output_fn` methods
+    ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
+    tp_plan = {
+        "model.layers.*.self_attn.q_proj": "colwise_custom",
+        ...
+    }
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+    ```

-These methods are used as [`pre-forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [`forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks respectively. Their purpose is to re-distribute the inputs and outputs to the desired layout, passed in the `__init__` method.
+## Benchmarks

-```python
-def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
-    ...
-    # Do some custom logic, cast to DTensor etc.
-    ...
-    return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
+Tensor parallelism can considerably speed up inference, especially for inputs with large batch sizes or long sequences.

-def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
-    ...
-    # Do some custom logic, cast to DTensor etc.
-    ...
-    return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
-```
-
-3. Register the strategy
-Congratulations! You've implemented your own partitioning strategy. Now, to use it with your own `tp_plan`, you need to register it in the `ParallelInterface` mapping.
-
-```python
-from transformers.integrations.tensor_parallel import ParallelInterface
-
-ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
-```
-
-And now you can use it in your `tp_plan` as such:
-
-```python
-tp_plan = {
-    "model.layers.*.self_attn.q_proj": "colwise_custom",
-    ...
-}
-
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
-```
-
-
-## Full example
-
-Let's go through a full example of inference with tensor parallelism.
-```python
-import os
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-# enable tensor parallelism
-model = AutoModelForCausalLM.from_pretrained(
-    "meta-llama/Meta-Llama-3-8B-Instruct",
-    tp_plan="auto",
-)
-
-# prepare input tokens
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
-prompt = "Can I help"
-inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
-
-# distributed run
-outputs = model(inputs)
-```
-
-Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/elastic/run.html) with 4 processes per GPU.
-
-```bash
-torchrun --nproc-per-node 4 demo.py
-```
-
-You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences.
-
-For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups.
+Refer to the chart below for the expected speedup for a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512.

 <div style="text-align: center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
 </div>

-## Tensor parallelism in-depth
-Our implementation of tensor parallelism is framework-agnostic in design, but the specific implementations we've developed rely on the torch.distributed package. We heavily utilize abstractions such as `DeviceMesh` or `DTensor` to provide a simple and extensible interface to the user.
+## Design implementation
+
+The Transformers tensor parallelism implementation is framework-agnostic, but for specific implementations, we rely on [DeviceMesh](https://docs.pytorch.org/tutorials/recipes/distributed_device_mesh.html) and [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) from [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html) to provide a simple and extensible interface.

 ### DeviceMesh
-Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, therefore we can create a `DeviceMesh` with multiple submeshes:
+
+Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, so you can create a `DeviceMesh` with multiple sub-meshes.
+
 ```python
 from torch.distributed.device_mesh import init_device_mesh

 # Create a 1D mesh of 4 GPUs
 device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])
 ```
-Then, most of the `torch.distributed` defined parallelization strategies can be applied to a mesh itself, or its submesh, automatically handling the communication patterns.
+
+Most of the `torch.distributed` defined parallelization strategies can be applied to the mesh itself, or its sub-mesh, and it automatically handles the communication patterns.

 ### DTensor

-Abbreviation for Distributed Tensor, `DTensor` is a tensor subclass that handles the distributed logic on-top of the usual tensor operations. Most of the model weights in case of tensor parallelism are stored as `DTensor`s (with some exceptions, more on that later).
-The most important part of DTensor, that is crucial to understand, is the `placement` attribute. It's an attribute that tells PyTorch how is the tensor placed on the devices of the `DeviceMesh`.
+`DTensor` (Distributed Tensor) is a tensor subclass that handles the distributed logic on top of the usual tensor operations. Most of the model weights in tensor parallelism are stored as `DTensor`s.

-It can have the following values:
+The most important part of DTensor is the `placement` attribute because it tells PyTorch how a tensor is placed on the devices in `DeviceMesh`. The `placement` attribute can take the following values.

- `Shard(dimension)` - Annotates that this `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. For example, if we would like to shard weights for column-wise partitioning, we would do:
-```python
-weight = ...
-weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
-bias = ...
-bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
-```
+- `Shard(dimension)` - Indicates how a `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. The example below demonstrates how to shard weights over different dimensions for column-wise partitioning.

-To give another example, for row-wise partitioning, we would do:
-```python
-weight = ...
-weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
-bias = ...
-bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
-```
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # Shard across the 1st (column-wise) dimension
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # Shard across the ONLY dimension
+    ```

- `Replicate()` - Annotates that this `DTensor` is replicated across the `DeviceMesh`. Very straight-forward, only creates a full copy of the tensor on each device.
- `Partial()` - This placement is mostly of no interest to us, it's used to annotate that this tensor is pending a reduction operation.
+    This example demonstrates how to shard weights over different dimensions for row-wise partitioning.
+
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # Shard across the 2nd (row-wise) dimension
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
+    ```
+
+- `Replicate()` - Indicates a `DTensor` is replicated across the `DeviceMesh`. It only creates a full copy of the tensor on each device.
+
+    ```py
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # Replicate bias across all GPUs
+    ```
+
+- `Partial()` - Indicates a tensor is pending a reduction operation (not typically relevant for usage in Transformers).
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@ -91,6 +91,8 @@ Tensor parallelism distributes large tensor computations across multiple GPUs. T

 Tensor parallelism is effective for training large models that don't fit into the memory of a single GPU. It is also faster and more efficient because each GPU can process its tensor slice in parallel, and it can be combined with other parallelism methods. Like other parallelism methods though, tensor parallelism adds communication overhead between GPUs.

+Refer to the [Tensor parallelism](./perf_infer_gpu_multi) guide to learn how to use it for inference.
+
 ## Hybrid parallelism

 Parallelism methods can be combined to achieve even greater memory savings and more efficiently train models with billions of parameters.
--- a/docs/source/en/perf_train_tpu_tf.md
+++ b/docs/source/en/perf_train_tpu_tf.md
@ -1,355 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# TPU
-
-TPU (Tensor Processing Unit) is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu).
-
-This guide focuses on training a Keras model for sequence classification on a TPU from Google Colab. Make sure the TPU runtime is enabled by going to **Runtime > Change runtime type** and selecting a TPU.
-
-Run the command below to install the latest version of Transformers and [Datasets](https://huggingface.co/docs/datasets).
-
-```py
-!pip install --U transformers datasets
-```
-
-Create an instance of [tf.distribute.cluster_resolver.TPUClusterResolver](https://www.tensorflow.org/api_docs/python/tf/distribute/cluster_resolver/TPUClusterResolver), and then connect to the remote cluster and initialize the TPUs.
-
-```py
-import tensorflow as tf
-
-resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
-tf.config.experimental_connect_to_cluster(resolver)
-tf.tpu.experimental.initialize_tpu_system(resolver)
-```
-
-There are various distribution strategies for running your model on multiple TPUs. The [tpu.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) offers synchronized distributed training.
-
-```py
-strategy = tf.distribute.TPUStrategy(resolver)
-```
-
-Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid [XLA compilation issues](#xla).
-
-```py
-from transformers import AutoTokenizer
-from datasets import load_dataset
-import numpy as np
-
-dataset = load_dataset("glue", "cola")["train"]
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-
-train_data = tokenizer(
-    dataset["sentence"],
-    padding="max_length",
-    truncation=True,
-    max_length=128,
-    return_tensors="np",
-)
-train_data = dict(train_data)
-train_labels = np.array(dataset["label"])
-```
-
-The model **must** be created inside [Strategy.scope](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy#scope) in order to replicate the model layers on each TPU device.
-
-```py
-from transformers import TFAutoModelForSequenceClassification
-
-with strategy.scope():
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
-    model.compile(optimizer="adam")
-```
-
-TPUs only accept [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) inputs unlike the Keras [fit](https://keras.io/api/models/model_training_apis/#fit-method) method which accepts a broader range of inputs.
-
-```py
-BATCH_SIZE = 8 * strategy.num_replicas_in_sync
-
-tf_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
-tf_dataset = tf_dataset.shuffle(len(tf_dataset))
-tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
-```
-
-Finally, call [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start training.
-
-```py
-model.fit(tf_dataset)
-```
-
-## Large datasets
-
-The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) or stream the data.
-
-### tf.TFRecord
-
-[tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) is the standard [tf.data](https://www.tensorflow.org/guide/data) format for storing training data. For very large training jobs, it's worth preprocessing your data and storing it in the `tf.TFRecord` format and building a `tf.data` pipeline on top. Refer to the table below to help you decide whether `tf.TFRecord` is helpful for you.
-
-| pros | cons |
-|---|---|
-| works on all TPU instances | costs associated with cloud storage |
-| supports huge datasets and massive throughput | some data types (images) can take a lot of space to store |
-| suitable for training on entire TPU pods |  |
-| preprocessing is done in advance, maximizing training speed |  |
-
-Preprocess and tokenize the dataset before writing it to a `tf.TFRecord` to avoid writing every time the data is loaded.
-
-An exception is made for *train-time augmentations*, because augmentations applied after writing to a `tf.TFRecord` results in the same augmentation for each epoch. Instead, apply augmentations in the `tf.data` pipeline that loads the data.
-
-> [!TIP]
-> In practice, you probably won't be able to load the entire dataset in memory. Load a chunk of the dataset at a time and convert it to `TFRecord`, and repeat until the entire dataset is in the `TFRecord` format. Then you can use a list of all the files to create a `TFRecordDataset`. The example below demonstrates a single file for simplicity.
-
-```py
-tokenized_data = tokenizer(
-    dataset["sentence"],
-    padding="max_length",
-    truncation=True,
-    max_length=128,
-    return_tensors="np",
-)
-labels = dataset["label"]
-
-with tf.io.TFRecordWriter("dataset.tfrecords") as file_writer:
-    for i in range(len(labels)):
-        features = {
-            "input_ids": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i])
-            ),
-            "attention_mask": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i])
-            ),
-            "labels": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=[labels[i]])
-            ),
-        }
-        features = tf.train.Features(feature=features)
-        example = tf.train.Example(features=features)
-        record_bytes = example.SerializeToString()
-        file_writer.write(record_bytes)
-```
-
-Build a [TFRecordDataset](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset) using the saved filename to load it.
-
-```py
-def decode_fn(sample):
-    features = {
-        "input_ids": tf.io.FixedLenFeature((128,), dtype=tf.int64),
-        "attention_mask": tf.io.FixedLenFeature((128,), dtype=tf.int64),
-        "labels": tf.io.FixedLenFeature((1,), dtype=tf.int64),
-    }
-    return tf.io.parse_example(sample, features)
-
-# TFRecordDataset can handle gs:// paths
-tf_dataset = tf.data.TFRecordDataset(["gs://matt-tf-tpu-tutorial-datasets/cola/dataset.tfrecords"])
-tf_dataset = tf_dataset.map(decode_fn)
-tf_dataset = tf_dataset.shuffle(len(dataset)).batch(BATCH_SIZE, drop_remainder=True)
-tf_dataset = tf_dataset.apply(
-    tf.data.experimental.assert_cardinality(len(labels) // BATCH_SIZE)
-)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-model.fit(tf_dataset)
-```
-
-### Stream from raw data
-
-Data can be stored in its native format and preprocessed in a [tf.data](https://www.tensorflow.org/guide/data) pipeline as the data is loaded. This approach isn't supported for many models with complex tokenization schemes, but some models like BERT are supported because their tokenization can be compiled. Refer to the table below to help you decide whether this approach is helpful for you.
-
-| pros | cons |
-|---|---|
-| suitable for highly compressed big data in native format (images, audio) | requires writing a full preprocessing pipeline |
-| convenient if raw data is available in a public cloud bucket | complex preprocessing on-the-fly can hurt throughput |
-| works on all TPU instances if data is stored in Google Cloud | must place data in cloud storage if not already there |
-|  | not as suitable for text data because writing a tokenization pipeline is hard (use `TFRecord` for text) |
-
-The example below demonstrates streaming data for an image model.
-
-Load an image dataset and get a list of the underlying image file paths and labels.
-
-```py
-from datasets import load_dataset
-
-image_dataset = load_dataset("beans", split="train")
-filenames = image_dataset["image_file_path"]
-labels = image_dataset["labels"]
-```
-
-Convert the local filenames in the dataset into `gs://` paths in Google Cloud Storage.
-
-```py
-# strip everything but the category directory and filenames
-base_filenames = ['/'.join(filename.split('/')[-2:]) for filename in filenames]
-# prepend the Google Cloud base path to everything instead
-gs_paths = ["gs://matt-tf-tpu-tutorial-datasets/beans/"+filename for filename in base_filenames]
-
-# create tf_dataset
-tf_dataset = tf.data.Dataset.from_tensor_slices(
-    {"filename": gs_paths, "labels": labels}
-)
-tf_dataset = tf_dataset.shuffle(len(tf_dataset))
-```
-
-Transformers preprocessing classes like [`AutoImageProcessor`] are framework-agnostic and can't be compiled into a pipeline by `tf.data`. To get around this, get the normalization values (`mean` and `std`) from the [`AutoImageProcessor`] and use them in the `tf.data` pipeline.
-
-```py
-from transformers import AutoImageProcessor
-
-processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
-image_size = (processor.size["height"], processor.size["width"])
-image_mean = processor.image_mean
-image_std = processor.image_std
-```
-
-Use these normalization values to create a function to load and preprocess the images.
-
-```py
-BATCH_SIZE = 8 * strategy.num_replicas_in_sync
-
-def decode_fn(sample):
-    image_data = tf.io.read_file(sample["filename"])
-    image = tf.io.decode_jpeg(image_data, channels=3)
-    image = tf.image.resize(image, image_size)
-    array = tf.cast(image, tf.float32)
-    array /= 255.0
-    array = (array - image_mean) / image_std
-    array = tf.transpose(array, perm=[2, 0, 1])
-    return {"pixel_values": array, "labels": sample["labels"]}
-
-tf_dataset = tf_dataset.map(decode_fn)
-tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
-print(tf_dataset.element_spec)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-from transformers import TFAutoModelForImageClassification
-
-with strategy.scope():
-    model = TFAutoModelForImageClassification.from_pretrained(image_model_checkpoint)
-    model.compile(optimizer="adam")
-
-model.fit(tf_dataset)
-```
-
-### Stream with prepare_tf_dataset
-
-[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline streams data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you.
-
-| pros | cons |
-|---|---|
-| simple code | only works on TPU VM |
-| same approach on TPU/GPU | data must be available as a Hugging Face Dataset |
-| dataset doesn't have to fit in memory | data must fit on local storage |
-| supports variable padding | data loading may be a bottleneck on a big TPU pod slice |
-
-[`~TFPreTrainedModel.prepare_tf_dataset`] only works on [TPU VM](#tpu-types). Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline.
-
-```py
-def tokenize_function(examples):
-    return tokenizer(
-        examples["sentence"], padding="max_length", truncation=True, max_length=128
-    )
-# add the tokenizer output to the dataset as new columns
-dataset = dataset.map(tokenize_function)
-
-# prepare_tf_dataset() chooses columns that match the models input names
-tf_dataset = model.prepare_tf_dataset(
-    dataset, batch_size=BATCH_SIZE, shuffle=True, tokenizer=tokenizer
-)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-
-with strategy.scope():
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
-    model.compile(optimizer="adam")
-
-model.fit(tf_dataset)
-```
-
-## TPU types
-
-There are two types of TPUs, a TPU Node and a TPU VM.
-
-A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline, and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it.
-
-TPU VM are connected directly to the machine the TPU is located on, and they are generally easier to work with, especially when it comes to your data pipeline.
-
-> [!TIP]
-> We recommend avoiding TPU Nodes if possible because it is more difficult to debug than TPU VMs. TPU Nodes may also be unsupported in the future and become a legacy access method.
-
-A single TPU (v2-8, v3-8, v4-8) runs 8 replicas. TPUs can exist in **pods** which run hundreds or even thousands of replicas simultaneously. When you only use a portion of a pod, it is referred to as a **pod slice**. On Google Colab, you'll typically get a single v2-8 TPU.
-
-## XLA
-
-[XLA](https://openxla.org/xla) is a linear algebra compiler for high-performance execution and it is used by default to improve performance on TPUs.
-
-Before executing your code on a TPU, it's a good idea to try it first on a CPU or GPU because it is easier to debug. You can train for a few steps to make sure the model and data pipeline work as expected. Set `jit_compile=True` in the [compile](https://keras.io/api/models/model_training_apis/#compile-method) method to enable XLA compilation (but remember to remove this line of code before running on a TPU).
-
-The section below outlines three rules for making your code XLA-compatible. Transformers enforce the first two rules for models and loss functions by default, but don't forget about them if you're writing your own models and loss functions.
-
-### Data dependent conditionals
-
-Any `if` statements cannot depend on values inside a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor). The code below can't be compiled by XLA.
-
-```py
-if tf.reduce_sum(tensor) > 10:
-    tensor = tensor / 2.0
-```
-
-To compile with XLA, use [tf.cond](https://www.tensorflow.org/api_docs/python/tf/cond) or remove the conditional and use indicator variables instead as shown below.
-
-```py
-sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32)
-tensor = tensor / (1.0 + sum_over_10)
-```
-
-### Data dependent shapes
-
-The shape of a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) cannot depend on their values. For example, [tf.unique](https://www.tensorflow.org/api_docs/python/tf/unique) can't be compiled because it returns a tensor containing an instance of each unique value in the input. The shape of this output depends on how repetitive the input [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) is.
-
-This is an issue during **label masking**, where labels are set to a negative value to indicate they should be ignored when computing the loss. The code below can't be compiled by XLA because the shape of `masked_outputs` and `masked_labels` depend on how many positions are masked.
-
-```py
-label_mask = labels >= 0
-masked_outputs = outputs[label_mask]
-masked_labels = labels[label_mask]
-loss = compute_loss(masked_outputs, masked_labels)
-mean_loss = torch.mean(loss)
-```
-
-To compile with XLA, avoid the data-dependent shapes by computing the loss for every position and zeroing out the masked positions in both the numerator and denominator when calculating the mean. Convert `tf.bool` to `tf.float32` as an indicator variable to make your code XLA-compatible.
-
-```py
-label_mask = tf.cast(labels >= 0, tf.float32)
-loss = compute_loss(outputs, labels)
-loss = loss * label_mask
-mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask)
-```
-
-### Recompile different input shapes
-
-XLA recompiles your model if input shapes are variable which create huge performance problems. It is especially common in text models because input texts have variable lengths after tokenization.
-
-> [!WARNING]
-> Execessive padding can also severely slow down training because requires more compute and memory to process.
-
-To avoid different shapes, use padding to pad all your inputs to the same length and use an `attention_mask`. Try padding batches of samples to a multiple of 32 or 64 tokens. Use the parameters `padding="max_length"`, `padding="longest"`, or `pad_to_multiple_of` to help with padding. This often increases the number of tokens by a small amount, but it significantly reduces the number of unique input shapes because every input shape is a multiple of 32 or 64. Fewer unique input shapes requires fewer recompilation.
--- a/docs/source/en/quantization/finegrained_fp8.md
+++ b/docs/source/en/quantization/finegrained_fp8.md
@ -47,7 +47,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="

 tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)

 output = quantized_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -49,6 +49,7 @@ Check the table below to see if your hardware is compatible.
 | Component | Compatibility |
 |----------|----------------|
 | CUDA Versions | ✅ cu118, cu126, cu128 |
+| XPU Versions | ✅ pytorch2.8 |
 | CPU | ✅ change `device_map="cpu"` (see examples below) |


@ -278,6 +279,71 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 </hfoption>
 </hfoptions>

+### Intel XPU
+<hfoptions id="examples-Intel-XPU">
+<hfoption id="int8-dynamic-and-weight-only">
+    
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+
+quant_config = Int8DynamicActivationInt8WeightConfig()
+# or int8 weight only quantization
+# quant_config = Int8WeightOnlyConfig()
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+
+<hfoption id="int4-weight-only">
+
+```py
+import torch
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from torchao.quantization import Int4WeightOnlyConfig
+from torchao.dtypes import Int4XPULayout
+from torchao.quantization.quant_primitives import ZeroPointDomain
+
+
+quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4XPULayout(), zero_point_domain=ZeroPointDomain.INT)
+quantization_config = TorchAoConfig(quant_type=quant_config)
+
+# Load and quantize the model
+quantized_model = AutoModelForCausalLM.from_pretrained(
+    "meta-llama/Llama-3.1-8B-Instruct",
+    torch_dtype="auto",
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
+input_text = "What are we having for dinner?"
+input_ids = tokenizer(input_text, return_tensors="pt").to("xpu")
+
+# auto-compile the quantized model with `cache_implementation="static"` to get speed up
+output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+</hfoption>
+</hfoptions>
+
+
 ### CPU
 <hfoptions id="examples-CPU">
 <hfoption id="int8-dynamic-and-weight-only">
@ -363,7 +429,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Manual Testing
 prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device.type)
 generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
 output_text = tokenizer.batch_decode(
    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
@ -434,7 +500,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device.type)

 # auto-compile the quantized model with `cache_implementation="static"` to get speed up
 output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
@ -474,7 +540,7 @@ tokenizer.push_to_hub(f"{USER_ID}/llama3-8b-int4wo-128")

 ## Loading quantized models

-Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA.
+Loading a quantized model depends on the quantization scheme. For quantization schemes, like int8 and float8, you can quantize the model on any device and also load it on any device. The example below demonstrates quantizing a model on the CPU and then loading it on CUDA or XPU.
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
@ -491,7 +557,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(
    quantization_config=quantization_config
 )
 # save the quantized model
-output_dir = "llama-3.1-8b-torchao-int8-cuda"
+output_dir = "llama-3.1-8b-torchao-int8"
 quantized_model.save_pretrained(output_dir, safe_serialization=False)

 # reload the quantized model
@ -502,7 +568,7 @@ reloaded_model = AutoModelForCausalLM.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
 input_text = "What are we having for dinner?"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(reloaded_model.device.type)

 output = reloaded_model.generate(**input_ids, max_new_tokens=10)
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -16,7 +16,9 @@ rendered properly in your Markdown viewer.

 # Serving

-Transformer models can be served for inference with specialized libraries such as Text Generation Inference (TGI) and vLLM. These libraries are specifically designed to optimize performance with LLMs and include many unique optimization features that may not be included in Transformers.
+Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users.
+
+You can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.

 ## TGI

@ -61,4 +63,167 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
    --trust-remote-code
-```
+```
+
+## Serve CLI
+
+> [!WARNING]
+> This section is experimental and subject to change in future versions.
+
+<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
+You can serve LLMs supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers a chat Completions API compatible with the OpenAI SDK, which is the _de facto_ standard for LLM conversations. This way, you can use the server from many third-party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)).
+
+To launch a server, simply use the `transformers serve` CLI command:
+
+```shell
+transformers serve
+```
+
+The simplest way to interact with the server is through our `transformers chat` CLI:
+
+```shell
+transformers chat localhost:8000 --model-name-or-path Qwen/Qwen3-4B
+```
+
+or by sending an HTTP request with `cURL`, e.g.
+
+```shell
+curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
+```
+
+from which you'll receive multiple chunks in the Completions API format:
+
+```shell
+data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
+
+data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
+
+(...)
+```
+
+The server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools.
+
+> [!TIP]
+> At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.
+
+<!-- TODO: example with a minimal python example, and explain that it is possible to pass a full generation config in the request -->
+
+
+### Usage example 1: apps with local requests (feat. Jan)
+
+This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
+
+To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
+
+```shell
+http://[host]:[port]/v1
+```
+
+where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section after hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
+</h3>
+
+You are now ready to chat!
+
+> [!TIP]
+> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
+
+To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following in your Jan machine's terminal:
+
+```shell
+ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
+```
+
+Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running on a different machine with an app of your choice.
+
+
+### Usage example 2: apps with external requests (feat. Cursor)
+
+This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. Unlike in the previous example, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
+
+To launch our server with CORS enabled, run
+
+```shell
+transformers serve --enable-cors
+```
+
+We'll also need to expose our server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
+
+```shell
+ngrok http [port]
+```
+
+where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
+</h3>
+
+We're now ready to set things up on the app side! In Cursor, while we can't set a new provider, we can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set our `transformers serve` endpoint, follow this order:
+1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
+2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`);
+3. Add some random text to the "OpenAI API Key" field. This field won't be used, but it can't be empty;
+4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
+5. Hit "Verify".
+
+After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
+</h3>
+
+You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
+</h3>
+
+
+### Usage example 3: `tiny-agents` CLI and MCP Tools
+
+To showcase the use of MCP tools, let's see how to integrate the `transformers serve` server with the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI.
+
+> [!TIP]
+> Many Hugging Face Spaces can be used as MCP servers, as in this example. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
+
+The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
+
+```json
+{
+    "model": "Menlo/Jan-nano",
+    "endpointUrl": "http://localhost:8000",
+    "servers": [
+        {
+            "type": "sse",
+            "config": {
+                "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
+            }
+        }
+    ]
+}
+```
+
+You can then launch your `tiny-agents` chat interface with the following command.
+
+```bash
+tiny-agents run path/to/your/config.json
+```
+
+If you have `transformers serve` running in the background, you're ready to use MCP tools from a local model! For instance, here's an example of a chat session with `tiny-agents`:
+
+```bash
+Agent loaded with 1 tools:
+ • flux1_schnell_infer
+»  Generate an image of a cat on the moon
+<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
+
+Tool req_0_tool_call
+[Binary Content: Image image/webp, 57732 bytes]
+The task is complete and the content accessible to the User
+Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
+380576952
+
+I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
+```
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@ -1,129 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# XLA
-
-[[open-in-colab]]
-
-[Accelerated Linear Algebra (XLA)](https://openxla.org/xla) is a linear algebra compiler that optimizes model runtime across different hardware and frameworks.
-
-This guide will look specifically at how to accelerate *TensorFlow* models with XLA.
-
-## TensorFlow
-
-XLA can potentially accelerate a TensorFlow model without making any source code changes. It is already packaged with the TensorFlow library, and it is triggered with `jit_compile` in any graph creating function such as [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-If you're using Keras methods like [fit](https://keras.io/api/models/model_training_apis/#fit-method) and [predict](https://keras.io/api/models/model_training_apis/#predict-method), enable XLA by passing `jit_compile=True` to [compile](https://keras.io/api/models/model_training_apis/#compile-method).
-
-```py
-model.compile(jit_compile=True)
-```
-
-XLA can be used to accelerate any arbitrary [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The speed up depends on a model, but in general, TensorFlow models in Transformers get a ~100x speed up.
-
-### Functions
-
-A typical forward pass in a TensorFlow model is shown below. To run a forward pass with XLA, wrap the model with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function) and set `jit_compile=True`.
-
-```diff
-import tensorflow as tf
-
-model = tf.keras.Sequential(
-    [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
-)
-# Generate random inputs for the model.
-batch_size = 16
-input_vector_dim = 10
-random_inputs = tf.random.normal((batch_size, input_vector_dim))
-
-# Run a forward pass.
- _ = model(random_inputs)
-+ xla_fn = tf.function(model, jit_compile=True)
-+ _ = xla_fn(random_inputs)
-```
-
-The default `call` function of the model is used to compile the XLA graph. But if there's any other model function you want to compile with XLA, wrap them with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-```py
-my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
-```
-
-### Text generation
-
-You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping [`~TFGenerationMixin.generate`] with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-# Will error if the minimal version of Transformers is not installed.
-from transformers.utils import check_min_version
-
-check_min_version("4.21.0")
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-tokenized_input = tokenizer(input_string, return_tensors="tf")
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-"Generated -- TensorFlow is an open-source, open-source, distributed-source application framework for the"
-```
-
-## Tracing
-
-When executing an XLA-enabled function for the first time, it tries to infer the computation graph in a process known as *tracing*. This is a time-consuming step, but any consecutive calls to the function will be much faster because it won't have to trace the computation graph again.
-
-To ensure a function is only traced once, the inputs must have the same shape as when the graph was built. This usually isn't an issue for fixed input shapes like images, but it can be an issue for inputs with variable shapes like text.
-
-One way to handle this is to pad your text so it always has the same shape. Configure padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer.
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-# Call tokenizer with padding options.
-tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-```
-
-In addition to the input shape, any changes to the generation options at any point also triggers tracing.
-
-## Resources
-
-Learn more about XLA with the following resources.
-
- A [notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) demonstrating XLA-compatible encoder-decoder and decoder-only text generation models.
- The [Faster Text Generation with TensorFlow and XLA](https://hf.co/blog/tf-xla-generate) blog post compares benchmarks for XLA-compatible models and provides a friendly introduction to XLA in TensorFlow.
- The [How Hugging Face improved Text Generation performance with XLA](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) blog post discusses the design philosophy behind adding XLA to TensorFlow models in Transformers.
- The [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) guide.
- The [Better performance with tf.function](https://www.tensorflow.org/guide/function) guide.
- The [XLA](https://openxla.org/xla) documentation.
--- a/docs/source/en/tools.md
+++ b/docs/source/en/tools.md
@ -14,5 +14,9 @@ rendered properly in your Markdown viewer.

 -->

+# Tools
+
+(deprecated)
+
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@ -59,7 +59,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@ -44,7 +44,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/image-pretraining/run_mae.py
+++ b/examples/pytorch/image-pretraining/run_mae.py
@ -42,7 +42,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@ -47,7 +47,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@ -52,7 +52,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/language-modeling/run_fim.py
+++ b/examples/pytorch/language-modeling/run_fim.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -59,7 +59,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@ -45,7 +45,7 @@ from transformers.utils import check_min_version, send_example_telemetry


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = logging.getLogger(__name__)

--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -53,7 +53,7 @@ from transformers.utils import check_min_version, send_example_telemetry


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)
 # You should update this to your particular problem to have better documentation of `model_type`
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logging.basicConfig(level=logging.INFO)
 logger = get_logger(__name__)
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.53.0.dev0")
+check_min_version("4.54.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/Show More
+++ b/Show More