Add ModernBERT Decoder Models - ModernBERT, but trained with CLM! (#38967 )

* working locally; need to style and test * added docs and initial tests; need to debug and flesh out * fixed tests * working long context; batches * working fa2 and eager * update tests * add missing confnigs * remove default autoset * fix spacing * fix most tests * fixed tests * fix to init * refactor to match new transformers updates * remove static cache option * fa2 fix * fix docs * in progress * working on tests * fixed issue with attn outputs * remove debug * fix local config attr * update doc string * fix docstring * add docs to toc * correct typo in toc * add new updates from main w.r.t. ModernBERT RoPE * fix local param --------- Co-authored-by: oweller2 <oweller2@dsailogin.mgmt.ai.cluster> Co-authored-by: oweller2 <oweller2@l07.mgmt.ai.cluster> Co-authored-by: oweller2 <oweller2@n02.mgmt.ai.cluster> Co-authored-by: oweller2 <oweller2@l08.mgmt.ai.cluster> Co-authored-by: oweller2 <oweller2@l01.mgmt.ai.cluster> Co-authored-by: oweller2 <oweller2@l02.mgmt.ai.cluster>
Fix typo in /v1/models output payload (#39414 )
2025-10-21 01:23:56 +08:00 · 2025-07-15 10:40:41 +02:00 · 2025-07-15 08:59:25 +01:00 · 2025-07-15 09:34:06 +02:00 · 2025-07-14 17:47:19 +00:00 · 2025-07-14 10:35:17 -07:00
706 changed files with 33313 additions and 18097 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -303,7 +303,7 @@ non_model_job = CircleCIJob(
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    # networkx==3.3 (after #36957) cause some issues
    # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install ."],
+    install_steps=["uv venv && uv pip install .[serving]"],
    marker="not generate",
    parallelism=6,
 )
--- a/.github/workflows/get-pr-info.yml
+++ b/.github/workflows/get-pr-info.yml
@ -0,0 +1,157 @@
+name: Get PR commit SHA
+on:
+  workflow_call:
+    inputs:
+      pr_number:
+        required: true
+        type: string
+    outputs:
+      PR_HEAD_REPO_FULL_NAME:
+        description: "The full name of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_FULL_NAME }}
+      PR_BASE_REPO_FULL_NAME:
+        description: "The full name of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_FULL_NAME }}
+      PR_HEAD_REPO_OWNER:
+        description: "The owner of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}
+      PR_BASE_REPO_OWNER:
+        description: "The owner of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_OWNER }}
+      PR_HEAD_REPO_NAME:
+        description: "The name of the repository from which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}
+      PR_BASE_REPO_NAME:
+        description: "The name of the repository to which the pull request is created"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REPO_NAME }}
+      PR_HEAD_REF:
+        description: "The branch name of the pull request in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_REF }}
+      PR_BASE_REF:
+        description: "The branch name in the base repository (to merge into)"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_REF }}
+      PR_HEAD_SHA:
+        description: "The head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_SHA }}
+      PR_BASE_SHA:
+        description: "The head sha of the target branch in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_BASE_SHA }}
+      PR_MERGE_COMMIT_SHA:
+        description: "The sha of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_SHA }}
+      PR_HEAD_COMMIT_DATE:
+        description: "The date of the head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_DATE }}
+      PR_MERGE_COMMIT_DATE:
+        description: "The date of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_HEAD_COMMIT_TIMESTAMP:
+        description: "The timestamp of the head sha of the pull request branch in the head repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_HEAD_COMMIT_TIMESTAMP }}
+      PR_MERGE_COMMIT_TIMESTAMP:
+        description: "The timestamp of the merge commit for the pull request (created by GitHub) in the base repository"
+        value: ${{ jobs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+      PR:
+        description: "The PR"
+        value: ${{ jobs.get-pr-info.outputs.PR }}
+      PR_FILES:
+        description: "The files touched in the PR"
+        value: ${{ jobs.get-pr-info.outputs.PR_FILES }}
+
+
+jobs:
+  get-pr-info:
+    runs-on: ubuntu-22.04
+    name: Get PR commit SHA better
+    outputs:
+      PR_HEAD_REPO_FULL_NAME: ${{ steps.pr_info.outputs.head_repo_full_name }}
+      PR_BASE_REPO_FULL_NAME: ${{ steps.pr_info.outputs.base_repo_full_name }}
+      PR_HEAD_REPO_OWNER: ${{ steps.pr_info.outputs.head_repo_owner }}
+      PR_BASE_REPO_OWNER: ${{ steps.pr_info.outputs.base_repo_owner }}
+      PR_HEAD_REPO_NAME: ${{ steps.pr_info.outputs.head_repo_name }}
+      PR_BASE_REPO_NAME: ${{ steps.pr_info.outputs.base_repo_name }}
+      PR_HEAD_REF: ${{ steps.pr_info.outputs.head_ref }}
+      PR_BASE_REF: ${{ steps.pr_info.outputs.base_ref }}
+      PR_HEAD_SHA: ${{ steps.pr_info.outputs.head_sha }}
+      PR_BASE_SHA: ${{ steps.pr_info.outputs.base_sha }}
+      PR_MERGE_COMMIT_SHA: ${{ steps.pr_info.outputs.merge_commit_sha }}
+      PR_HEAD_COMMIT_DATE: ${{ steps.pr_info.outputs.head_commit_date }}
+      PR_MERGE_COMMIT_DATE: ${{ steps.pr_info.outputs.merge_commit_date }}
+      PR_HEAD_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.head_commit_timestamp }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ steps.get_timestamps.outputs.merge_commit_timestamp }}
+      PR: ${{ steps.pr_info.outputs.pr }}
+      PR_FILES: ${{ steps.pr_info.outputs.files }}
+    if: ${{ inputs.pr_number != '' }}
+    steps:
+      - name: Extract PR details
+        id: pr_info
+        uses: actions/github-script@v6
+        with:
+          script: |            
+            const { data: pr } = await github.rest.pulls.get({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: ${{ inputs.pr_number }}
+            });
+
+            const { data: head_commit }  = await github.rest.repos.getCommit({
+              owner: pr.head.repo.owner.login,
+              repo: pr.head.repo.name,
+              ref: pr.head.ref
+            });
+
+            const { data: merge_commit }  = await github.rest.repos.getCommit({
+              owner: pr.base.repo.owner.login,
+              repo: pr.base.repo.name,
+              ref: pr.merge_commit_sha,
+            });
+
+            const { data: files } = await github.rest.pulls.listFiles({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              pull_number: ${{ inputs.pr_number }}
+            });
+
+            core.setOutput('head_repo_full_name', pr.head.repo.full_name);
+            core.setOutput('base_repo_full_name', pr.base.repo.full_name);
+            core.setOutput('head_repo_owner', pr.head.repo.owner.login);
+            core.setOutput('base_repo_owner', pr.base.repo.owner.login);
+            core.setOutput('head_repo_name', pr.head.repo.name);
+            core.setOutput('base_repo_name', pr.base.repo.name);
+            core.setOutput('head_ref', pr.head.ref);
+            core.setOutput('base_ref', pr.base.ref);
+            core.setOutput('head_sha', pr.head.sha);
+            core.setOutput('base_sha', pr.base.sha);
+            core.setOutput('merge_commit_sha', pr.merge_commit_sha);
+            core.setOutput('pr', pr);
+
+            core.setOutput('head_commit_date', head_commit.commit.committer.date);
+            core.setOutput('merge_commit_date', merge_commit.commit.committer.date);
+            
+            core.setOutput('files', files);            
+            
+            console.log('PR head commit:', {
+              head_commit: head_commit,
+              commit: head_commit.commit,
+              date: head_commit.commit.committer.date
+            });
+
+            console.log('PR merge commit:', {
+              merge_commit: merge_commit,
+              commit: merge_commit.commit,
+              date: merge_commit.commit.committer.date
+            });
+
+      - name: Convert dates to timestamps
+        id: get_timestamps
+        run: |
+          head_commit_date=${{ steps.pr_info.outputs.head_commit_date }}
+          merge_commit_date=${{ steps.pr_info.outputs.merge_commit_date }}
+          echo $head_commit_date
+          echo $merge_commit_date
+          head_commit_timestamp=$(date -d "$head_commit_date" +%s)
+          merge_commit_timestamp=$(date -d "$merge_commit_date" +%s)
+          echo $head_commit_timestamp
+          echo $merge_commit_timestamp
+          echo "head_commit_timestamp=$head_commit_timestamp" >> $GITHUB_OUTPUT
+          echo "merge_commit_timestamp=$merge_commit_timestamp" >> $GITHUB_OUTPUT
--- a/.github/workflows/get-pr-number.yml
+++ b/.github/workflows/get-pr-number.yml
@ -0,0 +1,36 @@
+name: Get PR number
+on:
+  workflow_call:
+    outputs:
+      PR_NUMBER:
+        description: "The extracted PR number"
+        value: ${{ jobs.get-pr-number.outputs.PR_NUMBER }}
+
+jobs:
+  get-pr-number:
+    runs-on: ubuntu-22.04
+    name: Get PR number
+    outputs:
+      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
+    steps:
+      - name: Get PR number
+        shell: bash
+        run: |
+          if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event.pull_request.number }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.pull_request.number }}" >> $GITHUB_ENV
+          elif [[ "${{ github.event.pull_request }}" != "" ]]; then
+            echo "PR_NUMBER=${{ github.event.number }}" >> $GITHUB_ENV
+          else
+            echo "PR_NUMBER=" >> $GITHUB_ENV
+          fi
+
+      - name: Check PR number
+        shell: bash
+        run: |
+          echo "${{ env.PR_NUMBER }}"
+
+      - name: Set PR number
+        id: set_pr_number
+        run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT"
--- a/.github/workflows/pr_run_slow_ci.yml
+++ b/.github/workflows/pr_run_slow_ci.yml
@ -0,0 +1,199 @@
+name: PR slow CI
+on:
+  pull_request_target:
+    types: [opened, synchronize, reopened]
+
+jobs:
+  get-pr-number:
+    name: Get PR number
+    uses: ./.github/workflows/get-pr-number.yml
+
+  get-pr-info:
+    name: Get PR commit SHA
+    needs: get-pr-number
+    if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
+    uses: ./.github/workflows/get-pr-info.yml
+    with:
+      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
+
+  # We only need to verify the timestamp if the workflow is triggered by `issue_comment`.
+  verity_pr_commit:
+    name: Verity PR commit corresponds to a specific event by comparing timestamps
+    if: ${{ github.event.comment.created_at != '' }}
+    runs-on: ubuntu-22.04
+    needs: get-pr-info
+    env:
+      COMMENT_DATE: ${{ github.event.comment.created_at }}
+      PR_MERGE_COMMIT_DATE: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_DATE }}
+      PR_MERGE_COMMIT_TIMESTAMP: ${{ needs.get-pr-info.outputs.PR_MERGE_COMMIT_TIMESTAMP }}
+    steps:
+      - run: |
+          COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s")
+          echo "COMMENT_DATE: $COMMENT_DATE"
+          echo "PR_MERGE_COMMIT_DATE: $PR_MERGE_COMMIT_DATE"
+          echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP"
+          echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP"
+          if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then
+            echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!";
+            exit -1;
+          fi
+
+  get-jobs:
+    name: Get test files to run
+    runs-on: ubuntu-22.04
+    needs: [get-pr-number, get-pr-info]
+    outputs:
+      jobs: ${{ steps.get_jobs.outputs.jobs_to_run }}
+    steps:
+      - name: Get repository content
+        id: repo_content
+        uses: actions/github-script@v6
+        with:
+          script: |
+            const { data: tests_dir } = await github.rest.repos.getContent({
+              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
+              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
+              path: 'tests',
+              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
+            });
+
+            const { data: tests_models_dir } = await github.rest.repos.getContent({
+              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
+              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
+              path: 'tests/models',
+              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
+            });
+
+            const { data: tests_quantization_dir } = await github.rest.repos.getContent({
+              owner: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_OWNER }}',
+              repo: '${{ needs.get-pr-info.outputs.PR_HEAD_REPO_NAME }}',
+              path: 'tests/quantization',
+              ref: '${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}',
+            });
+
+            core.setOutput('tests_dir', tests_dir);
+            core.setOutput('tests_models_dir', tests_models_dir);
+            core.setOutput('tests_quantization_dir', tests_quantization_dir);
+
+      # This checkout to the main branch
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: "0"
+
+      - name: Write pr_files file
+        run: |
+          cat > pr_files.txt << 'EOF'
+          ${{ needs.get-pr-info.outputs.PR_FILES }}
+          EOF
+
+      - name: Write tests_dir file
+        run: |
+          cat > tests_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_dir }}
+          EOF
+
+      - name: Write tests_models_dir file
+        run: |
+          cat > tests_models_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_models_dir }}
+          EOF
+
+      - name: Write tests_quantization_dir file
+        run: |
+          cat > tests_quantization_dir.txt << 'EOF'
+          ${{ steps.repo_content.outputs.tests_quantization_dir }}
+          EOF
+
+      - name: Run script to get jobs to run
+        id: get_jobs
+        run: |
+          python utils/get_pr_run_slow_jobs.py | tee output.txt
+          echo "jobs_to_run: $(tail -n 1 output.txt)"
+          echo "jobs_to_run=$(tail -n 1 output.txt)" >> $GITHUB_OUTPUT
+
+  send_comment:
+    # Will delete the previous comment and send a new one if:
+    #   - either the content is changed
+    #   - or the previous comment is 30 minutes or more old
+    name: Send a comment to suggest jobs to run
+    if: ${{ needs.get-jobs.outputs.jobs != '' }}
+    needs: [get-pr-number, get-jobs]
+    permissions:
+      pull-requests: write
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Check and update comment if needed
+        uses: actions/github-script@v7
+        env:
+          BODY: "\n\nrun-slow: ${{ needs.get-jobs.outputs.jobs }}"
+        with:
+          script: |
+            const prNumber = ${{ needs.get-pr-number.outputs.PR_NUMBER }};
+            const commentPrefix = "**[For maintainers]** Suggested jobs to run (before merge)";
+            const thirtyMinutesAgo = new Date(Date.now() - 30 * 60 * 1000); // 30 minutes ago
+            const newBody = `${commentPrefix}${process.env.BODY}`;
+            
+            // Get all comments on the PR
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber
+            });
+            
+            // Find existing comments that start with our prefix
+            const existingComments = comments.filter(comment => 
+              comment.user.login === 'github-actions[bot]' && 
+              comment.body.startsWith(commentPrefix)
+            );
+            
+            let shouldCreateNewComment = true;
+            let commentsToDelete = [];
+            
+            if (existingComments.length > 0) {
+              // Get the most recent comment
+              const mostRecentComment = existingComments
+                .sort((a, b) => new Date(b.created_at) - new Date(a.created_at))[0];
+              
+              const commentDate = new Date(mostRecentComment.created_at);
+              const isOld = commentDate < thirtyMinutesAgo;
+              const isDifferentContent = mostRecentComment.body !== newBody;
+              
+              console.log(`Most recent comment created: ${mostRecentComment.created_at}`);
+              console.log(`Is older than 30 minutes: ${isOld}`);
+              console.log(`Has different content: ${isDifferentContent}`);
+              
+              if (isOld || isDifferentContent) {
+                // Delete all existing comments and create new one
+                commentsToDelete = existingComments;
+                console.log(`Will delete ${commentsToDelete.length} existing comment(s) and create new one`);
+              } else {
+                // Content is same and comment is recent, skip
+                shouldCreateNewComment = false;
+                console.log('Comment is recent and content unchanged, skipping update');
+              }
+            } else {
+              console.log('No existing comments found, will create new one');
+            }
+            
+            // Delete old comments if needed
+            for (const comment of commentsToDelete) {
+              console.log(`Deleting comment #${comment.id} (created: ${comment.created_at})`);
+              await github.rest.issues.deleteComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: comment.id
+              });
+            }
+            
+            // Create new comment if needed
+            if (shouldCreateNewComment) {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body: newBody
+              });
+              console.log('✅ New comment created');
+            } else {
+              console.log('ℹ️ No comment update needed');
+            }
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
--- a/.github/workflows/self-scheduled-intel-gaudi.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi.yml
@ -84,8 +84,6 @@ jobs:
      machine_type: ${{ matrix.machine_type }}
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
-      report_name_prefix: run_models_gpu
-
    secrets: inherit

  run_trainer_and_fsdp_gpu:
@ -104,11 +102,10 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      runner: ${{ inputs.runner_scale_set }}-${{ matrix.machine_type }}
      report_name_prefix: run_trainer_and_fsdp_gpu
-
    secrets: inherit

-  run_pipelines_gpu:
-    if: ${{ inputs.job == 'run_pipelines_gpu' }}
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
    name: Pipelines
    strategy:
      fail-fast: false
@ -161,20 +158,20 @@ jobs:

      - name: Run all pipeline tests on Intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_gpu_test_reports tests/pipelines -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_pipelines_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_pipelines_gpu_test_reports
+          name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
    if: ${{ inputs.job == 'run_examples_gpu' }}
@ -248,8 +245,8 @@ jobs:
          name: ${{ env.machine_type }}_run_examples_gpu_test_reports
          path: reports/${{ env.machine_type }}_run_examples_gpu_test_reports

-  run_deepspeed_gpu:
-    if: ${{ inputs.job == 'run_deepspeed_gpu' }}
+  run_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Intel Gaudi deepspeed tests
    strategy:
      fail-fast: false
@ -305,20 +302,20 @@ jobs:

      - name: Run all deepspeed tests on intel Gaudi
        run: |
-          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_deepspeed_gpu_test_reports tests/deepspeed -m "not not_device_test"
+          python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
        run: |
-          cat reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports/failures_short.txt
+          cat reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports"
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ env.machine_type }}_run_deepspeed_gpu_test_reports
-          path: reports/${{ env.machine_type }}_run_deepspeed_gpu_test_reports
+          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

  send_results:
    name: Slack Report
@ -327,8 +324,8 @@ jobs:
        setup,
        run_models_gpu,
        run_examples_gpu,
-        run_pipelines_gpu,
-        run_deepspeed_gpu,
+        run_torch_cuda_extensions_gpu,
+        run_pipelines_torch_gpu,
        run_trainer_and_fsdp_gpu,
      ]
    if: ${{ always() }}
--- a/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
+++ b/.github/workflows/self-scheduled-intel-gaudi3-caller.yml
@ -23,7 +23,7 @@ jobs:
    name: Pipeline CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_pipelines_gpu
+      job: run_pipelines_torch_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
@ -47,7 +47,7 @@ jobs:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled-intel-gaudi.yml
    with:
-      job: run_deepspeed_gpu
+      job: run_torch_cuda_extensions_gpu
      ci_event: Scheduled CI (Intel) - Gaudi3
      runner_scale_set: itac-bm-emr-gaudi3-dell
      slack_report_channel: "#transformers-ci-daily-intel-gaudi3"
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -135,6 +135,7 @@ jobs:
      folder_slices: ${{ needs.setup.outputs.folder_slices }}
      machine_type: ${{ matrix.machine_type }}
      slice_id: ${{ matrix.slice_id }}
+      runner_map: ${{ needs.setup.outputs.runner_map }}
      docker: ${{ inputs.docker }}
      report_name_prefix: run_trainer_and_fsdp_gpu
    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -167,3 +167,6 @@ tags

 # ruff
 .ruff_cache
+
+# modular conversion
+*.modular_backup
--- a/conftest.py
+++ b/conftest.py
@ -28,6 +28,7 @@ from transformers.testing_utils import HfDoctestModule, HfDocTestParser

 NOT_DEVICE_TESTS = {
    "test_tokenization",
+    "test_tokenization_mistral_common",
    "test_processor",
    "test_processing",
    "test_beam_constraints",
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
 RUN uv pip uninstall transformers
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
 RUN uv pip uninstall transformers
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -2,10 +2,10 @@ FROM python:3.9-slim
 ENV PYTHONDONTWRITEBYTECODE=1
 ARG REF=main
 USER root
-RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
+RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs ffmpeg
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
 RUN uv pip uninstall transformers
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -26,10 +26,12 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability

 RUN python3 -m pip uninstall -y flax jax

+RUN python3 -m pip install --no-cache-dir -U timm
+
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -21,7 +21,7 @@ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'p
 # Install latest release PyTorch
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -19,7 +19,7 @@ RUN python3 -m pip uninstall -y torch torchvision torchaudio
 # Install **nightly** release PyTorch (flag `--pre`)
 # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
 # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
-RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

 # `datasets` requires pandas, pandas has some modules compiled with numpy=1.x causing errors
 RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -26,7 +26,7 @@ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch';
 RUN echo torch=$VERSION
 # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
 # Currently, let's just use their latest releases (when `torch` is installed with a release version)
-RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDA
+RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA

 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate

--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -100,8 +100,6 @@
      title: Distributed inference
    - local: perf_infer_cpu
      title: CPU
-    - local: tf_xla
-      title: XLA
    title: Optimization
  - local: agents
    title: Agents
@ -141,8 +139,6 @@
      title: GPU
    - local: perf_train_cpu
      title: CPU
-    - local: perf_train_tpu_tf
-      title: TPU
    - local: perf_train_special
      title: Apple Silicon
    - local: perf_train_gaudi
@ -433,6 +429,8 @@
        title: DiffLlama
      - local: model_doc/distilbert
        title: DistilBERT
+      - local: model_doc/doge
+        title: Doge
      - local: model_doc/dots1
        title: dots1
      - local: model_doc/dpr
@ -519,6 +517,8 @@
        title: Jukebox
      - local: model_doc/led
        title: LED
+      - local: model_doc/lfm2
+        title: LFM2
      - local: model_doc/llama
        title: LLaMA
      - local: model_doc/llama2
@ -563,6 +563,8 @@
        title: MobileBERT
      - local: model_doc/modernbert
        title: ModernBert
+      - local: model_doc/modernbert-decoder
+        title: ModernBERTDecoder
      - local: model_doc/mpnet
        title: MPNet
      - local: model_doc/mpt
@ -693,6 +695,8 @@
        title: Zamba2
      title: Text models
    - sections:
+      - local: model_doc/aimv2
+        title: Aimv2
      - local: model_doc/beit
        title: BEiT
      - local: model_doc/bit
@ -709,6 +713,8 @@
        title: D-FINE
      - local: model_doc/dab-detr
        title: DAB-DETR
+      - local: model_doc/deepseek_v2
+        title: DeepSeek-V2
      - local: model_doc/deformable_detr
        title: Deformable DETR
      - local: model_doc/deit
@ -1035,6 +1041,8 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
+      - local: model_doc/perception_lm
+        title: PerceptionLM
      - local: model_doc/phi4_multimodal
        title: Phi4 Multimodal
      - local: model_doc/pix2struct
@ -1144,4 +1152,3 @@
      title: Environment Variables
    title: Reference
  title: API
-
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -14,5 +14,9 @@ rendered properly in your Markdown viewer.

 -->

+# Agents
+
+(deprecated)
+
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -99,8 +99,6 @@ self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_stat

 2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.

-3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
-
 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

 ```py
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,10 +25,7 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers CLI
-
-
-### Interactive chat session
+## chat CLI

 After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

@ -52,68 +49,7 @@ For a full list of options, run the command below.
 transformers chat -h
 ```

-The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
-
-
-### Serving a model and using MCP tools
-
-> [!WARNING]
-> This section is experimental and subject to changes in future versions
-
-Powering the `chat` interface, we have a server that takes user messages and returns completions. The server has a chat completion API compatible with the OpenAI SDK, so you can also quickly experiment with `transformers` models on existing aplications. To launch a server separately, use the `transformers serve` CLI:
-
-```bash
-transformers serve Menlo/Jan-nano
-```
-
-Under the hood, the `chat` CLI launches and uses `transformers serve`. This server is also an MCP client, which can receive information available MCP servers (i.e. tools), massage their information into the model prompt, and prepare calls to these tools when the model commands to do so. Naturally, this requires a model that is trained to use tools.
-
-At the moment, MCP tool usage in `transformers` has the following constraints:
- `chat` can't handle tools, but the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI can;
- Only the `qwen` family of models is supported.
-
-The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
-
-> [!TIP]
-> Many Hugging Face Spaces can be used as MCP servers. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
-
-```json
-{
-    "model": "http://localhost:8000",
-    "provider": "local",
-    "servers": [
-        {
-            "type": "sse",
-            "config": {
-                "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
-            }
-        }
-    ]
-}
-```
-
-You can then launch your `tiny-agents` chat interface with the following command.
-
-```bash
-tiny-agents run path/to/your/config.json
-```
-
-If you have a server (from `transformers serve`) running in the background, you're ready to use MCP tools from a local model! For instance, here's the example of a chat session:
-
-```bash
-Agent loaded with 1 tools:
- • flux1_schnell_infer
-»  Generate an image of a cat on the moon
-<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
-
-Tool req_0_tool_call
-[Binary Content: Image image/webp, 57732 bytes]
-The task is complete and the content accessible to the User
-Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
-380576952
-
-I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
-```
+The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).


 ## TextGenerationPipeline
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -44,7 +44,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

 model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
@ -59,7 +59,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

 past_key_values = DynamicCache()
@ -142,13 +142,14 @@ Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [
 For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.

 ```py
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

-out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"axis-key": 1, "axis-value": 1, "backend": "hqq"})
+out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "HQQ"})
 print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
 I like rock music because it's loud and energetic. It's a great way to express myself and rel
 ```
@ -159,13 +160,14 @@ I like rock music because it's loud and energetic. It's a great way to express m
 For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.

 ```py
+import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

-out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "axis-key": 0, "axis-value": 0, "backend": "quanto"})
+out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
 print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
 I like rock music because it's loud and energetic. It's a great way to express myself and rel
 ```
@ -207,14 +209,14 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
-model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map={"": 0})
 inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

 out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
 tokenizer.batch_decode(out, skip_special_tokens=True)[0]
 "Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
 ```
-Cache offloading requires a CUDA GPU.
+Cache offloading requires a CUDA GPU or Intel XPU.

 ### Sliding window cache

@ -227,7 +229,7 @@ import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM

 tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
-model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
+model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto")
 inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)

 out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
@ -306,15 +308,15 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache

 model_id = "meta-llama/Llama-2-7b-chat-hf"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"": 0})
 tokenizer = AutoTokenizer.from_pretrained(model_id)

 # Init StaticCache with big enough max-length (1024 tokens for the below example)
 # You can also init a DynamicCache, if that suits you better
-prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
+prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=model.device.type, dtype=torch.bfloat16)

 INITIAL_PROMPT = "You are a helpful assistant. "
-inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
+inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(model.device.type)
 # This is the common prompt cached, we need to run forward without grad to be able to copy
 with torch.no_grad():
     prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
@ -322,7 +324,7 @@ with torch.no_grad():
 prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
 responses = []
 for prompt in prompts:
-    new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
+    new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(model.device.type)
    past_key_values = copy.deepcopy(prompt_cache)
    outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
    response = tokenizer.batch_decode(outputs)[0]
--- a/docs/source/en/model_doc/aimv2.md
+++ b/docs/source/en/model_doc/aimv2.md
@ -0,0 +1,104 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# AIMv2
+
+## Overview
+
+The AIMv2 model was proposed in [Multimodal Autoregressive Pre-training of Large Vision Encoders](https://arxiv.org/abs/2411.14402) by Enrico Fini, Mustafa Shukor, Xiujun Li, Philipp Dufter, Michal Klein, David Haldimann, Sai Aitharaju, Victor Guilherme Turrisi da Costa, Louis Béthune, Zhe Gan, Alexander T Toshev, Marcin Eichner, Moin Nabi, Yinfei Yang, Joshua M. Susskind, Alaaeldin El-Nouby.
+
+The abstract from the paper is the following:
+
+*We introduce a novel method for pre-training of large-scale vision encoders. Building on recent advancements in autoregressive pre-training of vision models, we extend this framework to a multimodal setting, i.e., images and text. In this paper, we present AIMV2, a family of generalist vision encoders characterized by a straightforward pre-training process, scalability, and remarkable performance across a range of downstream tasks. This is achieved by pairing the vision encoder with a multimodal decoder that autoregressively generates raw image patches and text tokens. Our encoders excel not only in multimodal evaluations but also in vision benchmarks such as localization, grounding, and classification. Notably, our AIMV2-3B encoder achieves 89.5% accuracy on ImageNet-1k with a frozen trunk. Furthermore, AIMV2 consistently outperforms state-of-the-art contrastive models (e.g., CLIP, SigLIP) in multimodal image understanding across diverse settings.*
+
+
+This model was contributed by [Yaswanth Gali](https://huggingface.co/yaswanthgali).
+The original code can be found [here](https://github.com/apple/ml-aim).
+
+## Usage Example
+
+Here is an example of Image Feature Extraction using specific checkpoints on resized images and native resolution images:
+
+```python
+import requests
+from PIL import Image
+from transformers import AutoImageProcessor, AutoModel
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained("apple/aimv2-large-patch14-native")
+model = AutoModel.from_pretrained("apple/aimv2-large-patch14-native")
+
+inputs = processor(images=image, return_tensors="pt")
+outputs = model(**inputs)
+```
+
+Here is an example of a checkpoint performing zero-shot classification:
+
+```python
+import requests
+from PIL import Image
+from transformers import AutoProcessor, AutoModel
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+text = ["Picture of a dog.", "Picture of a cat.", "Picture of a horse."]
+
+processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")
+model = AutoModel.from_pretrained("apple/aimv2-large-patch14-224-lit")
+
+inputs = processor(
+    images=image,
+    text=text,
+    add_special_tokens=True,
+    truncation=True,
+    padding=True,
+    return_tensors="pt",
+)
+outputs = model(**inputs)
+probs = outputs.logits_per_image.softmax(dim=-1)
+```
+
+## Aimv2Config
+
+[[autodoc]] Aimv2Config
+
+## Aimv2TextConfig
+
+[[autodoc]] Aimv2TextConfig
+
+## Aimv2VisionConfig
+
+[[autodoc]] Aimv2VisionConfig
+
+## Aimv2Model
+
+[[autodoc]] Aimv2Model
+    - forward
+
+## Aimv2VisionModel
+
+[[autodoc]] Aimv2VisionModel
+    - forward
+
+## Aimv2TextModel
+
+[[autodoc]] Aimv2TextModel
+    - forward
+
+</pt>
+<tf>
--- a/docs/source/en/model_doc/camembert.md
+++ b/docs/source/en/model_doc/camembert.md
@ -14,49 +14,105 @@ rendered properly in your Markdown viewer.

 -->

-# CamemBERT
-
+<div style="float: right;">
 	<div class="flex flex-wrap space-x-1">
 		<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 		<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 	</div>
+</div>

-## Overview
+# CamemBERT

-The CamemBERT model was proposed in [CamemBERT: a Tasty French Language Model](https://huggingface.co/papers/1911.03894) by
-[Louis Martin](https://huggingface.co/louismartin), [Benjamin Muller](https://huggingface.co/benjamin-mlr), [Pedro Javier Ortiz Suárez](https://huggingface.co/pjox), Yoann Dupont, Laurent Romary, Éric Villemonte de la
-Clergerie, [Djamé Seddah](https://huggingface.co/Djame), and [Benoît Sagot](https://huggingface.co/sagot). It is based on Facebook's RoBERTa model released in 2019. It is a model
-trained on 138GB of French text.
+[CamemBERT](https://huggingface.co/papers/1911.03894) is a language model based on [RoBERTa](./roberta), but trained specifically on French text from the OSCAR dataset, making it more effective for French language tasks.

-The abstract from the paper is the following:
+What sets CamemBERT apart is that it learned from a huge, high quality collection of French data, as opposed to mixing lots of languages. This helps it really understand French better than many multilingual models.

-*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
-models have either been trained on English data or on the concatenation of data in multiple languages. This makes
-practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
-we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
-performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
-dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
-for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
-downstream applications for French NLP.*
+Common applications of CamemBERT include masked language modeling (Fill-mask prediction), text classification (sentiment analysis), token classification (entity recognition) and sentence pair classification (entailment tasks).

-This model was contributed by [the ALMAnaCH team (Inria)](https://huggingface.co/almanach). The original code can be found [here](https://camembert-model.fr/).
+You can find all the original CamemBERT checkpoints under the [ALMAnaCH](https://huggingface.co/almanach/models?search=camembert) organization.

-<Tip>
+> [!TIP]
+> This model was contributed by the [ALMAnaCH (Inria)](https://huggingface.co/almanach) team.
+>
+> Click on the CamemBERT models in the right sidebar for more examples of how to apply CamemBERT to different NLP tasks.

-This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well 
-as the information relative to the inputs and outputs.
+The examples below demonstrate how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

-</Tip>
+<hfoptions id="usage">

-## Resources
+<hfoption id="Pipeline">

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline("fill-mask", model="camembert-base", torch_dtype=torch.float16, device=0)
+pipeline("Le camembert est un délicieux fromage <mask>.")
+```
+</hfoption> 
+
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+tokenizer = AutoTokenizer.from_pretrained("camembert-base")
+model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+</hfoption> 
+
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Le camembert est un délicieux fromage <mask>." | transformers run --task fill-mask --model camembert-base --device 0
+```
+
+</hfoption> 
+
+</hfoptions> 
+
+
+Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits.
+  
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
+import torch
+
+quant_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForMaskedLM.from_pretrained(
+    "almanach/camembert-large",
+    quantization_config=quant_config,
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
+
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```

 ## CamembertConfig

@ -138,4 +194,3 @@ as the information relative to the inputs and outputs.

 </tf>
 </frameworkcontent>
-
--- a/docs/source/en/model_doc/deepseek_v2.md
+++ b/docs/source/en/model_doc/deepseek_v2.md
@ -0,0 +1,49 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# DeepSeek-V2
+
+## Overview
+
+The DeepSeek-V2 model was proposed in [DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model](https://arxiv.org/abs/2405.04434) by DeepSeek-AI Team.
+
+The abstract from the paper is the following:
+We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference through significantly compressing the Key-Value (KV) cache into a latent vector, while DeepSeekMoE enables training strong models at an economical cost through sparse computation. Compared with DeepSeek 67B, DeepSeek-V2 achieves significantly stronger performance, and meanwhile saves 42.5% of training costs, reduces the KV cache by 93.3%, and boosts the maximum generation throughput to 5.76 times. We pretrain DeepSeek-V2 on a high-quality and multi-source corpus consisting of 8.1T tokens, and further perform Supervised Fine-Tuning (SFT) and Reinforcement Learning (RL) to fully unlock its potential. Evaluation results show that, even with only 21B activated parameters, DeepSeek-V2 and its chat versions still achieve top-tier performance among open-source models.
+
+This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber).
+The original code can be found [here](https://huggingface.co/deepseek-ai/DeepSeek-V2).
+
+### Usage tips
+The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages.
+
+## DeepseekV2Config
+
+[[autodoc]] DeepseekV2Config
+
+## DeepseekV2Model
+
+[[autodoc]] DeepseekV2Model
+    - forward
+
+## DeepseekV2ForCausalLM
+
+[[autodoc]] DeepseekV2ForCausalLM
+    - forward
+
+## DeepseekV2ForSequenceClassification
+
+[[autodoc]] DeepseekV2ForSequenceClassification
+    - forward
--- a/docs/source/en/model_doc/dia.md
+++ b/docs/source/en/model_doc/dia.md
@ -44,7 +44,7 @@ tokens and decodes them back into audio.
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 text = ["[S1] Dia is an open weights text to dialogue model."]
 processor = AutoProcessor.from_pretrained(model_checkpoint)
@ -66,7 +66,7 @@ from datasets import load_dataset, Audio
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
 ds = ds.cast_column("audio", Audio(sampling_rate=44100))
@ -93,7 +93,7 @@ from datasets import load_dataset, Audio
 from transformers import AutoProcessor, DiaForConditionalGeneration

 torch_device = "cuda"
-model_checkpoint = "buttercrab/dia-v1-1.6b"
+model_checkpoint = "nari-labs/Dia-1.6B-0626"

 ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
 ds = ds.cast_column("audio", Audio(sampling_rate=44100))
--- a/docs/source/en/model_doc/doge.md
+++ b/docs/source/en/model_doc/doge.md
@ -0,0 +1,103 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Doge
+
+
+## Overview
+
+Doge is a series of small language models based on the [Doge](https://github.com/SmallDoges/small-doge) architecture, aiming to combine the advantages of state-space and self-attention algorithms, calculate dynamic masks from cached value states using the zero-order hold method, and solve the problem of existing mainstream language models getting lost in context. It uses the `wsd_scheduler` scheduler to pre-train on the `smollm-corpus`, and can continue training on new datasets or add sparse activation feedforward networks from stable stage checkpoints.
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/refs%2Fpr%2F426/transformers/model_doc/doge_architecture.png" alt="drawing" width="600"/>
+
+As shown in the figure below, the sequence transformation part of the Doge architecture uses `Dynamic Mask Attention`, which can be understood as using self-attention related to value states during training, and using state-space without past state decay during inference, to solve the problem of existing Transformers or SSMs getting lost in long text. The state transformation part of Doge uses `Cross Domain Mixture of Experts`, which consists of dense linear layers and sparse embedding layers, and can additionally increase sparse parameters to continue training from dense weight checkpoints without retraining the entire model, thereby reducing the cost of continuous iteration of the model. In addition, Doge also uses `RMSNorm` and `Residual` with learnable parameters to adapt the gradient range of deep models.
+
+Checkout all Doge model checkpoints [here](https://huggingface.co/collections/SmallDoge/doge-slm-679cc991f027c4a3abbded4a).
+
+
+## Usage
+
+<details>
+<summary>Using Doge-Base for text generation</summary>
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M")
+model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M")
+inputs = tokenizer("Hey how are you doing?", return_tensors="pt")
+
+outputs = model.generate(**inputs, max_new_tokens=100)
+print(tokenizer.batch_decode(outputs))
+```
+</details>
+
+<details>
+<summary>Using Doge-Instruct for question answering</summary>
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig, TextStreamer
+
+tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-20M-Instruct")
+model = AutoModelForCausalLM.from_pretrained("SmallDoge/Doge-20M-Instruct")
+
+generation_config = GenerationConfig(
+      max_new_tokens=100, 
+      use_cache=True, 
+      do_sample=True, 
+      temperature=0.8, 
+      top_p=0.9,
+      repetition_penalty=1.0
+)
+steamer = TextStreamer(tokenizer=tokenizer, skip_prompt=True)
+
+prompt = "Hi, how are you doing today?"
+conversation = [
+      {"role": "user", "content": prompt}
+]
+inputs = tokenizer.apply_chat_template(
+    conversation=conversation,
+    tokenize=True,
+    return_tensors="pt",
+)
+
+outputs = model.generate(
+    inputs, 
+    tokenizer=tokenizer,
+    generation_config=generation_config, 
+    streamer=steamer
+)
+```
+</details>
+
+## DogeConfig
+
+[[autodoc]] DogeConfig
+
+## DogeModel
+
+[[autodoc]] DogeModel
+    - forward
+
+## DogeForCausalLM
+
+[[autodoc]] DogeForCausalLM
+    - forward
+
+## DogeForSequenceClassification
+
+[[autodoc]] DogeForSequenceClassification
+    - forward
--- a/docs/source/en/model_doc/encoder-decoder.md
+++ b/docs/source/en/model_doc/encoder-decoder.md
@ -14,8 +14,7 @@ rendered properly in your Markdown viewer.

 -->

-# Encoder Decoder Models
-
+<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
@ -23,106 +22,80 @@ rendered properly in your Markdown viewer.
        ">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
+</div>

-## Overview
+# Encoder Decoder Models

-The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any
-pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.
+[`EncoderDecoderModel`](https://huggingface.co/papers/1706.03762) initializes a sequence-to-sequence model with any pretrained autoencoder and pretrained autoregressive model. It is effective for sequence generation tasks as demonstrated in [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) which uses [`BertModel`] as the encoder and decoder.

-The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks
-was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://huggingface.co/papers/1907.12461) by
-Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+> [!TIP]
+> This model was contributed by [thomwolf](https://huggingface.co/thomwolf) and the TensorFlow/Flax version by [ydshieh](https://huggingface.co/ydshieh).
+>
+> Click on the Encoder Decoder models in the right sidebar for more examples of how to apply Encoder Decoder to different language tasks.

-After such an [`EncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like
-any other models (see the examples for more information).
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

-An application of this architecture could be to leverage two pretrained [`BertModel`] as the encoder
-and decoder for a summarization model as was shown in: [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) by Yang Liu and Mirella Lapata.
-
-## Randomly initializing `EncoderDecoderModel` from model configurations.
-
-[`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`BertModel`] configuration for the encoder and the default [`BertForCausalLM`] configuration for the decoder.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
->>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+from transformers import pipeline

->>> config_encoder = BertConfig()
->>> config_decoder = BertConfig()
+summarizer = pipeline(
+    "summarization",
+    model="patrickvonplaten/bert2bert-cnn_dailymail-fp16",
+    device=0
+)

->>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
->>> model = EncoderDecoderModel(config=config)
+text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
+print(summarizer(text))
 ```

-## Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
-
-[`EncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained auto-encoding model, *e.g.* BERT, can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
-Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
-Initializing [`EncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
-To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_encoder_decoder_pretrained`] method.
+</hfoption>
+<hfoption id="AutoModel">

 ```python
->>> from transformers import EncoderDecoderModel, BertTokenizer
+import torch  
+from transformers import AutoModelForCausalLM, AutoTokenizer  

->>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
->>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+model = AutoModelForCausalLM.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", torch_dtype=torch.bfloat16, device_map="auto",attn_implementation="sdpa")  
+
+text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
+
+inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
+
+summary = model.generate(**inputs, max_length=60, num_beams=4, early_stopping=True)
+print(tokenizer.decode(summary[0], skip_special_tokens=True))
 ```

-## Loading an existing `EncoderDecoderModel` checkpoint and perform inference.
+</hfoption>
+<hfoption id="transformers CLI">

-To load fine-tuned checkpoints of the `EncoderDecoderModel` class, [`EncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
+```bash
+echo -e "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen." | transformers-cli run --task summarization --model "patrickvonplaten/bert2bert-cnn_dailymail-fp16" --device 0
+```

-To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- [`EncoderDecoderModel`] can be initialized using any pretrained encoder and decoder. But depending on the decoder architecture, the cross-attention layers may be randomly initialized.
+
+These models require downstream fine-tuning, as discussed in this [blog post](https://huggingface.co/blog/warm-starting-encoder-decoder). Use [`~EncoderDecoderModel.from_encoder_decoder_pretrained`] to combine encoder and decoder checkpoints.

 ```python
->>> from transformers import AutoTokenizer, EncoderDecoderModel
+from transformers import EncoderDecoderModel, BertTokenizer

->>> # load a fine-tuned seq2seq model and corresponding tokenizer
->>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
->>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
-
->>> # let's perform inference on a long piece of text
->>> ARTICLE_TO_SUMMARIZE = (
-...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
-...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
-...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
-... )
->>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids
-
->>> # autoregressively generate summary (uses greedy decoding by default)
->>> generated_ids = model.generate(input_ids)
->>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
->>> print(generated_text)
-nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow.
+tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+    "google-bert/bert-base-uncased", 
+    "google-bert/bert-base-uncased"
+)
 ```

-## Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.
-
-[`TFEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
-pytorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only pytorch
-checkpoints for a particular encoder-decoder model, a workaround is:
-
-```python
->>> # a workaround to load from pytorch checkpoint
->>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel
-
->>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
-
->>> _model.encoder.save_pretrained("./encoder")
->>> _model.decoder.save_pretrained("./decoder")
-
->>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
-... )
->>> # This is only for copying some specific attributes of this particular model.
->>> model.config = _model.config
-```
-
-## Training
-
-Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model.
-As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
-`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
-target sequence).
+- Encoder Decoder models can be fine-tuned like BART, T5 or any other encoder-decoder model. Only 2 inputs are required to compute a loss, `input_ids` and `labels`. Refer to this [notebook](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for a more detailed training example.

 ```python
 >>> from transformers import BertTokenizer, EncoderDecoderModel
@ -147,11 +120,42 @@ target sequence).
 >>> loss = model(input_ids=input_ids, labels=labels).loss
 ```

-Detailed [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for training.
+- [`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config as shown below.

-This model was contributed by [thomwolf](https://github.com/thomwolf). This model's TensorFlow and Flax versions
-were contributed by [ydshieh](https://github.com/ydshieh).
+```python
+>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

+>>> config_encoder = BertConfig()
+>>> config_decoder = BertConfig()
+
+>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+>>> model = EncoderDecoderModel(config=config)
+```
+
+- The Encoder Decoder Model can also be used for translation as shown below.
+
+```python
+from transformers import AutoTokenizer, EncoderDecoderModel  
+
+# Load a pre-trained translation model  
+model_name = "google/bert2bert_L-24_wmt_en_de" 
+tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token="<pad>", eos_token="</s>", bos_token="<s>")  
+model = EncoderDecoderModel.from_pretrained(model_name)  
+
+# Input sentence to translate  
+input_text = "Plants create energy through a process known as"  
+
+# Encode the input text  
+inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).input_ids  
+
+# Generate the translated output  
+outputs = model.generate(inputs)[0]  
+
+# Decode the output tokens to get the translated sentence  
+translated_text = tokenizer.decode(outputs, skip_special_tokens=True)  
+
+print("Translated text:", translated_text)  
+```

 ## EncoderDecoderConfig

--- a/docs/source/en/model_doc/eomt.md
+++ b/docs/source/en/model_doc/eomt.md
@ -74,20 +74,16 @@ inputs = processor(
    return_tensors="pt",
 )

-# Remove Patch Offsets from inputs — only used later for post-processing.
-patch_offsets = inputs.pop("patch_offsets")
-
 with torch.inference_mode():
    outputs = model(**inputs)

 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]

 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_semantic_segmentation(
    outputs,
-    patch_offsets=patch_offsets,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )

 # Visualize the segmentation mask
@ -130,12 +126,12 @@ with torch.inference_mode():
    outputs = model(**inputs)

 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]

 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_instance_segmentation(
    outputs,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )

 # Visualize the segmentation mask
@ -173,12 +169,12 @@ with torch.inference_mode():
    outputs = model(**inputs)

 # Prepare the original image size in the format (height, width)
-original_image_sizes = [(image.height, image.width)]
+target_sizes = [(image.height, image.width)]

 # Post-process the model outputs to get final segmentation prediction
 preds = processor.post_process_panoptic_segmentation(
    outputs,
-    original_image_sizes=original_image_sizes,
+    target_sizes=target_sizes,
 )

 # Visualize the panoptic segmentation mask
--- a/docs/source/en/model_doc/gemma3n.md
+++ b/docs/source/en/model_doc/gemma3n.md
@ -29,7 +29,7 @@ rendered properly in your Markdown viewer.
 Gemma3n is a multimodal model with pretrained and instruction-tuned variants, available in E4B and E2B sizes. While
 large portions of the language model architecture are shared with prior Gemma releases, there are many new additions in
 this model, including [Alternating Updates][altup] (AltUp), [Learned Augmented Residual Layer][laurel] (LAuReL),
-[MatFormer][matformer], Per-Layer Embeddings (PLE), activation sparsity, and KV cache sharing. The language model uses
+[MatFormer][matformer], Per-Layer Embeddings (PLE), [Activation Sparsity with Statistical Top-k][spark-transformer], and KV cache sharing. The language model uses
 a similar attention pattern to [Gemma 3](./gemma3.md) with alternating 4 local sliding window self-attention layers for
 every global self-attention layer with a maximum context length of 32k tokens. Gemma 3n introduces
 [MobileNet v5][mobilenetv5] as the vision encoder, using a default resolution of 768x768 pixels, and adds a newly
@ -121,7 +121,7 @@ echo -e "Plants create energy through a process known as" | transformers run --t
 ## Notes

 -   Use [`Gemma3nForConditionalGeneration`] for image-audio-and-text, image-and-text, image-and-audio, audio-and-text,
-    image-only and aduio-only inputs.
+    image-only and audio-only inputs.
 -   Gemma 3n supports multiple images per input, but make sure the images are correctly batched before passing them to
    the processor. Each batch should be a list of one or more images.

@ -201,4 +201,5 @@ echo -e "Plants create energy through a process known as" | transformers run --t
 [gemma3n-collection]: https://huggingface.co/collections/google/gemma-3n
 [laurel]: https://arxiv.org/abs/2411.07501
 [matformer]: https://arxiv.org/abs/2310.07707
+[spark-transformer]: https://arxiv.org/abs/2506.06644
 [usm]: https://arxiv.org/abs/2303.01037
--- a/docs/source/en/model_doc/glm4.md
+++ b/docs/source/en/model_doc/glm4.md
@ -18,7 +18,37 @@ rendered properly in your Markdown viewer.

 ## Overview

-To be released with the official model launch.
+The GLM family welcomes new members [GLM-4-0414](https://arxiv.org/pdf/2406.12793) series models.
+
+The **GLM-4-32B-0414** series models, featuring 32 billion parameters. Its performance is comparable to OpenAI’s GPT
+series and DeepSeek’s V3/R1 series. It also supports very user-friendly local deployment features. GLM-4-32B-Base-0414
+was pre-trained on 15T of high-quality data, including substantial reasoning-type synthetic data. This lays the
+foundation for subsequent reinforcement learning extensions. In the post-training stage, we employed human preference
+alignment for dialogue scenarios. Additionally, using techniques like rejection sampling and reinforcement learning, we
+enhanced the model’s performance in instruction following, engineering code, and function calling, thus strengthening
+the atomic capabilities required for agent tasks. GLM-4-32B-0414 achieves good results in engineering code, Artifact
+generation, function calling, search-based Q&A, and report generation. In particular, on several benchmarks, such as
+code generation or specific Q&A tasks, GLM-4-32B-Base-0414 achieves comparable performance with those larger models like
+GPT-4o and DeepSeek-V3-0324 (671B).
+
+**GLM-Z1-32B-0414** is a reasoning model with deep thinking capabilities. This was developed based on GLM-4-32B-0414
+through cold start, extended reinforcement learning, and further training on tasks including mathematics, code, and
+logic. Compared to the base model, GLM-Z1-32B-0414 significantly improves mathematical abilities and the capability to
+solve complex tasks. During training, we also introduced general reinforcement learning based on pairwise ranking
+feedback, which enhances the model's general capabilities.
+
+**GLM-Z1-Rumination-32B-0414** is a deep reasoning model with rumination capabilities (against OpenAI's Deep Research).
+Unlike typical deep thinking models, the rumination model is capable of deeper and longer thinking to solve more
+open-ended and complex problems (e.g., writing a comparative analysis of AI development in two cities and their future
+development plans). Z1-Rumination is trained through scaling end-to-end reinforcement learning with responses graded by
+the ground truth answers or rubrics and can make use of search tools during its deep thinking process to handle complex
+tasks. The model shows significant improvements in research-style writing and complex tasks.
+
+Finally, **GLM-Z1-9B-0414** is a surprise. We employed all the aforementioned techniques to train a small model (9B).
+GLM-Z1-9B-0414 exhibits excellent capabilities in mathematical reasoning and general tasks. Its overall performance is
+top-ranked among all open-source models of the same size. Especially in resource-constrained scenarios, this model
+achieves an excellent balance between efficiency and effectiveness, providing a powerful option for users seeking
+lightweight deployment.

 ## Glm4Config

--- a/docs/source/en/model_doc/glm4v.md
+++ b/docs/source/en/model_doc/glm4v.md
@ -23,6 +23,29 @@ rendered properly in your Markdown viewer.

 # GLM-4.1V

+## Overview
+
+**GLM-4.1V-9B-Thinking** is a bilingual vision-language model optimized for reasoning, built on GLM-4-9B. It introduces
+a "thinking paradigm" with reinforcement learning, achieving state-of-the-art results among 10B-class models and
+rivaling 72B-scale models. It supports 64k context, 4K resolution, and arbitrary aspect ratios, with an open-source base
+model for further research. You can check our paper [here](https://huggingface.co/papers/2507.01006). and below is a abstract.
+
+*We present GLM-4.1V-Thinking, a vision-language model (VLM) designed to advance general-purpose multimodal understanding
+and reasoning. In this report, we share our key findings in the development of the reasoning-centric training framework.
+We first develop a capable vision foundation model with significant potential through large-scale pre-training, which
+arguably sets the upper bound for the final performance. We then propose Reinforcement Learning with Curriculum
+Sampling (RLCS) to unlock the full potential of the model, leading to comprehensive capability enhancement across a
+diverse range of tasks, including STEM problem solving, video understanding, content recognition, coding, grounding,
+GUI-based agents, and long document understanding. We open-source GLM-4.1V-9B-Thinking, which achieves state-of-the-art
+performance among models of comparable size. In a comprehensive evaluation across 28 public benchmarks, our model
+outperforms Qwen2.5-VL-7B on nearly all tasks and achieves comparable or even superior performance on 18 benchmarks
+relative to the significantly larger Qwen2.5-VL-72B. Notably, GLM-4.1V-9B-Thinking also demonstrates competitive or
+superior performance compared to closed-source models such as GPT-4o on challenging tasks including long document
+understanding and STEM reasoning, further underscoring its strong capabilities. Code, models and more information
+are released at https://github.com/THUDM/GLM-4.1V-Thinking.*
+
+## Usage
+
 The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

 <hfoptions id="usage">
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -57,7 +57,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

-input_ids = tokenzier("Hello, I'm a language model". return_tensors="pt").to("cuda")
+input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")

 output = model.generate(**input_ids, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/model_doc/led.md
+++ b/docs/source/en/model_doc/led.md
@ -14,62 +14,135 @@ rendered properly in your Markdown viewer.

 -->

-# LED
-
+<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
            <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
    </div>
+</div>

-## Overview
+# LED

-The LED model was proposed in [Longformer: The Long-Document Transformer](https://huggingface.co/papers/2004.05150) by Iz
-Beltagy, Matthew E. Peters, Arman Cohan.
+[Longformer-Encoder-Decoder (LED)](https://huggingface.co/papers/2004.05150) is an encoder-decoder  transformer model for sequence-to-sequence tasks like summarization. It extends [Longformer](.longformer), an encoder-only model designed to handle long inputs, by adding a decoder layer. The decoder uses full self-attention on the encoded tokens and previously decoded locations. Because of Longformer's linear self-attention mechanism, LED is more efficient than standard encoder-decoder models when processing long sequences.

-The abstract from the paper is the following:
+You can find all the original [LED] checkpoints under the [Ai2](https://huggingface.co/allenai/models?search=led) organization.

-*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
-quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
-mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
-longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
-windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
-evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
-contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
-pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
-WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting
-long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization
-dataset.*
+> [!TIP]
+> This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+>
+> Click on the LED models in the right sidebar for more examples of how to apply LED to different language tasks.

-## Usage tips
+The example below demonstrates how to summarize text with [`Pipeline`], [`AutoModel`], and from the command line.

- [`LEDForConditionalGeneration`] is an extension of
-  [`BartForConditionalGeneration`] exchanging the traditional *self-attention* layer with
-  *Longformer*'s *chunked self-attention* layer. [`LEDTokenizer`] is an alias of
-  [`BartTokenizer`].
- LED works very well on long-range *sequence-to-sequence* tasks where the `input_ids` largely exceed a length of
-  1024 tokens.
- LED pads the `input_ids` to be a multiple of `config.attention_window` if required. Therefore a small speed-up is
-  gained, when [`LEDTokenizer`] is used with the `pad_to_multiple_of` argument.
- LED makes use of *global attention* by means of the `global_attention_mask` (see
-  [`LongformerModel`]). For summarization, it is advised to put *global attention* only on the first
-  `<s>` token. For question answering, it is advised to put *global attention* on all tokens of the question.
- To fine-tune LED on all 16384, *gradient checkpointing* can be enabled in case training leads to out-of-memory (OOM)
-  errors. This can be done by executing `model.gradient_checkpointing_enable()`. 
- Moreover, the `use_cache=False`
-  flag can be used to disable the caching mechanism to save memory.
- LED is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
-  the left.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-This model was contributed by [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline(
+    task="summarization",
+    model="allenai/led-base-16384",
+    torch_dtype=torch.float16,
+    device=0
+)
+pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle.""")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "allenai/led-base-16384"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "allenai/led-base-16384",
+    torch_dtype=torch.float16,
+    device_map="auto"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# Place global attention on the first token
+global_attention_mask = torch.zeros_like(input_ids.input_ids).to("cuda")
+global_attention_mask[:, 0] = 1
+
+output = model.generate(**input_ids, global_attention_mask=global_attention_mask, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+</hfoption>
+<hfoption id="transformers-cli">
+
+```bash
+!echo -e "Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet. Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts." | transformers-cli run --task summarization --model allenai/led-base-16384 --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForSeq2SeqLM, AutoTokenizer
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_quant_type="nf4"
+)
+model = AutoModelForSeq2SeqLM.from_pretrained(
+    "allenai/led-large-16384",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "allenai/led-large-16384"
+)
+
+input_text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
+Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
+These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
+This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+# Place global attention on the first token
+global_attention_mask = torch.zeros_like(input_ids.input_ids).to("cuda")
+global_attention_mask[:, 0] = 1
+
+output = model.generate(**input_ids, global_attention_mask=global_attention_mask, cache_implementation="static")
+print(tokenizer.decode(output[0], skip_special_tokens=True))
+```
+
+## Notes
+
+- [`LEDForConditionalGeneration`] is an extension of [`BartForConditionalGeneration`] exchanging the traditional self-attention layer with Longformer's chunked self-attention layer. [`LEDTokenizer`] is an alias of [`BartTokenizer`].
+- LED pads the `input_ids` to be a multiple of `config.attention_window` if required. A small speedup is gained when [`LEDTokenizer`] is used with the `pad_to_multiple_of` argument.
+- LED works best on long-range sequence-to-sequence tasks where the `input_ids` are significantly longer than 1024 tokens.
+- LED uses global attention by means of the `global_attention_mask` (see [`LongformerModel`]). For summarization, it is advised to put global attention only on the first `<s>` token. For question answering, it is advised to put global attention on all tokens of the question.
+- To fine-tune LED on all 16384 parameters, gradient checkpointing can be enabled in case training leads to out-of-memory (OOM) errors. Enable gradient checkpointing by adding `model.gradient_checkpointing_enable()` and setting `use_cache=False` to disable the caching mechanism to save memory.
+- Inputs should be padded on the right because LED uses absolute position embeddings.

 ## Resources

- [A notebook showing how to evaluate LED](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing).
- [A notebook showing how to fine-tune LED](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing).
- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
+- Read the [LED on Arxiv notebook](https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing) to see how LED can achieve state-of-the-art performance on Arxiv article summarization.
+- Read the [Fine-tune LED notebook](https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing) to learn how to fine-tune LED on PubMed articles.

 ## LEDConfig

--- a/docs/source/en/model_doc/lfm2.md
+++ b/docs/source/en/model_doc/lfm2.md
@ -0,0 +1,84 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+# LFM2
+
+## Overview
+
+[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment. 
+
+The models are available in three sizes (350M, 700M, and 1.2B parameters) and are engineered to run efficiently on CPU, GPU, and NPU hardware, making them particularly well-suited for applications requiring low latency, offline operation, and privacy.
+
+## Architecture
+
+The architecture consists of 16 blocks total: 10 double-gated short-range convolution blocks and 6 blocks of grouped query attention. This design stems from the concept of dynamical systems, where linear operations are modulated by input-dependent gates, allowing for "liquid" dynamics that can adapt in real-time. The short convolutions are particularly optimized for embedded SoC CPUs, making them ideal for devices that require fast, local inference without relying on cloud connectivity.
+
+The key architectural innovation of LFM2 lies in its systematic approach to balancing quality, latency, and memory efficiency through our STAR neural architecture search engine. Using STAR, Liquid AI optimized the models for real-world performance on embedded hardware, measuring actual peak memory usage and inference speed on Qualcomm Snapdragon processors. This results in models that achieve 2x faster decode and prefill performance compared to similar-sized models, while maintaining superior benchmark performance across knowledge, mathematics, instruction following, and multilingual tasks.
+
+## Example
+
+The following example shows how to generate an answer using the `AutoModelForCausalLM` class.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Load model and tokenizer
+model_id = "LiquidAI/LFM2-1.2B"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype="bfloat16",
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Generate answer
+prompt = "What is C. elegans?"
+input_ids = tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}],
+    add_generation_prompt=True,
+    return_tensors="pt",
+    tokenize=True,
+)
+
+output = model.generate(
+    input_ids,
+    do_sample=True,
+    temperature=0.3,
+    min_p=0.15,
+    repetition_penalty=1.05,
+    max_new_tokens=512,
+)
+
+print(tokenizer.decode(output[0], skip_special_tokens=False))
+```
+
+## Lfm2Config
+
+[[autodoc]] Lfm2Config
+
+## Lfm2Model
+
+[[autodoc]] Lfm2Model
+    - forward
+
+## Lfm2ForCausalLM
+
+[[autodoc]] Lfm2ForCausalLM
+    - forward
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -14,153 +14,69 @@ rendered properly in your Markdown viewer.

 -->

-# LLaVA-NeXT
-
+<div style="float: right;">
  <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
  </div>
+</div>

-## Overview
+# LLaVA-NeXT

-The LLaVA-NeXT model was proposed in [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon [LLaVa](llava) by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning.
+[LLaVA‑NeXT](https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/) improves on [Llava](./llava) by increasing the input image resolution by 4x more pixels and supporting 3 aspect ratios (up to 672x672, 336x1344, 1344x336) to better grasp visual details. It is also trained on an improved visual instruction tuning dataset covering more scenarios and applications to improve OCR and common sense reasoning.

-The introduction from the blog is the following:
+You can find all the original LLaVA‑NeXT checkpoints under the [LLaVA-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf) collection.

-*In October 2023, we released LLaVA-1.5 with a simple and efficient design along with great performance on a benchmark suite of 12 datasets. It has since served as the foundation of many comprehensive studies of data, model, and capabilities of large multimodal models (LMM), and has enabled various new applications.
+> [!TIP]
+> This model was contributed by [nielsr](https://huggingface.co/nielsr).
+>
+> Click on the LLaVA‑NeXT models in the right sidebar for more examples of how to apply Llava-NeXT to different multimodal tasks.

-Today, we are thrilled to present LLaVA-NeXT, with improved reasoning, OCR, and world knowledge. LLaVA-NeXT even exceeds Gemini Pro on several benchmarks.
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-Compared with LLaVA-1.5, LLaVA-NeXT has several improvements:
+<hfoptions id="usage">

-Increasing the input image resolution to 4x more pixels. This allows it to grasp more visual details. It supports three aspect ratios, up to 672x672, 336x1344, 1344x336 resolution.
-Better visual reasoning and OCR capability with an improved visual instruction tuning data mixture.
-Better visual conversation for more scenarios, covering different applications. Better world knowledge and logical reasoning.
-Efficient deployment and inference with SGLang.
-Along with performance improvements, LLaVA-NeXT maintains the minimalist design and data efficiency of LLaVA-1.5. It re-uses the pretrained connector of LLaVA-1.5, and still uses less than 1M visual instruction tuning samples. The largest 34B variant finishes training in ~1 day with 32 A100s.*
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_overview.png"
-alt="drawing" width="600"/>
-
-<small> LLaVa-NeXT incorporates a higher input resolution by encoding various patches of the input image. Taken from the <a href="https://huggingface.co/papers/2310.03744">original paper.</a> </small>
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/main).
-
-## Usage tips
-
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
-
-<Tip warning={true}>
-
- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding".
-
-</Tip>
-
-
-> [!NOTE]
-> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you.
-Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings.
-The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches.
-
-
-### Formatting Prompts with Chat Templates  
-
-Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method.  
-
-**Important:**  
- You must construct a conversation history — passing a plain string won't work.  
- Each message should be a dictionary with `"role"` and `"content"` keys.  
- The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`.  
-
-
-Here’s an example of how to structure your input. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image.
+<hfoption id="Pipeline">
+
+```python
+import torch  
+from transformers import pipeline  
+
+pipeline = pipeline(  
+    task="image-text-to-text",  
+    model="llava-hf/llava-v1.6-mistral-7b-hf",  
+    device=0,  
+    torch_dtype=torch.bfloat16  
+)  
+messages = [  
+    {  
+        "role": "user",  
+        "content": [  
+            {  
+                "type": "image",  
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",  
+            },  
+            { "type": "text", "text": "Describe this image."},  
+        ]  
+    }  
+]  
+pipeline(text=messages, max_new_tokens=20, return_full_text=False)
+```
+
+</hfoption>
+
+<hfoption id="AutoModel">

 ```python
-from transformers import LlavaNextProcessor
-
-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What’s shown in this image?"},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
-    },
-    {
-
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "Describe the image in more details."},
-        ],
-    },
-]
-
-text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
-print(text_prompt)
->>> "[INST] <image>\nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
-```
-
- If you want to construct a chat prompt yourself, below is a list of possible formats
-.
-[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
-```bash
-"[INST] <image>\nWhat is shown in this image? [/INST]"
-```
-
-[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
-```bash
-"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
-```
-
-[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
-```bash
-"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
-```
-
-[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format:
-
-```bash
-"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-```
-
-[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:
-
-```bash
-"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
-```
-
-🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it.
-
-
-
-## Usage example
-
-### Single image inference
-
-Here's how to load the model and perform inference in half-precision (`torch.float16`):
-
-```python
-from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
 import torch  
-from PIL import Image
 import requests  
+from PIL import Image  
+from transformers import AutoProcessor, LlavaNextForConditionalGeneration  

-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16).to("cuda")  

-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
-model.to("cuda:0")
-
-# prepare image and text prompt, using the appropriate prompt template
 url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"  
 image = Image.open(requests.get(url, stream=True).raw)  

@ -174,128 +90,103 @@ conversation = [
    },  
 ]  
 prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
-inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")
-
-# autoregressively complete prompt
+inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
 output = model.generate(**inputs, max_new_tokens=100)  
-
 print(processor.decode(output[0], skip_special_tokens=True))  
 ```

-### Multi image inference
+</hfoption>

-LLaVa-Next can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it:
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.

 ```python
+import torch  
 import requests  
 from PIL import Image  
-import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
+from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig  

-# Load the model in half-precision
-model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
-processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-
-# Get three different images
-url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-image_stop = Image.open(requests.get(url, stream=True).raw)
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image_cats = Image.open(requests.get(url, stream=True).raw)
-
-url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
-image_snowman = Image.open(requests.get(url, stream=True).raw)
-
-# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
-conversation_1 = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What is shown in this image?"},
-            ],
-    },
-    {
-        "role": "assistant",
-        "content": [
-            {"type": "text", "text": "There is a red stop sign in the image."},
-            ],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What about this image? How many cats do you see?"},
-            ],
-    },
-]
-
-conversation_2 = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What is shown in this image?"},
-            ],
-    },
-]
-
-prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
-prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
-prompts = [prompt_1, prompt_2]
-
-# We can simply feed images in the order they have to be used in the text prompt
-# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
-inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device)
-
-# Generate
-generate_ids = model.generate(**inputs, max_new_tokens=30)
-processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-```
-
-## Model optimization
-
-### Quantization using Bitsandbytes
-
-The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library.
-
-<Tip>
-
-bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).
-
-We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
-
-</Tip>
-
-Simply change the snippet above with:
-
-```python
-from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
-
-# specify how to quantize the model
-quantization_config = BitsAndBytesConfig(
+quant_config = BitsAndBytesConfig(  
    load_in_4bit=True,  
-    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  
+    bnb_4bit_quant_type="nf4"  
 )  

-model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quant_config, device_map="auto")  
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"  
+image = Image.open(requests.get(url, stream=True).raw)  
+
+conversation = [  
+    {  
+        "role": "user",  
+        "content": [  
+            {"type": "image"},  
+            {"type": "text", "text": "What does this chart show?"},  
+        ],  
+    },  
+]  
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
+inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
+
+with torch.inference_mode():  
+    output = model.generate(**inputs, max_new_tokens=100)  
+print(processor.decode(output[0], skip_special_tokens=True))  
 ```

-### Use Flash-Attention 2 to further speed-up generation

-First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:
+## Notes
+
+* Different checkpoints (Mistral, Vicuna, etc.) require a specific prompt format depending on the underlying LLM. Always use [`~ProcessorMixin.apply_chat_template`] to ensure correct formatting. Refer to the [Templates](../chat_templating) guide for more details.
+
+* Set `padding_side="left"` during batched generation for more accurate results.
+
+```py
+processor.tokenizer.padding_side = "left"
+```
+
+* LLaVA-NeXT uses different numbers of patches for images and pads the inputs inside the modeling code except when padding is done during processing. The default setting is *left-padding* if the model is in `eval()` mode, otherwise it is *right-padding*.
+
+* LLaVA models after v4.46 raises warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}`, and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add these attributes to the processor if you own the model checkpoint or open a PR if it isn't.
+
+  Adding these attributes means LLaVA will try to infer the number of image tokens required per image and expand the text with the same number of `<image>` token placeholders. There are usually ~500 tokens per image, so make sure the text is not truncated because it will cause a failure when merging the embeddings. The attributes can be found in `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`.
+
+  The `num_additional_image_tokens` should be `1` if the vision backbone adds a `CLS` token or `0` if nothing extra is added.
+
+* The example below demonstrates inference with multiple input images.

 ```python
-from transformers import AutoModelForImageTextToText
+from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
+from PIL import Image
+import requests, torch

-model = AutoModelForImageTextToText.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    use_flash_attention_2=True
-).to(0)
+processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+model = LlavaNextForConditionalGeneration.from_pretrained(
+    "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16
+).to("cuda")
+
+# Load multiple images
+url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"
+url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_comparison.png"
+
+image1 = Image.open(requests.get(url1, stream=True).raw)
+image2 = Image.open(requests.get(url2, stream=True).raw)
+
+conversation = [
+    {"role": "user", "content": [{"type": "image"}, {"type": "image"}, {"type": "text", "text": "Compare these two images and describe the differences."}]}
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+inputs = processor([image1, image2], prompt, return_tensors="pt").to("cuda")
+
+output = model.generate(**inputs, max_new_tokens=100)
+print(processor.decode(output[0], skip_special_tokens=True))
 ```

+
 ## LlavaNextConfig

 [[autodoc]] LlavaNextConfig
--- a/docs/source/en/model_doc/mamba.md
+++ b/docs/source/en/model_doc/mamba.md
@ -28,6 +28,7 @@ You can find all the original Mamba checkpoints under the [State Space Models](h


 > [!TIP]
+> This model was contributed by [Molbap](https://huggingface.co/Molbap) and [AntonV](https://huggingface.co/AntonV).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
--- a/docs/source/en/model_doc/mamba2.md
+++ b/docs/source/en/model_doc/mamba2.md
@ -26,6 +26,7 @@ rendered properly in your Markdown viewer.
 You can find all the original Mamba 2 checkpoints under the [State Space Models](https://huggingface.co/state-spaces) organization, but the examples shown below use [mistralai/Mamba-Codestral-7B-v0.1](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1) because a Hugging Face implementation isn't supported yet for the original checkpoints.

 > [!TIP]
+> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
--- a/docs/source/en/model_doc/marian.md
+++ b/docs/source/en/model_doc/marian.md
@ -14,8 +14,7 @@ rendered properly in your Markdown viewer.

 -->

-# MarianMT
-
+<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
@ -24,149 +23,129 @@ rendered properly in your Markdown viewer.
        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
+</div>

-## Overview
-
-A framework for translation models, using the same models as BART. Translations should be similar, but not identical to output in the test set linked to in each model card.
-This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
+# MarianMT


-## Implementation Notes

- Each model is about 298 MB on disk, there are more than 1,000 models.
- The list of supported language pairs can be found [here](https://huggingface.co/Helsinki-NLP).
- Models were originally trained by [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann) using the [Marian](https://marian-nmt.github.io/) C++ library, which supports fast training and translation.
- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented
-  in a model card.
- The 80 opus models that require BPE preprocessing are not supported.
- The modeling code is the same as [`BartForConditionalGeneration`] with a few minor modifications:
+[MarianMT](https://huggingface.co/papers/1804.00344) is a machine translation model trained with the Marian framework which is written in pure C++. The framework includes its own custom auto-differentiation engine and efficient meta-algorithms to train encoder-decoder models like BART.

-  - static (sinusoid) positional embeddings (`MarianConfig.static_position_embeddings=True`)
-  - no layernorm_embedding (`MarianConfig.normalize_embedding=False`)
-  - the model starts generating with `pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
-    `<s/>`),
- Code to bulk convert models can be found in `convert_marian_to_pytorch.py`.
+All MarianMT models are transformer encoder-decoders with 6 layers in each component, use static sinusoidal positional embeddings, don't have a layernorm embedding, and the model starts generating with the prefix `pad_token_id` instead of `<s/>`.


-## Naming

- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`
- The language codes used to name models are inconsistent. Two digit codes can usually be found [here](https://developers.google.com/admin-sdk/directory/v1/languages), three digit codes require googling "language
-  code {code}".
- Codes formatted like `es_AR` are usually `code_{region}`. That one is Spanish from Argentina.
- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, the second
-  group use a combination of ISO-639-5 codes and ISO-639-2 codes.
+You can find all the original MarianMT checkpoints under the [Language Technology Research Group at the University of Helsinki](https://huggingface.co/Helsinki-NLP/models?search=opus-mt) organization.


-## Examples
+> [!TIP]
+> This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
+>
+> Click on the MarianMT models in the right sidebar for more examples of how to apply MarianMT to translation tasks.

- Since Marian models are smaller than many other translation models available in the library, they can be useful for
-  fine-tuning experiments and integration tests.
- [Fine-tune on GPU](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh)

-## Multilingual Models
+The example below demonstrates how to translate text using [`Pipeline`] or the [`AutoModel`] class.

- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`:
- If a model can output multiple languages, and you should specify a language code by prepending the desired output
-  language to the `src_text`.
- You can see a models's supported language codes in its model card, under target constituents, like in [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa).
- Note that if a model is only multilingual on the source side, like `Helsinki-NLP/opus-mt-roa-en`, no language
-  codes are required.
-
-New multi-lingual models from the [Tatoeba-Challenge repo](https://github.com/Helsinki-NLP/Tatoeba-Challenge)
-require 3 character language codes:
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
->>> from transformers import MarianMTModel, MarianTokenizer

->>> src_text = [
-...     ">>fra<< this is a sentence in english that we want to translate to french",
-...     ">>por<< This should go to portuguese",
-...     ">>esp<< And this to Spanish",
-... ]
+import torch
+from transformers import pipeline

->>> model_name = "Helsinki-NLP/opus-mt-en-roa"
->>> tokenizer = MarianTokenizer.from_pretrained(model_name)
->>> print(tokenizer.supported_language_codes)
-['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
+pipeline = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, device=0)
+pipeline("Hello, how are you?")

->>> model = MarianMTModel.from_pretrained(model_name)
->>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
->>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-["c'est une phrase en anglais que nous voulons traduire en français",
- 'Isto deve ir para o português.',
- 'Y esto al español']
 ```

-Here is the code to see all available pretrained models on the hub:
+</hfoption>
+
+<hfoption id="AutoModel">

 ```python
-from huggingface_hub import list_models

-model_list = list_models()
-org = "Helsinki-NLP"
-model_ids = [x.id for x in model_list if x.id.startswith(org)]
-suffix = [x.split("/")[1] for x in model_ids]
-old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, attn_implementation="sdpa", device_map="auto")
+
+inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, cache_implementation="static")
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
 ```

-## Old Style Multi-Lingual Models
+</hfoption>
+</hfoptions>

-These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language
-group:
-
-```python no-style
-['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
- 'Helsinki-NLP/opus-mt-ROMANCE-en',
- 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
- 'Helsinki-NLP/opus-mt-de-ZH',
- 'Helsinki-NLP/opus-mt-en-CELTIC',
- 'Helsinki-NLP/opus-mt-en-ROMANCE',
- 'Helsinki-NLP/opus-mt-es-NORWAY',
- 'Helsinki-NLP/opus-mt-fi-NORWAY',
- 'Helsinki-NLP/opus-mt-fi-ZH',
- 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
- 'Helsinki-NLP/opus-mt-sv-NORWAY',
- 'Helsinki-NLP/opus-mt-sv-ZH']
-GROUP_MEMBERS = {
- 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
- 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
- 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
- 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
- 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
- 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
- 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
-}
-```
-
-Example of translating english to many romance languages, using old-style 2 character language codes

+Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.

 ```python
->>> from transformers import MarianMTModel, MarianTokenizer
+from transformers.utils.attention_visualizer import AttentionMaskVisualizer

->>> src_text = [
-...     ">>fr<< this is a sentence in english that we want to translate to french",
-...     ">>pt<< This should go to portuguese",
-...     ">>es<< And this to Spanish",
-... ]
+visualizer = AttentionMaskVisualizer("Helsinki-NLP/opus-mt-en-de")
+visualizer("Hello, how are you?")
+```
+<div class="flex justify-center">
+   <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/marianmt-attn-mask.png"/>
+</div>

->>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
->>> tokenizer = MarianTokenizer.from_pretrained(model_name)
+## Notes
+
+- MarianMT models are ~298MB on disk and there are more than 1000 models. Check this [list](https://huggingface.co/Helsinki-NLP) for supported language pairs. The language codes may be inconsistent. Two digit codes can be found [here](https://developers.google.com/admin-sdk/directory/v1/languages) while three digit codes may require further searching.
+- Models that require BPE preprocessing are not supported.
+- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`. Language codes formatted like `es_AR` usually refer to the `code_{region}`. For example, `es_AR` refers to Spanish from Argentina.
+- If a model can output multiple languages, prepend the desired output language to `src_txt` as shown below. New multilingual models from the [Tatoeba-Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge) require 3 character language codes.
+
+```python
+
+from transformers import MarianMTModel, MarianTokenizer
+
+# Model trained on multiple source languages → multiple target languages
+# Example: multilingual to Arabic (arb)
+model_name = "Helsinki-NLP/opus-mt-mul-mul"  # Tatoeba Challenge model
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+# Prepend the desired output language code (3-letter ISO 639-3)
+src_texts = ["arb>> Hello, how are you today?"]
+
+# Tokenize and translate
+inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
+translated = model.generate(**inputs)
+
+# Decode and print result
+translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
+print(translated_texts[0])

->>> model = MarianMTModel.from_pretrained(model_name)
->>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
->>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-["c'est une phrase en anglais que nous voulons traduire en français",
- 'Isto deve ir para o português.',
- 'Y esto al español']
 ```
   
-## Resources
+- Older multilingual models use 2 character language codes.

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
- [Causal language modeling task guide](../tasks/language_modeling)
+```python
+
+from transformers import MarianMTModel, MarianTokenizer
+
+# Example: older multilingual model (like en → many)
+model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # English → French, Spanish, Italian, etc.
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+# Prepend the 2-letter ISO 639-1 target language code (older format)
+src_texts = [">>fr<< Hello, how are you today?"]
+
+# Tokenize and translate
+inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
+translated = model.generate(**inputs)
+
+# Decode and print result
+translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
+print(translated_texts[0])
+
+```

 ## MarianConfig

--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -139,6 +139,10 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl

 [[autodoc]] MistralConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## MistralModel

 [[autodoc]] MistralModel
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -227,6 +227,10 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati

 [[autodoc]] Mistral3Config

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## Mistral3Model

 [[autodoc]] Mistral3Model
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -197,6 +197,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 [[autodoc]] MixtralConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## MixtralModel

 [[autodoc]] MixtralModel
--- a/docs/source/en/model_doc/mobilenet_v2.md
+++ b/docs/source/en/model_doc/mobilenet_v2.md
@ -114,6 +114,7 @@ print(f"The predicted class label is: {predicted_class_label}")

 [[autodoc]] MobileNetV2ImageProcessor
    - preprocess
+    - post_process_semantic_segmentation

 ## MobileNetV2ImageProcessorFast

--- a/docs/source/en/model_doc/modernbert-decoder.md
+++ b/docs/source/en/model_doc/modernbert-decoder.md
@ -0,0 +1,155 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
+</div>
+
+# ModernBERT Decoder
+
+ModernBERT Decoder is the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but trained from scratch with a causal language modeling (CLM) objective. This allows for using the same architecture for comparing encoders and decoders. This is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
+
+Like the encoder version, ModernBERT Decoder incorporates modern architectural improvements such as rotary positional embeddings to support sequences of up to 8192 tokens, unpadding to avoid wasting compute on padding tokens, GeGLU layers, and alternating attention patterns. However, it uses causal (unidirectional) attention to enable autoregressive generation.
+
+> [!TIP]
+> Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.
+
+The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`], and from the command line.
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+generator = pipeline(
+    task="text-generation",
+    model="blab-jhu/test-32m-dec",
+    torch_dtype=torch.float16,
+    device=0
+)
+generator("The future of artificial intelligence is", max_length=50, num_return_sequences=1)
+
+# For sequence classification
+classifier = pipeline(
+    task="text-classification",
+    model="blab-jhu/test-32m-dec",
+    torch_dtype=torch.float16,
+    device=0
+)
+classifier("This movie is really great!")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("blab-jhu/test-32m-dec")
+model = AutoModelForCausalLM.from_pretrained(
+    "blab-jhu/test-32m-dec",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+
+prompt = "The future of artificial intelligence is"
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model.generate(
+        **inputs,
+        max_length=50,
+        num_return_sequences=1,
+        temperature=0.7,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"Generated text: {generated_text}")
+
+# For sequence classification
+from transformers import AutoModelForSequenceClassification
+
+classifier_model = AutoModelForSequenceClassification.from_pretrained(
+    "blab-jhu/test-32m-dec",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    num_labels=2
+)
+
+text = "This movie is really great!"
+inputs = tokenizer(text, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = classifier_model(**inputs)
+    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    predicted_class = torch.argmax(predictions, dim=-1)
+
+print(f"Predicted class: {predicted_class.item()}")
+print(f"Prediction probabilities: {predictions}")
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo "The future of artificial intelligence is" | transformers run --task text-generation --model your-username/modernbert-decoder-base --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+## ModernBertDecoderConfig
+
+[[autodoc]] ModernBertDecoderConfig
+
+<frameworkcontent>
+<pt>
+
+## ModernBertDecoderModel
+
+[[autodoc]] ModernBertDecoderModel
+    - forward
+
+## ModernBertDecoderForCausalLM
+
+[[autodoc]] ModernBertDecoderForCausalLM
+    - forward
+
+## ModernBertDecoderForSequenceClassification
+
+[[autodoc]] ModernBertDecoderForSequenceClassification
+    - forward
+
+### Usage tips
+
+The ModernBertDecoder model can be fine-tuned for various text generation tasks using the HuggingFace Transformers library. It supports efficient inference with features like:
+
+- **Causal attention**: Ensures autoregressive generation by masking future tokens
+- **Sliding window attention**: Alternates between local and global attention patterns for efficiency
+- **Rotary positional embeddings**: Enables handling of longer sequences up to 8000 tokens
+- **FlashAttention support**: Optimized attention computation for faster training and inference
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/perception_lm.md
+++ b/docs/source/en/model_doc/perception_lm.md
@ -0,0 +1,68 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PerceptionLM
+
+## Overview
+
+The PerceptionLM model was proposed in [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding/) by Jang Hyun Cho et al. It's a fully open, reproducible model for transparent research in image and video understanding. PLM consists of
+a vision encoder with a small scale (<8B parameters) LLM decoder.
+
+The abstract from the paper is the following:
+
+*Vision-language models are integral to computer vision research, yet many high-performing models
+remain closed-source, obscuring their data, design and training recipe. The research community
+has responded by using distillation from black-box models to label training data, achieving strong
+benchmark results, at the cost of measurable scientific progress. However, without knowing the details
+of the teacher model and its data sources, scientific progress remains difficult to measure. In this
+paper, we study building a Perception Language Model (PLM) in a fully open and reproducible
+framework for transparent research in image and video understanding. We analyze standard training
+pipelines without distillation from proprietary models and explore large-scale synthetic data to identify
+critical data gaps, particularly in detailed video understanding. To bridge these gaps, we release 2.8M
+human-labeled instances of fine-grained video question-answer pairs and spatio-temporally grounded
+video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluating challenging video
+understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a
+video. We make our work fully reproducible by providing data, training recipes, code & models.*
+
+
+This model was contributed by [shumingh](https://huggingface.co/shumingh).
+The original code can be found [here](https://github.com/facebookresearch/perception_models).
+
+
+## PerceptionLMConfig
+
+[[autodoc]] PerceptionLMConfig
+
+## PerceptionLMProcessor
+
+[[autodoc]] PerceptionLMProcessor
+
+## PerceptionLMImageProcessorFast
+
+[[autodoc]] PerceptionLMImageProcessorFast
+
+## PerceptionLMVideoProcessor
+
+[[autodoc]] PerceptionLMVideoProcessor
+
+## PerceptionLMModel
+
+[[autodoc]] PerceptionLMModel
+
+## PerceptionLMForConditionalGeneration
+
+[[autodoc]] PerceptionLMForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/phi4_multimodal.md
+++ b/docs/source/en/model_doc/phi4_multimodal.md
@ -9,44 +9,53 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 -->

-# Phi4 Multimodal
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-EE4C2C?logo=pytorch&logoColor=white&style=flat">
+  </div>
+</div>

-## Overview
+## Phi4 Multimodal

-Phi4 Multimodal is a lightweight open multimodal foundation model that leverages the language, vision, and speech research and datasets used for Phi-3.5 and 4.0 models. The model processes text, image, and audio inputs, generating text outputs, and comes with 128K token context length. The model underwent an enhancement process, incorporating both supervised fine-tuning, direct preference optimization and RLHF (Reinforcement Learning from Human Feedback) to support precise instruction adherence and safety measures. The languages that each modal supports are the following:
+[Phi4 Multimodal](https://huggingface.co/papers/2503.01743) is a multimodal model capable of text, image, and speech and audio inputs or any combination of these. It features a mixture of LoRA adapters for handling different inputs, and each input is routed to the appropriate encoder.

- Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
- Vision: English
- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
+You can find all the original Phi4 Multimodal checkpoints under the [Phi4](https://huggingface.co/collections/microsoft/phi-4-677e9380e514feb5577a40e4) collection.

-This model was contributed by [Cyril Vallez](https://huggingface.co/cyrilvallez). The most recent code can be
-found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py).
+> [!TIP]
+> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez).
+>
+> Click on the Phi-4 Multimodal in the right sidebar for more examples of how to apply Phi-4 Multimodal to different tasks.

+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-`Phi4-multimodal-instruct` can be found on the [Huggingface Hub](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)
+```python
+from transformers import pipeline
+generator = pipeline("text-generation", model="microsoft/Phi-4-multimodal-instruct", torch_dtype="auto", device=0)

-In the following, we demonstrate how to use it for inference depending on the input modalities (text, image, audio).
+prompt = "Explain the concept of multimodal AI in simple terms."
+
+result = generator(prompt, max_length=50)
+print(result[0]['generated_text'])
+```
+
+</hfoption>
+<hfoption id="AutoModel">

 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

-
-# Define model path
 model_path = "microsoft/Phi-4-multimodal-instruct"
 device = "cuda:0"

-# Load model and processor
 processor = AutoProcessor.from_pretrained(model_path)
 model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)

-# Optional: load the adapters (note that without them, the base model will very likely not work well)
-model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
 model.load_adapter(model_path, adapter_name="vision", device_map=device, adapter_kwargs={"subfolder": 'vision-lora'})

-# Part : Image Processing
 messages = [
    {
        "role": "user",
@ -57,7 +66,7 @@ messages = [
    },
 ]

-model.set_adapter("vision") # if loaded, activate the vision adapter
+model.set_adapter("vision")
 inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
@ -66,7 +75,6 @@ inputs = processor.apply_chat_template(
    return_tensors="pt",
 ).to(device)

-# Generate response
 generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
@ -77,10 +85,27 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
+```

+</hfoption>
+</hfoptions>

-# Part 2: Audio Processing
-model.set_adapter("speech") # if loaded, activate the speech adapter
+## Notes
+
+The example below demonstrates inference with an audio and text input.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+
+model_path = "microsoft/Phi-4-multimodal-instruct"
+device = "cuda:0"
+
+processor = AutoProcessor.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device,  torch_dtype=torch.float16)
+
+model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
+model.set_adapter("speech")
 audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
 messages = [
    {
@ -110,6 +135,7 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
+
 ```

 ## Phi4MultimodalFeatureExtractor
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@ -86,6 +86,10 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up

 [[autodoc]] PixtralVisionConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## PixtralVisionModel

 [[autodoc]] PixtralVisionModel
--- a/docs/source/en/model_doc/speech_to_text_2.md
+++ b/docs/source/en/model_doc/speech_to_text_2.md
@ -61,19 +61,16 @@ predicted token ids.
 - Step-by-step Speech Translation

 ```python
->>> import torch
 >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
 >>> from datasets import load_dataset
->>> import soundfile as sf

 >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
 >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")


->>> def map_to_array(batch):
-...     speech, _ = sf.read(batch["file"])
-...     batch["speech"] = speech
-...     return batch
+>>> def map_to_array(example):
+...     example["speech"] = example["audio"]["array"]
+...     return example


 >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
--- a/docs/source/en/model_doc/switch_transformers.md
+++ b/docs/source/en/model_doc/switch_transformers.md
@ -14,35 +14,90 @@ rendered properly in your Markdown viewer.

 -->

-# SwitchTransformers
-
+<div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
    </div>
+</div>

-## Overview
+# Switch Transformers

-The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://huggingface.co/papers/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+[Switch Transformers](https://huggingface.co/papers/2101.03961) is a sparse T5 model where the MLP layer is replaced by a Mixture-of-Experts (MoE). A routing mechanism associates each token with an expert and each expert is a dense MLP. Sparsity enables better scaling and the routing mechanism allows the model to select relevant weights on the fly which increases model capacity.

-The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale.
-During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations.
+You can find all the original Switch Transformers checkpoints under the [Switch Transformer](https://huggingface.co/collections/google/switch-transformers-release-6548c35c6507968374b56d1f) collection.

-The abstract from the paper is the following:

-*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.*
+> [!TIP]
+> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
+>
+> Click on the Switch Transformers models in the right sidebar for more examples of how to apply Switch Transformers to different natural language tasks.

-This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ).
-The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe).
+The example below demonstrates how to predict the masked token with [`Pipeline`], [`AutoModel`], and from the command line.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository.
- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned.
+```python
+import torch
+from transformers import pipeline

-## Resources
+pipeline = pipeline(
+    task="text2text-generation", 
+    model="google/switch-base-8",
+    torch_dtype=torch.float16,
+    device=0
+)
+print(pipeline("The capital of France is <extra_id_0>."))
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
+model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", torch_dtype=torch.float16)
+
+input_text = "The capital of France is <extra_id_0>."
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
+
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "The capital of France is <extra_id_0>." | transformers run --task text2text-generation --model google/switch-base-8 --device 0
+# [{'generated_text': 'Paris.'}]
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes/) to only quantize the weights to 8-bits.
+
+```py
+# pip install bitsandbytes
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
+
+tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", quantization_config=quantization_config)
+
+input_text = "The capital of France is <extra_id_0>."
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
+
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)

 ## SwitchTransformersConfig

--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@ -14,16 +14,25 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
-
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # T5Gemma

-T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large langauge models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
+T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large language models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.

 T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2) sizes (2B-2B, 9B-2B, and 9B-9B), which are based on the offical Gemma 2 models (2B and 9B); and 2) [T5](https://arxiv.org/abs/1910.10683) sizes (Small, Base, Large, and XL), where are pretrained under the Gemma 2 framework following T5 configuration. In addition, we also provide a model at ML size (medium large, ~2B in total), which is in-between T5 Large and T5 XL.

 The pretrained varaints are trained with two objectives: prefix language modeling with knowledge distillation (PrefixLM) and UL2, separately. We release both variants for each model size. The instruction-turned varaints was post-trained with supervised fine-tuning and reinforcement learning.

+> [!TIP]
+> Click on the T5Gemma models in the right sidebar for more examples of how to apply T5Gemma to different language tasks.
+
 The example below demonstrates how to chat with the model with [`Pipeline`] or the [`AutoModel`] class, and from the command line.

 <hfoptions id="usage">
@ -35,43 +44,52 @@ import torch
 from transformers import pipeline

 pipe = pipeline(
-    task="text2text-generation",
-    model="google/t5gemma-placeholder",
+    "text2text-generation",
+    model="google/t5gemma-2b-2b-prefixlm-it",
    torch_dtype=torch.bfloat16,
-    device="cuda",
+    device="cuda",  # replace with "mps" to run on a Mac device
 )

-pipe("Question: Why is the sky blue?\nAnswer:", max_new_tokens=50)
+messages = [
+    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
+]
+prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+
+pipe(prompt, max_new_tokens=32)
 ```

 </hfoption>
 <hfoption id="AutoModel">

 ```python
-import torch
+# pip install accelerate
 from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+import torch

-tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-placeholder")
+tokenizer = AutoTokenizer.from_pretrained("google/t5gemma-2b-2b-prefixlm-it")
 model = AutoModelForSeq2SeqLM.from_pretrained(
-    "google/t5gemma-placeholder",
+    "google/t5gemma-2b-2b-prefixlm-it",
+    device_map="auto",
    torch_dtype=torch.bfloat16,
-    device_map="auto"
 )

-input_text = "Question: Why is the sky blue?\nAnswer:"
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+messages = [
+    {"role": "user", "content": "Tell me an unknown interesting biology fact about the brain."},
+]
+input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True).to("cuda")

 outputs = model.generate(**input_ids, max_new_tokens=32)
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-
+print(tokenizer.decode(outputs[0]))
 ```

 </hfoption>
 <hfoption id="transformers CLI">

 ```
-echo -e "Question: Why is the sky blue? Answer:" | transformers run --task text2text-generation --model google/t5gemma-placeholder --device 0
+echo -e "Write me a poem about Machine Learning. Answer:" | transformers run --task text2text-generation --model google/t5gemma-2b-2b-prefixlm --device 0
 ```
+</hfoption>
+</hfoptions>

 ## T5GemmaConfig

--- a/docs/source/en/model_doc/vitpose.md
+++ b/docs/source/en/model_doc/vitpose.md
@ -10,52 +10,39 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# ViTPose
-
+<div style="float: right;">
  <div class="flex flex-wrap space-x-1">
    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
  </div>
+</div>

-## Overview
+# ViTPose

-The ViTPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://huggingface.co/papers/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. ViTPose employs a standard, non-hierarchical [Vision Transformer](vit) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark. The model was further improved in [ViTPose++: Vision Transformer for Generic Body Pose Estimation](https://huggingface.co/papers/2212.04246) where the authors employ
-a mixture-of-experts (MoE) module in the ViT backbone along with pre-training on more data, which further enhances the performance.
+[ViTPose](https://huggingface.co/papers/2204.12484) is a vision transformer-based model for keypoint (pose) estimation. It uses a simple, non-hierarchical [ViT](./vit) backbone and a lightweight decoder head. This architecture simplifies model design, takes advantage of transformer scalability, and can be adapted to different training strategies.

-The abstract from the paper is the following:
-
-*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.*
+[ViTPose++](https://huggingface.co/papers/2212.04246) improves on ViTPose by incorporating a mixture-of-experts (MoE) module in the backbone and using more diverse pretraining data.

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-architecture.png"
 alt="drawing" width="600"/>

-<small> ViTPose architecture. Taken from the <a href="https://huggingface.co/papers/2204.12484">original paper.</a> </small>
+You can find all ViTPose and ViTPose++ checkpoints under the [ViTPose collection](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335).

-This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi).
-The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose).
-
-## Usage Tips
-
-ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints for each of them.
+The example below demonstrates pose estimation with the [`VitPoseForPoseEstimation`] class.

 ```py
 import torch
 import requests
 import numpy as np
-
+import supervision as sv
 from PIL import Image
-
 from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation

 device = "cuda" if torch.cuda.is_available() else "cpu"

-url = "http://images.cocodataset.org/val2017/000000000139.jpg"
+url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
 image = Image.open(requests.get(url, stream=True).raw)

-# ------------------------------------------------------------------------
-# Stage 1. Detect humans on the image
-# ------------------------------------------------------------------------
-
-# You can choose any detector of your choice
+# Detect humans in the image
 person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
 person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)

@ -67,7 +54,7 @@ with torch.no_grad():
 results = person_image_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
 )
-result = results[0]  # take first image results
+result = results[0]

 # Human label refers 0 index in COCO dataset
 person_boxes = result["boxes"][result["labels"] == 0]
@ -77,10 +64,7 @@ person_boxes = person_boxes.cpu().numpy()
 person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
 person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]

-# ------------------------------------------------------------------------
-# Stage 2. Detect keypoints for each person found
-# ------------------------------------------------------------------------
-
+# Detect keypoints for each person found
 image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
 model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)

@ -90,54 +74,7 @@ with torch.no_grad():
    outputs = model(**inputs)

 pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
-image_pose_result = pose_results[0]  # results for first image
-```
-
-### ViTPose++ models
-
-The best [checkpoints](https://huggingface.co/collections/usyd-community/vitpose-677fcfd0a0b2b5c8f79c4335) are those of the [ViTPose++ paper](https://huggingface.co/papers/2212.04246). ViTPose++ models employ a so-called [Mixture-of-Experts (MoE)](https://huggingface.co/blog/moe) architecture for the ViT backbone, resulting in better performance.
-
-The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
-An overview of the various dataset indices is provided below:
-
- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
-
-Pass the `dataset_index` argument in the forward of the model to indicate which experts to use for each example in the batch. Example usage is shown below:
-
-```python
-image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
-model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
-
-inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
-
-dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
-
-with torch.no_grad():
-    outputs = model(**inputs, dataset_index=dataset_index)
-```
-
-The ViTPose+ checkpoints use 6 experts, hence 6 different dataset indices can be passed. 
-An overview of the various dataset indices is provided below:
-
- 0: [COCO validation 2017](https://cocodataset.org/#overview) dataset, using an object detector that gets 56 AP on the "person" class
- 1: [AiC](https://github.com/fabbrimatteo/AiC-Dataset) dataset
- 2: [MPII](https://www.mpi-inf.mpg.de/departments/computer-vision-and-machine-learning/software-and-datasets/mpii-human-pose-dataset) dataset
- 3: [AP-10K](https://github.com/AlexTheBad/AP-10K) dataset
- 4: [APT-36K](https://github.com/pandorgan/APT-36K) dataset
- 5: [COCO-WholeBody](https://github.com/jin-s13/COCO-WholeBody) dataset
-
-
-### Visualization
-
-To visualize the various keypoints, one can either leverage the `supervision` [library](https://github.com/roboflow/supervision (requires `pip install supervision`):
-
-```python
-import supervision as sv
+image_pose_result = pose_results[0]

 xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy()
 scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy()
@ -162,11 +99,86 @@ annotated_frame = vertex_annotator.annotate(
    scene=annotated_frame,
    key_points=key_points
 )
+annotated_frame
 ```

-Alternatively, one can also visualize the keypoints using [OpenCV](https://opencv.org/) (requires `pip install opencv-python`):
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose.png"/>
+</div>

-```python
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
+
+```py
+# pip install torchao
+import torch
+import requests
+import numpy as np
+from PIL import Image
+from transformers import AutoProcessor, RTDetrForObjectDetection, VitPoseForPoseEstimation, TorchAoConfig
+
+url = "https://www.fcbarcelona.com/fcbarcelona/photo/2021/01/31/3c55a19f-dfc1-4451-885e-afd14e890a11/mini_2021-01-31-BARCELONA-ATHLETIC-BILBAOI-30.JPG"
+image = Image.open(requests.get(url, stream=True).raw)
+
+person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
+person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)
+
+inputs = person_image_processor(images=image, return_tensors="pt").to(device)
+
+with torch.no_grad():
+    outputs = person_model(**inputs)
+
+results = person_image_processor.post_process_object_detection(
+    outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
+)
+result = results[0]
+
+person_boxes = result["boxes"][result["labels"] == 0]
+person_boxes = person_boxes.cpu().numpy()
+
+person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
+person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
+
+quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
+
+image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-huge")
+model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-huge", device_map=device, quantization_config=quantization_config)
+
+inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
+image_pose_result = pose_results[0]
+```
+
+## Notes
+
+- Use [`AutoProcessor`] to automatically prepare bounding box and image inputs.
+- ViTPose is a top-down pose estimator. It uses a object detector to detect individuals first before keypoint prediction.
+- ViTPose++ has 6 different MoE expert heads (COCO validation `0`, AiC `1`, MPII `2`, AP-10K `3`, APT-36K `4`, COCO-WholeBody `5`) which supports 6 different datasets. Pass a specific value corresponding to the dataset to the `dataset_index` to indicate which expert to use.
+
+    ```py
+    from transformers import AutoProcessor, VitPoseForPoseEstimation
+
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-plus-base")
+    model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-plus-base", device=device)
+
+    inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
+    dataset_index = torch.tensor([0], device=device) # must be a tensor of shape (batch_size,)
+
+    with torch.no_grad():
+        outputs = model(**inputs, dataset_index=dataset_index)
+    ```
+
+- [OpenCV](https://opencv.org/) is an alternative option for visualizing the estimated pose.
+
+    ```py
+    # pip install opencv-python
    import math
    import cv2

@ -220,7 +232,6 @@ def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_s
                    else:
                        cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)

-
    # Note: keypoint_edges and color palette are dataset-specific
    keypoint_edges = model.config.edges

@ -267,14 +278,13 @@ for pose_result in image_pose_result:
    pose_image = Image.fromarray(numpy_image)
    pose_image
    ```
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-coco.jpg" alt="drawing" width="600"/>

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTPose. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+Refer to resources below to learn more about using ViTPose.

- A demo of ViTPose on images and video can be found [here](https://huggingface.co/spaces/hysts/ViTPose-transformers).
- A notebook illustrating inference and visualization can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_human_pose_estimation.ipynb).
+- This [notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/ViTPose/Inference_with_ViTPose_for_body_pose_estimation.ipynb) demonstrates inference and visualization.
+- This [Space](https://huggingface.co/spaces/hysts/ViTPose-transformers) demonstrates ViTPose on images and video.

 ## VitPoseImageProcessor

--- a/docs/source/en/model_doc/wav2vec2.md
+++ b/docs/source/en/model_doc/wav2vec2.md
@ -172,9 +172,9 @@ Otherwise, [`~Wav2Vec2ProcessorWithLM.batch_decode`] performance will be slower
 >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))


->>> def map_to_array(batch):
-...     batch["speech"] = batch["audio"]["array"]
-...     return batch
+>>> def map_to_array(example):
+...     example["speech"] = example["audio"]["array"]
+...     return example


 >>> # prepare speech data for batch inference
--- a/docs/source/en/perf_train_tpu_tf.md
+++ b/docs/source/en/perf_train_tpu_tf.md
@ -1,355 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# TPU
-
-TPU (Tensor Processing Unit) is a type of hardware designed to accelerate tensor computations for training and inference. TPUs are generally accessed through Google cloud services, but smaller TPUs are also available for free from [Google Colab](https://colab.research.google.com/notebooks/tpu.ipynb) or [Kaggle](https://www.kaggle.com/docs/tpu).
-
-This guide focuses on training a Keras model for sequence classification on a TPU from Google Colab. Make sure the TPU runtime is enabled by going to **Runtime > Change runtime type** and selecting a TPU.
-
-Run the command below to install the latest version of Transformers and [Datasets](https://huggingface.co/docs/datasets).
-
-```py
-!pip install --U transformers datasets
-```
-
-Create an instance of [tf.distribute.cluster_resolver.TPUClusterResolver](https://www.tensorflow.org/api_docs/python/tf/distribute/cluster_resolver/TPUClusterResolver), and then connect to the remote cluster and initialize the TPUs.
-
-```py
-import tensorflow as tf
-
-resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
-tf.config.experimental_connect_to_cluster(resolver)
-tf.tpu.experimental.initialize_tpu_system(resolver)
-```
-
-There are various distribution strategies for running your model on multiple TPUs. The [tpu.distribute.TPUStrategy](https://www.tensorflow.org/api_docs/python/tf/distribute/TPUStrategy) offers synchronized distributed training.
-
-```py
-strategy = tf.distribute.TPUStrategy(resolver)
-```
-
-Load and tokenize a dataset - this example uses [CoLA](https://huggingface.co/datasets/nyu-mll/glue/viewer/cola) from the GLUE benchmark - and pad all samples to the maximum length so it is easier to load as an array and to avoid [XLA compilation issues](#xla).
-
-```py
-from transformers import AutoTokenizer
-from datasets import load_dataset
-import numpy as np
-
-dataset = load_dataset("glue", "cola")["train"]
-tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-
-train_data = tokenizer(
-    dataset["sentence"],
-    padding="max_length",
-    truncation=True,
-    max_length=128,
-    return_tensors="np",
-)
-train_data = dict(train_data)
-train_labels = np.array(dataset["label"])
-```
-
-The model **must** be created inside [Strategy.scope](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy#scope) in order to replicate the model layers on each TPU device.
-
-```py
-from transformers import TFAutoModelForSequenceClassification
-
-with strategy.scope():
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
-    model.compile(optimizer="adam")
-```
-
-TPUs only accept [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) inputs unlike the Keras [fit](https://keras.io/api/models/model_training_apis/#fit-method) method which accepts a broader range of inputs.
-
-```py
-BATCH_SIZE = 8 * strategy.num_replicas_in_sync
-
-tf_dataset = tf.data.Dataset.from_tensor_slices((train_data, train_labels))
-tf_dataset = tf_dataset.shuffle(len(tf_dataset))
-tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
-```
-
-Finally, call [fit](https://keras.io/api/models/model_training_apis/#fit-method) to start training.
-
-```py
-model.fit(tf_dataset)
-```
-
-## Large datasets
-
-The dataset created above pads every sample to the maximum length and loads the whole dataset into memory. This may not be possible if you're working with larger datasets. When training on large datasets, you may want to create a [tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) or stream the data.
-
-### tf.TFRecord
-
-[tf.TFRecord](https://www.tensorflow.org/tutorials/load_data/tfrecord) is the standard [tf.data](https://www.tensorflow.org/guide/data) format for storing training data. For very large training jobs, it's worth preprocessing your data and storing it in the `tf.TFRecord` format and building a `tf.data` pipeline on top. Refer to the table below to help you decide whether `tf.TFRecord` is helpful for you.
-
-| pros | cons |
-|---|---|
-| works on all TPU instances | costs associated with cloud storage |
-| supports huge datasets and massive throughput | some data types (images) can take a lot of space to store |
-| suitable for training on entire TPU pods |  |
-| preprocessing is done in advance, maximizing training speed |  |
-
-Preprocess and tokenize the dataset before writing it to a `tf.TFRecord` to avoid writing every time the data is loaded.
-
-An exception is made for *train-time augmentations*, because augmentations applied after writing to a `tf.TFRecord` results in the same augmentation for each epoch. Instead, apply augmentations in the `tf.data` pipeline that loads the data.
-
-> [!TIP]
-> In practice, you probably won't be able to load the entire dataset in memory. Load a chunk of the dataset at a time and convert it to `TFRecord`, and repeat until the entire dataset is in the `TFRecord` format. Then you can use a list of all the files to create a `TFRecordDataset`. The example below demonstrates a single file for simplicity.
-
-```py
-tokenized_data = tokenizer(
-    dataset["sentence"],
-    padding="max_length",
-    truncation=True,
-    max_length=128,
-    return_tensors="np",
-)
-labels = dataset["label"]
-
-with tf.io.TFRecordWriter("dataset.tfrecords") as file_writer:
-    for i in range(len(labels)):
-        features = {
-            "input_ids": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=tokenized_data["input_ids"][i])
-            ),
-            "attention_mask": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=tokenized_data["attention_mask"][i])
-            ),
-            "labels": tf.train.Feature(
-                int64_list=tf.train.Int64List(value=[labels[i]])
-            ),
-        }
-        features = tf.train.Features(feature=features)
-        example = tf.train.Example(features=features)
-        record_bytes = example.SerializeToString()
-        file_writer.write(record_bytes)
-```
-
-Build a [TFRecordDataset](https://www.tensorflow.org/api_docs/python/tf/data/TFRecordDataset) using the saved filename to load it.
-
-```py
-def decode_fn(sample):
-    features = {
-        "input_ids": tf.io.FixedLenFeature((128,), dtype=tf.int64),
-        "attention_mask": tf.io.FixedLenFeature((128,), dtype=tf.int64),
-        "labels": tf.io.FixedLenFeature((1,), dtype=tf.int64),
-    }
-    return tf.io.parse_example(sample, features)
-
-# TFRecordDataset can handle gs:// paths
-tf_dataset = tf.data.TFRecordDataset(["gs://matt-tf-tpu-tutorial-datasets/cola/dataset.tfrecords"])
-tf_dataset = tf_dataset.map(decode_fn)
-tf_dataset = tf_dataset.shuffle(len(dataset)).batch(BATCH_SIZE, drop_remainder=True)
-tf_dataset = tf_dataset.apply(
-    tf.data.experimental.assert_cardinality(len(labels) // BATCH_SIZE)
-)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-model.fit(tf_dataset)
-```
-
-### Stream from raw data
-
-Data can be stored in its native format and preprocessed in a [tf.data](https://www.tensorflow.org/guide/data) pipeline as the data is loaded. This approach isn't supported for many models with complex tokenization schemes, but some models like BERT are supported because their tokenization can be compiled. Refer to the table below to help you decide whether this approach is helpful for you.
-
-| pros | cons |
-|---|---|
-| suitable for highly compressed big data in native format (images, audio) | requires writing a full preprocessing pipeline |
-| convenient if raw data is available in a public cloud bucket | complex preprocessing on-the-fly can hurt throughput |
-| works on all TPU instances if data is stored in Google Cloud | must place data in cloud storage if not already there |
-|  | not as suitable for text data because writing a tokenization pipeline is hard (use `TFRecord` for text) |
-
-The example below demonstrates streaming data for an image model.
-
-Load an image dataset and get a list of the underlying image file paths and labels.
-
-```py
-from datasets import load_dataset
-
-image_dataset = load_dataset("beans", split="train")
-filenames = image_dataset["image_file_path"]
-labels = image_dataset["labels"]
-```
-
-Convert the local filenames in the dataset into `gs://` paths in Google Cloud Storage.
-
-```py
-# strip everything but the category directory and filenames
-base_filenames = ['/'.join(filename.split('/')[-2:]) for filename in filenames]
-# prepend the Google Cloud base path to everything instead
-gs_paths = ["gs://matt-tf-tpu-tutorial-datasets/beans/"+filename for filename in base_filenames]
-
-# create tf_dataset
-tf_dataset = tf.data.Dataset.from_tensor_slices(
-    {"filename": gs_paths, "labels": labels}
-)
-tf_dataset = tf_dataset.shuffle(len(tf_dataset))
-```
-
-Transformers preprocessing classes like [`AutoImageProcessor`] are framework-agnostic and can't be compiled into a pipeline by `tf.data`. To get around this, get the normalization values (`mean` and `std`) from the [`AutoImageProcessor`] and use them in the `tf.data` pipeline.
-
-```py
-from transformers import AutoImageProcessor
-
-processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
-image_size = (processor.size["height"], processor.size["width"])
-image_mean = processor.image_mean
-image_std = processor.image_std
-```
-
-Use these normalization values to create a function to load and preprocess the images.
-
-```py
-BATCH_SIZE = 8 * strategy.num_replicas_in_sync
-
-def decode_fn(sample):
-    image_data = tf.io.read_file(sample["filename"])
-    image = tf.io.decode_jpeg(image_data, channels=3)
-    image = tf.image.resize(image, image_size)
-    array = tf.cast(image, tf.float32)
-    array /= 255.0
-    array = (array - image_mean) / image_std
-    array = tf.transpose(array, perm=[2, 0, 1])
-    return {"pixel_values": array, "labels": sample["labels"]}
-
-tf_dataset = tf_dataset.map(decode_fn)
-tf_dataset = tf_dataset.batch(BATCH_SIZE, drop_remainder=True)
-print(tf_dataset.element_spec)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-from transformers import TFAutoModelForImageClassification
-
-with strategy.scope():
-    model = TFAutoModelForImageClassification.from_pretrained(image_model_checkpoint)
-    model.compile(optimizer="adam")
-
-model.fit(tf_dataset)
-```
-
-### Stream with prepare_tf_dataset
-
-[`~TFPreTrainedModel.prepare_tf_dataset`] creates a `tf.data` pipeline that loads samples from [tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset). The pipeline uses [tf.numpy_function]() or [`~datasets.Dataset.from_generator`], which can't be compiled by TensorFlow, to access the underlying `tf.data.Dataset`. It also won't work on a Colab TPU or TPU Nodes because the pipeline streams data from a local disk. Refer to the table below to help you decide whether this approach is helpful for you.
-
-| pros | cons |
-|---|---|
-| simple code | only works on TPU VM |
-| same approach on TPU/GPU | data must be available as a Hugging Face Dataset |
-| dataset doesn't have to fit in memory | data must fit on local storage |
-| supports variable padding | data loading may be a bottleneck on a big TPU pod slice |
-
-[`~TFPreTrainedModel.prepare_tf_dataset`] only works on [TPU VM](#tpu-types). Add the tokenizer output as columns in the dataset since the dataset is stored on disk, which means it can handle data larger than the available memory. Use [`~TFPreTrainedModel.prepare_tf_dataset`] to stream data from the dataset by wrapping it with a `tf.data` pipeline.
-
-```py
-def tokenize_function(examples):
-    return tokenizer(
-        examples["sentence"], padding="max_length", truncation=True, max_length=128
-    )
-# add the tokenizer output to the dataset as new columns
-dataset = dataset.map(tokenize_function)
-
-# prepare_tf_dataset() chooses columns that match the models input names
-tf_dataset = model.prepare_tf_dataset(
-    dataset, batch_size=BATCH_SIZE, shuffle=True, tokenizer=tokenizer
-)
-```
-
-The dataset can now be passed to the [fit](https://keras.io/api/models/model_training_apis/#fit-method) method.
-
-```py
-from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-
-with strategy.scope():
-    model = TFAutoModelForSequenceClassification.from_pretrained(model_checkpoint)
-    model.compile(optimizer="adam")
-
-model.fit(tf_dataset)
-```
-
-## TPU types
-
-There are two types of TPUs, a TPU Node and a TPU VM.
-
-A TPU Node indirectly accesses a remote TPU. It requires a separate VM to initialize your network and data pipeline, and then forwards it to the remote node. Google Colab TPUs are an example of a TPU Node. You can't use local data because the TPU is remotely located, and data must be stored in Google Cloud Storage where the data pipeline can access it.
-
-TPU VM are connected directly to the machine the TPU is located on, and they are generally easier to work with, especially when it comes to your data pipeline.
-
-> [!TIP]
-> We recommend avoiding TPU Nodes if possible because it is more difficult to debug than TPU VMs. TPU Nodes may also be unsupported in the future and become a legacy access method.
-
-A single TPU (v2-8, v3-8, v4-8) runs 8 replicas. TPUs can exist in **pods** which run hundreds or even thousands of replicas simultaneously. When you only use a portion of a pod, it is referred to as a **pod slice**. On Google Colab, you'll typically get a single v2-8 TPU.
-
-## XLA
-
-[XLA](https://openxla.org/xla) is a linear algebra compiler for high-performance execution and it is used by default to improve performance on TPUs.
-
-Before executing your code on a TPU, it's a good idea to try it first on a CPU or GPU because it is easier to debug. You can train for a few steps to make sure the model and data pipeline work as expected. Set `jit_compile=True` in the [compile](https://keras.io/api/models/model_training_apis/#compile-method) method to enable XLA compilation (but remember to remove this line of code before running on a TPU).
-
-The section below outlines three rules for making your code XLA-compatible. Transformers enforce the first two rules for models and loss functions by default, but don't forget about them if you're writing your own models and loss functions.
-
-### Data dependent conditionals
-
-Any `if` statements cannot depend on values inside a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor). The code below can't be compiled by XLA.
-
-```py
-if tf.reduce_sum(tensor) > 10:
-    tensor = tensor / 2.0
-```
-
-To compile with XLA, use [tf.cond](https://www.tensorflow.org/api_docs/python/tf/cond) or remove the conditional and use indicator variables instead as shown below.
-
-```py
-sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32)
-tensor = tensor / (1.0 + sum_over_10)
-```
-
-### Data dependent shapes
-
-The shape of a [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) cannot depend on their values. For example, [tf.unique](https://www.tensorflow.org/api_docs/python/tf/unique) can't be compiled because it returns a tensor containing an instance of each unique value in the input. The shape of this output depends on how repetitive the input [tf.Tensor](https://www.tensorflow.org/api_docs/python/tf/Tensor) is.
-
-This is an issue during **label masking**, where labels are set to a negative value to indicate they should be ignored when computing the loss. The code below can't be compiled by XLA because the shape of `masked_outputs` and `masked_labels` depend on how many positions are masked.
-
-```py
-label_mask = labels >= 0
-masked_outputs = outputs[label_mask]
-masked_labels = labels[label_mask]
-loss = compute_loss(masked_outputs, masked_labels)
-mean_loss = torch.mean(loss)
-```
-
-To compile with XLA, avoid the data-dependent shapes by computing the loss for every position and zeroing out the masked positions in both the numerator and denominator when calculating the mean. Convert `tf.bool` to `tf.float32` as an indicator variable to make your code XLA-compatible.
-
-```py
-label_mask = tf.cast(labels >= 0, tf.float32)
-loss = compute_loss(outputs, labels)
-loss = loss * label_mask
-mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask)
-```
-
-### Recompile different input shapes
-
-XLA recompiles your model if input shapes are variable which create huge performance problems. It is especially common in text models because input texts have variable lengths after tokenization.
-
-> [!WARNING]
-> Execessive padding can also severely slow down training because requires more compute and memory to process.
-
-To avoid different shapes, use padding to pad all your inputs to the same length and use an `attention_mask`. Try padding batches of samples to a multiple of 32 or 64 tokens. Use the parameters `padding="max_length"`, `padding="longest"`, or `pad_to_multiple_of` to help with padding. This often increases the number of tokens by a small amount, but it significantly reduces the number of unique input shapes because every input shape is a multiple of 32 or 64. Fewer unique input shapes requires fewer recompilation.
--- a/docs/source/en/quantization/hqq.md
+++ b/docs/source/en/quantization/hqq.md
@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.

 HQQ further supports fine-tuning with [PEFT](https://huggingface.co/docs/peft) and is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) for even faster inference and training.

-Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels.
+Install HQQ with the following command to get the latest version and to build its corresponding CUDA kernels if you are using a cuda device. It also support Intel XPU with pure pytorch implementation.

 ```bash
 pip install hqq
@ -34,13 +34,14 @@ You can choose to either replace all the linear layers in a model with the same
 Quantize a model by creating a [`HqqConfig`] and specifying the `nbits` and `group_size` to replace for all the linear layers ([torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html)) of the model.

 ``` py
+import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, HqqConfig

 quant_config = HqqConfig(nbits=8, group_size=64)
 model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", 
    torch_dtype=torch.float16, 
-    device_map="cuda", 
+    device_map="auto", 
    quantization_config=quant_config
 )
 ```
@ -67,7 +68,7 @@ quant_config  = HqqConfig(dynamic_config={
 model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B", 
    torch_dtype=torch.float16, 
-    device_map="cuda", 
+    device_map="auto", 
    quantization_config=quant_config
 )
 ```
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -16,7 +16,9 @@ rendered properly in your Markdown viewer.

 # Serving

-Transformer models can be served for inference with specialized libraries such as Text Generation Inference (TGI) and vLLM. These libraries are specifically designed to optimize performance with LLMs and include many unique optimization features that may not be included in Transformers.
+Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users.
+
+You can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.

 ## TGI

@ -62,3 +64,164 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --model-impl transformers \
    --trust-remote-code
 ```
+
+## Serve CLI
+
+> [!WARNING]
+> This section is experimental and subject to change in future versions
+
+<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
+You can serve LLMs supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers a chat Completions API compatible with the OpenAI SDK, which is the _de facto_ standard for LLM conversations. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)).
+
+To launch a server, simply use the `transformers serve` CLI command:
+
+```shell
+transformers serve
+```
+
+The simplest way to interact with the server is through our `transformers chat` CLI
+
+```shell
+transformers chat localhost:8000 --model-name-or-path Qwen/Qwen3-4B
+```
+
+or by sending an HTTP request with `cURL`, e.g.
+
+```shell
+curl -X POST http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{"messages": [{"role": "system", "content": "hello"}], "temperature": 0.9, "max_tokens": 1000, "stream": true, "model": "Qwen/Qwen2.5-0.5B-Instruct"}'
+```
+
+from which you'll receive multiple chunks in the Completions API format
+
+```shell
+data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
+
+data: {"object": "chat.completion.chunk", "id": "req_0", "created": 1751377863, "model": "Qwen/Qwen2.5-0.5B-Instruct", "system_fingerprint": "", "choices": [{"delta": {"role": "assistant", "content": "", "tool_call_id": null, "tool_calls": null}, "index": 0, "finish_reason": null, "logprobs": null}]}
+
+(...)
+```
+
+The server is also an MCP client, so it can interact with MCP tools in agentic use cases. This, of course, requires the use of an LLM that is designed to use tools.
+
+> [!TIP]
+> At the moment, MCP tool usage in `transformers` is limited to the `qwen` family of models.
+
+<!-- TODO: example with a minimal python example, and explain that it is possible to pass a full generation config in the request -->
+
+
+### Usage example 1: apps with local requests (feat. Jan)
+
+This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.
+
+To connect `transformers serve` with Jan, you'll need to set up a new model provider ("Settings" > "Model Providers"). Click on "Add Provider", and set a new name. In your new model provider page, all you need to set is the "Base URL" to the following pattern:
+
+```shell
+http://[host]:[port]/v1
+```
+
+where `host` and `port` are the `transformers serve` CLI parameters (`localhost:8000` by default). After setting this up, you should be able to see some models in the "Models" section, hitting "Refresh". Make sure you add some text in the "API key" text field too -- this data is not actually used, but the field can't be empty. Your custom model provider page should look like this:
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_jan_model_providers.png"/>
+</h3>
+
+You are now ready to chat!
+
+> [!TIP]
+> You can add any `transformers`-compatible model to Jan through `transformers serve`. In the custom model provider you created, click on the "+" button in the "Models" section and add its Hub repository name, e.g. `Qwen/Qwen3-4B`.
+
+To conclude this example, let's look into a more advanced use-case. If you have a beefy machine to serve models with, but prefer using Jan on a different device, you need to add port forwarding. If you have `ssh` access from your Jan machine into your server, this can be accomplished by typing the following to your Jan machine's terminal
+
+```
+ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_ssh_into_your_server
+```
+
+Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.
+
+
+### Usage example 2: apps with external requests (feat. Cursor)
+
+This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. Unlike in the previous example, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.
+
+To launch our server with CORS enabled, run
+
+```shell
+transformers serve --enable-cors
+```
+
+We'll also need to expose our server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
+
+```shell
+ngrok http [port]
+```
+
+where `port` is the port used by `transformers serve` (`8000` by default). On the terminal where you launched `ngrok`, you'll see an https address in the "Forwarding" row, as in the image below. This is the address to send requests to.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
+</h3>
+
+We're now ready to set things up on the app side! In Cursor, while we can't set a new provider, we can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set our `transformers serve` endpoint, follow this order:
+1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
+2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
+3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
+4. Add the https address from `ngrok` to the "Override OpenAI Base URL" field, appending `/v1` to the address (i.e. `https://(...).ngrok-free.app/v1`);
+5. Hit "Verify".
+
+After you follow these steps, your "Models" tab should look like the image below. Your server should also have received a few requests from the verification step.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor.png"/>
+</h3>
+
+You are now ready to use your local model in Cursor! For instance, if you toggle the AI Pane, you can select the model you added and ask it questions about your local files.
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_cursor_chat.png"/>
+</h3>
+
+
+### Usage example 3: `tiny-agents` CLI and MCP Tools
+
+To showcase the use of MCP tools, let's see how to integrate the `transformers serve` server with the [`tiny-agents`](https://huggingface.co/blog/python-tiny-agents) CLI.
+
+> [!TIP]
+> Many Hugging Face Spaces can be used as MCP servers, as in this example. You can find all compatible Spaces [here](https://huggingface.co/spaces?filter=mcp-server).
+
+The first step to use MCP tools is to let the model know which tools are available. As an example, let's consider a `tiny-agents` configuration file with a reference to an [image generation MCP server](https://evalstate-flux1-schnell.hf.space/).
+
+```json
+{
+    "model": "Menlo/Jan-nano",
+    "endpointUrl": "http://localhost:8000",
+    "servers": [
+        {
+            "type": "sse",
+            "url": "https://evalstate-flux1-schnell.hf.space/gradio_api/mcp/sse"
+        }
+    ]
+}
+```
+
+You can then launch your `tiny-agents` chat interface with the following command.
+
+```bash
+tiny-agents run path/to/your/config.json
+```
+
+If you have `transformers serve` running in the background, you're ready to use MCP tools from a local model! For instance, here's the example of a chat session with `tiny-agents`:
+
+```bash
+Agent loaded with 1 tools:
+ • flux1_schnell_infer
+»  Generate an image of a cat on the moon
+<Tool req_0_tool_call>flux1_schnell_infer {"prompt": "a cat on the moon", "seed": 42, "randomize_seed": true, "width": 1024, "height": 1024, "num_inference_steps": 4}
+
+Tool req_0_tool_call
+[Binary Content: Image image/webp, 57732 bytes]
+The task is complete and the content accessible to the User
+Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/3dbddc0e53b5a865ed56a4e3dbdd30f3f61cf3b8aabf1b456f43e5241bd968b8/image.webp
+380576952
+
+I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
+```
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@ -1,129 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# XLA
-
-[[open-in-colab]]
-
-[Accelerated Linear Algebra (XLA)](https://openxla.org/xla) is a linear algebra compiler that optimizes model runtime across different hardware and frameworks.
-
-This guide will look specifically at how to accelerate *TensorFlow* models with XLA.
-
-## TensorFlow
-
-XLA can potentially accelerate a TensorFlow model without making any source code changes. It is already packaged with the TensorFlow library, and it is triggered with `jit_compile` in any graph creating function such as [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-If you're using Keras methods like [fit](https://keras.io/api/models/model_training_apis/#fit-method) and [predict](https://keras.io/api/models/model_training_apis/#predict-method), enable XLA by passing `jit_compile=True` to [compile](https://keras.io/api/models/model_training_apis/#compile-method).
-
-```py
-model.compile(jit_compile=True)
-```
-
-XLA can be used to accelerate any arbitrary [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-Models with a TensorFlow implementation like [GPT2](./model_doc/gpt2), [T5](./model_doc/t5), [OPT](./model_doc/opt), and [Whisper](./model_doc/whisper) are XLA compatible. The speed up depends on a model, but in general, TensorFlow models in Transformers get a ~100x speed up.
-
-### Functions
-
-A typical forward pass in a TensorFlow model is shown below. To run a forward pass with XLA, wrap the model with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function) and set `jit_compile=True`.
-
-```diff
-import tensorflow as tf
-
-model = tf.keras.Sequential(
-    [tf.keras.layers.Dense(10, input_shape=(10,), activation="relu"), tf.keras.layers.Dense(5, activation="softmax")]
-)
-# Generate random inputs for the model.
-batch_size = 16
-input_vector_dim = 10
-random_inputs = tf.random.normal((batch_size, input_vector_dim))
-
-# Run a forward pass.
- _ = model(random_inputs)
-+ xla_fn = tf.function(model, jit_compile=True)
-+ _ = xla_fn(random_inputs)
-```
-
-The default `call` function of the model is used to compile the XLA graph. But if there's any other model function you want to compile with XLA, wrap them with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-```py
-my_xla_fn = tf.function(model.my_xla_fn, jit_compile=True)
-```
-
-### Text generation
-
-You could also compile other model functions with XLA. For example, enable XLA for text generation by wrapping [`~TFGenerationMixin.generate`] with [tf.function](https://www.tensorflow.org/api_docs/python/tf/function).
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-# Will error if the minimal version of Transformers is not installed.
-from transformers.utils import check_min_version
-
-check_min_version("4.21.0")
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-tokenized_input = tokenizer(input_string, return_tensors="tf")
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-"Generated -- TensorFlow is an open-source, open-source, distributed-source application framework for the"
-```
-
-## Tracing
-
-When executing an XLA-enabled function for the first time, it tries to infer the computation graph in a process known as *tracing*. This is a time-consuming step, but any consecutive calls to the function will be much faster because it won't have to trace the computation graph again.
-
-To ensure a function is only traced once, the inputs must have the same shape as when the graph was built. This usually isn't an issue for fixed input shapes like images, but it can be an issue for inputs with variable shapes like text.
-
-One way to handle this is to pad your text so it always has the same shape. Configure padding options such as [pad_to_multiple_of](https://hf.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.pad.pad_to_multiple_of) in the tokenizer.
-
-```py
-import tensorflow as tf
-from transformers import AutoTokenizer, TFAutoModelForCausalLM
-
-tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="</s>")
-model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
-input_string = ["TensorFlow is"]
-
-xla_generate = tf.function(model.generate, jit_compile=True)
-
-# Call tokenizer with padding options.
-tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf")
-
-generated_tokens = xla_generate(**tokenized_input, num_beams=2)
-decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
-print(f"Generated -- {decoded_text}")
-```
-
-In addition to the input shape, any changes to the generation options at any point also triggers tracing.
-
-## Resources
-
-Learn more about XLA with the following resources.
-
- A [notebook](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/91_tf_xla_generate.ipynb) demonstrating XLA-compatible encoder-decoder and decoder-only text generation models.
- The [Faster Text Generation with TensorFlow and XLA](https://hf.co/blog/tf-xla-generate) blog post compares benchmarks for XLA-compatible models and provides a friendly introduction to XLA in TensorFlow.
- The [How Hugging Face improved Text Generation performance with XLA](https://blog.tensorflow.org/2022/11/how-hugging-face-improved-text-generation-performance-with-xla.html) blog post discusses the design philosophy behind adding XLA to TensorFlow models in Transformers.
- The [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs) guide.
- The [Better performance with tf.function](https://www.tensorflow.org/guide/function) guide.
- The [XLA](https://openxla.org/xla) documentation.
--- a/docs/source/en/tools.md
+++ b/docs/source/en/tools.md
@ -14,5 +14,9 @@ rendered properly in your Markdown viewer.

 -->

+# Tools
+
+(deprecated)
+
 > [!WARNING]
 > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -187,13 +187,13 @@ from torch import nn
 from transformers import Trainer

 class CustomTrainer(Trainer):
-    def compute_losss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
+    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss for 3 labels with different weights
-        reduction = "mean" if num_items_in_batch is not None else "sum"
+        reduction = "sum" if num_items_in_batch is not None else "mean"
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device, reduction=reduction))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        if num_items_in_batch is not None:
--- a/docs/source/it/perf_train_special.md
+++ b/docs/source/it/perf_train_special.md
@ -7,6 +7,7 @@ http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.

 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -157,6 +157,8 @@
    title: (번역중) VPTQ
  - local: quantization/quanto
    title: Quanto
+  - local: quantization/quark
+    title: Quark
  - local: quantization/eetq
    title: EETQ
  - local: in_translation
@ -225,7 +227,7 @@
 - sections:
  - local: philosophy
    title: 이념과 목표
-  - local: in_translation
+  - local: glossary
    title: (번역중) Glossary
  - local: task_summary
    title: 🤗 Transformers로 할 수 있는 작업
--- a/docs/source/ko/glossary.md
+++ b/docs/source/ko/glossary.md
@ -0,0 +1,454 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 용어집(Glossary)
+
+이 용어집은 전반적인 머신러닝 및 🤗 Transformers 관련 용어를 정의하여 문서를 더 잘 이해하는 데 도움을 줍니다.
+
+## A
+
+### 어텐션 마스크 (attention mask)
+
+어텐션 마스크(attention mask)는 여러 시퀀스를 배치(batch)로 처리할 때 사용되는 선택적 인자입니다.
+
+<Youtube id="M6adb1j2jPI"/>
+
+이 인자는 모델에게 어떤 토큰에 주의를 기울여야 하는지, 그리고 어떤 토큰은 무시해야 하는지를 알려줍니다.
+
+예를 들어, 다음 두 개의 시퀀스가 있다고 가정해 봅시다:
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+>>> sequence_a = "This is a short sequence."
+>>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+
+>>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+>>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]
+```
+
+인코딩된 버전들의 길이가 다릅니다:
+
+```python
+>>> len(encoded_sequence_a), len(encoded_sequence_b)
+(8, 19)
+```
+
+따라서 이 두 시퀀스를 그대로 하나의 텐서에 넣을 수는 없습니다. 첫 번째 시퀀스를 두 번째 길이에 맞춰 패딩 하거나, 반대로 두 번째 시퀀스를 첫 번째 길이에 맞춰 잘라내야 합니다.
+
+첫 번째 경우에는 ID 목록이 패딩 인덱스로 확장됩니다. 이렇게 패딩을 적용하려면 토크나이저에 리스트를 전달하고 다음과 같이 요청할 수 있습니다:
+
+```python
+>>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
+```
+
+첫 번째 문장 오른쪽에 0이 추가되어 두 번째 문장과 길이가 같아진 것을 볼 수 있습니다:
+
+```python
+>>> padded_sequences["input_ids"]
+[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+```
+
+이것은 PyTorch나 TensorFlow의 텐서로 변환될 수 있습니다. 어텐션 마스크는 모델이 패딩 된 인덱스를 참조하지 않도록 해당 위치를 나타내는 이진 텐서입니다. [`BertTokenizer`]의 경우, `1`은 어텐션이 필요한 값을 나타내고, `0`은 패딩 된 값을 나타냅니다. 이 어텐션 마스크는 토크나이저가 반환되는 딕셔너리의 "attention_mask" 키 아래에 포함되어 있습니다:
+
+```python
+>>> padded_sequences["attention_mask"]
+[[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+```
+
+### 오토인코딩 모델 (autoencoding models)
+
+[인코더 모델](#encoder-models)과 [마스킹된 언어 모델링](#masked-language-modeling-mlm)을 참고하세요.
+
+### 자기회귀 모델 (autoregressive models)
+
+[인과적 언어 모델링](#causal-language-modeling)과 [디코더 모델](#decoder-models)을 참고하세요.
+
+## B
+
+### 백본 (backbone)
+
+백본(backbone)은 원시(hidden) 은닉 상태(hidden state) 또는 특징(feature)을 출력하는 네트워크(임베딩과 레이어)입니다. 일반적으로 이 백본은 해당 특징을 입력으로 받아 예측을 수행하는 [헤드](#head)와 연결됩니다. 예를 들어, [`ViTModel`]은 특정 헤드가 없는 백본입니다. 다른 모델들도[`VitModel`]을 백본으로 사용할 수 있으며, [DPT](model_doc/dpt)등이 그 예시입니다.
+
+## C
+
+### 인과적 언어 모델링 (causal language modeling)
+
+모델이 텍스트를 순서대로 읽으며 다음 단어를 예측해야 하는 사전 학습(pretraining) 작업입니다. 일반적으로 문장을 전체로 읽되, 모델 내부에서 특징 시점 이후의 토큰을 마스킹(masking)하여 다음 단어를 예측하게 됩니다.
+
+### 채널 (channel)
+
+컬러 이미지는 빨간색(R), 초록색(G), 파란색(B)의 세 채널 값을 조합하여 구성되며, 흑백 이미지는 단일 채널만을 가집니다. 🤗 Transformers에서는 이미지 텐서의 채널이 첫 번째 또는 마지막 차원에 위치할 수 있습니다:[`n_channels`, `height`, `width`] 또는 [`height`, `width`, `n_channels`]와 같은 형식입니다.
+
+### 연결 시간분류(connectionist temporal classification, CTC) 
+
+입력과 출력의 정렬 상태를 정확히 몰라도 모델이 학습할 수 있도록 돕는 알고리즘입니다. CTC는 주어진 입력에 대해 가능한 모든 출력의 확률 분포를 계산하고, 그중 가장 가능성이 높은 출력을 선택합니다. CTC는 말하는 속도의 차이 등 여러 이유로 음성과 텍스트가 항상 정확하게 일치하지 않기 때문에 음성 인식 작업에서 자주 사용됩니다.
+
+### 컨볼루션 (convolution)
+
+신경망에서 사용되는 레이어의 한 종류로, 입력 행렬에 대해 더 작은 행렬(커널 또는 필터)을 원소별로 곱한 뒤 그 값을 합산해 새로운 행렬을 만드는 연산입니다. 이 연산을 컨볼루션 연산이라고 하며, 입력 행렬 전체에 걸쳐 반복적으로 수행됩니다. 각 연산은 입력 행렬의 서로 다른 구간에 적용됩니다. 컨볼루션 신경망(CNN)은 컴퓨터 비전 분야에서 널리 사용됩니다.
+
+## D
+
+### 데이터 병렬화 (DataParallel)
+
+여러 개의 GPU에서 훈련을 수행할 때 사용하는 병렬화 기법으로, 동일한 모델 구성이 여러 번 복제되며 각 인스턴스는 서로 다른 데이터 조각을 받습니다. 모든 인스턴스는 병렬로 처리를 수행하며, 각 훈련 단계가 끝난 후 결과를 동기화합니다.
+
+DataParallel 방식에 대해 더 알아보려면 [여기](perf_train_gpu_many#dataparallel-vs-distributeddataparallel)를 참고하세요.
+
+### 디코더 입력 ID (decoder input IDs)
+
+이 입력은 인코더-디코더 모델에 특화된 것으로, 디코더에 전달될 input ID 들을 포함합니다. 이러한 입력은 번역이나 요약과 같은 시퀀스-투-시퀀스(sequence-to-sequence) 작업에 사용되며, 일반적으로 모델마다 고유한 방식으로 구성됩니다.
+
+대부분의 인코더-디코더 모델(BART, T5 등)은 `labels`로부터 자동으로 `decoder_input_ids`를 생성합니다. 이러한 모델에서는 학습 시 `labels`를 전달하는 것이 일반적으로 권장됩니다.
+
+시퀀스-투-시퀀스 학습에서 각 모델이 이러한 input ID를 어떻게 처리하는지는 모델 문서를 참고하시기를 바랍니다.
+
+### 디코더 모델 (decoder models)
+
+자기회귀 모델(Autoregressive models)이라고도 불리는 디코더 모델은 인과 언어 모델링(causal language modeling)이라 불리는 사전 학습 작업을 수행합니다. 이 작업에서는 모델이 텍스트를 순서대로 읽고 다음 단어를 예측해야 합니다. 일반적으로 문장의 전체를 읽되, 특정 시점 이후의 토큰은 마스크로 가려 예측하게 합니다.
+
+<Youtube id="d_ixlCubqQw"/>
+
+### 딥러닝 (deep learning)
+
+여러 층의 신경망(neural network)을 사용하는 머신러닝 알고리즘입니다.
+
+## E
+
+### 인코더 모델 (encoder models)
+
+자동 인코딩 모델(Autoencoding models)이라고도 불리는 인코더 모델은 텍스트나 이미지와 같은 입력을 받아 임베딩이라 불리는 압축된 수치 표현으로 반환합니다. 일반적으로 인코더 모델은 입력 시퀀스의 일부를 마스킹하고 더 의미 있는 표현을 생성하도록 학습하는 [masked language modeling](#masked-language-modeling-mlm)과 같은 기술을 사용하여 사전 학습됩니다.
+
+<Youtube id="H39Z_720T5s"/>
+
+## F
+
+### 특징 추출 (feature extraction)
+
+머신러닝 알고리즘이 더 효과적으로 학습할 수 있도록, 원시 데이터를 선택하고 변환하여 더 유용한 특징(feature) 집합으로 만드는 과정입니다. 예를 들어, 원시 텍스트를 워드 임베딩으로 변환하거나 이미지나 비디오 데이터에서 윤곽선이나 형태와 같은 중요한 특징을 추출하는 것이 있습니다.
+
+### 피드 포워드 청킹 (feed forward chunking)
+
+트랜스포머의 각 residual attention Block에서는 self-Attention Layer 다음에 보통 두 개의 Feed Forward Layer가 이어집니다. 이 Feed Forward Layers의 중간 임베딩 크기는 종종 모델의 히든 사이즈(hidden size)보다 큽니다(예:
+`google-bert/bert-base-uncased` 모델의 경우).
+
+입력 크기가 `[batch_size, sequence_length]`일 경우, 중간 Feed Forward 임베딩
+`[batch_size, sequence_length, config.intermediate_size]`을 저장하는 데 필요한 메모리는 전체 메모리 사용량의 큰 부분을 차지할 수 있습니다.
+[Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 논문의 저자들은 이 연산이 `sequence_length` 차원에 대해 독립적이기 때문에,토큰마다 Feed Forward Layer의 출력 임베딩을 각 토큰별로 `[batch_size, config.hidden_size]`을 개별적으로 계산한 뒤, 이를 이어 붙여 `[batch_size, sequence_length, config.hidden_size]` 형태로 만들 수 있습니다.`n = sequence_length`. 이 방식은 계산 시간은 늘어나지만, 메모리 사용량은 줄어들게 됩니다.
+
+[`apply_chunking_to_forward`] 함수를 사용하는 모델의 경우, `chunk_size`는 병렬로 계산되는 출력 임베딩의 개수를 정의하며, 이는 메모리 사용량과 계산 시간 간의 트레이드오프를 결정합니다.
+`chunk_size`가 0으로 설정되면, 피드 포워드 청킹(Feed Forward Chunking)은 수행되지 않습니다.
+
+### 파인튜닝 모델 (finetuned models)
+
+파인튜닝(Finetuning)은 전이 학습(transfer learning)의 한 형태로, 사전 학습된 (pretrained) 모델을 사용하여 가중치를 고정(freeze)하고, 출력층을 새롭게 추가된 [모델 헤드](#head)로 교체한 뒤, 해당 모델 헤드를 목표 데이터셋에 맞게 학습시키는 방식입니다.
+
+자세한 내용은 [Fine-tune a pretrained model](https://huggingface.co/docs/transformers/training) 튜토리얼을 참고하시고, 🤗 Transformers를 사용해 모델을 파인 튜닝하는 방법도 함께 확인해 보세요.
+
+## H
+
+### 헤드 (head)
+
+모델 헤드(model head)란 신경망의 마지막 층을 의미하며, 이 층은 이전 층에서 나온 히든 상태(hidden states)를 받아 다른 차원으로 변환합니다. 각 작업(task)에 따라 서로 다른 모델 헤드가 사용됩니다. 예를 들어:
+
+  * [`GPT2ForSequenceClassification`]은 기본 [`GPT2Model`] 위에 시퀀스 분류를 위한 선형계층(linear layer)을 추가한 모델 헤드입니다.
+  * [`ViTForImageClassification`]은 이미지 분류를 위한 모델 헤드로, 기본 [`ViTModel`] 위에 `CLS` 토큰의 마지막 히든 상태에 선형 계층(linear layer)을 추가한 구조입니다.
+  * [`Wav2Vec2ForCTC`]는 기본 [`Wav2Vec2Model`] 위에 [CTC](#connectionist-temporal-classification-ctc)를 적용한 언어 모델링 헤드입니다.
+
+## I
+
+### 이미지 패치 (image patch)
+
+비전 기반 Transformer 모델은 이미지를 작은 패치로 분할한 후, 각 패치를 선형 임베딩하여 시퀀스로 모델에 입력합니다. 모델의 구성 파일에서 `patch_size`(또는 해상도)를 확인할 수 있습니다.
+
+### 인퍼런스 (inference)
+
+인퍼런스는 학습이 완료된 모델에 새로운 데이터를 입력하여 예측을 수행하는 과정입니다. 🤗 Transformer에서 인퍼런스를 수행하는 방법은 [Pipeline for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) 튜토리얼을 참고하세요.
+
+### 입력 ID (input IDs)
+
+입력 ID는 종종 모델에 입력으로 전달해야 하는 유일한 필수 파라미터입니다. 이들은 토큰의 인덱스로, 모델이 입력으로 사용할 시퀀스를 구성하는 토큰들의 숫자 표현입니다.
+
+<Youtube id="VFp38yj8h3A"/>
+
+토크나이저마다 작동 방식은 다르지만, 기본 메커니즘은 동일합니다. 다음은 [WordPiece](https://arxiv.org/pdf/1609.08144.pdf) 토크나이저인 BERT 토크나이저를 사용한 예시입니다:
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+
+>>> sequence = "A Titan RTX has 24GB of VRAM"
+```
+
+토크나이저는 시퀀스를 토크나이저의 토큰 목록에 있는 항목으로 분리합니다.
+
+```python
+>>> tokenized_sequence = tokenizer.tokenize(sequence)
+```
+
+토큰은 단어이거나 서브 워드(subword)입니다. 예를 들어, "VRAM"은 모델의 어휘 사전에 없는 단어이기 때문에 "V", "RA", "M"으로 나뉘었습니다. 이 토큰들이 개별 단어가 아니라 같은 단어의 일부임을 나타내기 위해 "RA"와 "M" 앞에 더블 해시(`##`)가 추가 됩니다.
+
+```python
+>>> print(tokenized_sequence)
+['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+```
+
+이러한 토큰들은 모델이 이해할 수 있는 ID로 변환될 수 있습니다. 이 과정은 문장을 바로 토크나이저에 입력함으로써 수행되며, 성능 최적화를 위해 [🤗 Tokenizers](https://github.com/huggingface/tokenizers)의 Rust 구현을 활용합니다.
+
+```python
+>>> inputs = tokenizer(sequence)
+```
+
+토크나이저는 해당 모델이 올바르게 작동하는 데 필요한 모든 인자를 포함한 딕셔너리를 반환합니다. 토큰 인덱스는 `input_ids`라는 키에 저장됩니다.
+
+```python
+>>> encoded_sequence = inputs["input_ids"]
+>>> print(encoded_sequence)
+[101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+```
+
+토크나이저는 (연결된 모델이 이를 사용하는 경우) 자동으로 "특수 토큰"을 추가합니다. 이들은 모델이 특정 상황에서 사용하는 특별한 ID입니다.
+
+이전의 ID 시퀀스를 디코딩하면,
+
+```python
+>>> decoded_sequence = tokenizer.decode(encoded_sequence)
+```
+
+우리는 다음과 같은 결과를 보게 될 것입니다.
+
+```python
+>>> print(decoded_sequence)
+[CLS] A Titan RTX has 24GB of VRAM [SEP]
+```
+
+이는 [`BertModel`]이 입력값을 기대하는 방식이기 때문입니다.
+
+## L
+
+### 레이블 (labels)
+
+레이블은 모델이 손실(loss)을 직접 계산할 수 있도록 전달되는 선택적 인자입니다. 이 레이블은 모델이 예측해야 할 정답 값을 의미하며, 모델은 예측값과 이 정답(label) 사이의 차이를 표준 손실 함수를 이용해 계산하게 됩니다.
+
+이 레이블(label)의 형태는 모델 헤드(model head)의 종류에 따라 달라집니다. 예를 들어:
+
+- 시퀀스 분류 모델([`BertForSequenceClassification`] 등)의 경우, 모델은
+  `(batch_size)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 전체 시퀀스에 대한 예상 레이블을 나타냅니다.
+- 토큰 분류 모델([`BertForTokenClassification`] 등)의 경우, 모델은 `(batch_size, seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다.
+- 마스킹 언어 모델([`BertForMaskedLM`])의 경우, 모델은 `(batch_size,seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다. 레이블은 마스킹 된 토큰의 토큰 ID이며, 나머지 토큰에 대해서는 무시할 값을 사용합니다(일반적으로 -100).
+- 시퀀스 투 시퀀스 작업([`BartForConditionalGeneration`], [`MBartForConditionalGeneration`]등)의 경우, 모델은 `(batch_size, tgt_seq_length)` 차원의 텐서를 입력으로 받으며, 각 값은 입력 시퀀스에 대응하는 타겟 시퀀스를 나타냅니다. 학습 중에는 BART와 T5가 적절한 `decoder_input_ids`와 디코더 attention 마스크를 내부적으로 생성하므로, 일반적으로 따로 제공할 필요가 없습니다. 단, 이는 Encoder-Decoder 프레임워크를 직접 활용하는 모델에는 적용되지 않습니다.
+- 이미지 분류 모델([`ViTForImageClassification`] 등)의 경우, 모델은 `(batch_size)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 개별 이미지에 대한 예상 레이블을 나타냅니다.
+- 시멘틱 세그멘테이션 모델([`SegformerForSemanticSegmentation`] 등)의 경우, 모델은 `(batch_size, height, width)` 차원의 텐서를 입력으로 받으며, 배치의 각 값은 개별 픽셀에 대한 예상 레이블을 나타냅니다.
+- 객체 탐지 모델([`DetrForObjectDetection`] 등)의 경우, 모델은 `class_labels`와 `boxes` 키를 포함하는 딕셔너리들의 리스트를 입력으로 받습니다. 배치의 각 값은 개별 이미지에 대한 예상 클래스 레이블과 바운딩 박스 정보를 나타냅니다.
+- 자동 음성 인식 모델([`Wav2Vec2ForCTC`] 등)의 경우 모델은 `(batch_size,target_length)` 차원의 텐서를 입력으로 받으며, 각 값은 개별 토큰에 대한 예상 레이블을 나타냅니다.
+  
+<Tip>
+
+모델마다 요구하는 레이블 형식이 다를 수 있으므로, 각 모델의 문서를 확인하여 해당 모델에 맞는 레이블 형식을 반드시 확인하세요!
+
+</Tip>
+
+기본 모델([`BertModel`] 등)은 레이블을 입력으로 받지 않습니다. 이러한 모델은 단순히 특징(feature)을 출력하는 기본 트랜스포머 모델이기 때문입니다.
+
+### 대규모 언어 모델 (LLM)
+
+대규모 데이터로 학습된 트랜스포머 언어 모델(GPT-3, BLOOM, OPT 등)을 지칭하는 일반적인 용어입니다. 이러한 모델은 학습할 수 있는 파라미터(parameter)의 수가 매우 많으며, 예를 들어 GPT-3는 약 1,750억 개의 파라미터를 가지고 있습니다.
+
+## M
+
+### 마스킹된 언어 모델링 (MLM)
+
+사전 학습 단계 중 하나로, 모델은 일부 토큰이 무작위로 마스킹 된 손상된 문장을 입력받고, 원래의 문장을 예측해야 합니다.
+
+### 멀티모달 (multimodal)
+
+텍스트와 이미지와 같은 다른 형태의 입력을 함께 사용하는 작업입니다.
+
+## N
+
+### 자연어 생성 (NLG)
+
+텍스트를 생성하는 모든 작업을 의미합니다. (예: [Write With Transformers](https://transformer.huggingface.co/), 번역 등).
+
+### 자연어 처리 (NLP)
+
+텍스트를 다루는 작업 전반을 지칭하는 일반적인 용어입니다.
+
+### 자연어 이해 (NLU)
+
+텍스트에 담긴 의미를 이해하는 모든 작업을 포함합니다. (예: 전체 문서 분류, 개별 단어 분류 등).
+
+## P
+
+### 파이프라인 (pipeline)
+
+🤗 Transformers에서 파이프라인은 데이터를 전처리하고 변환한 후, 모델을 통해 예측값을 반환하는 일련의 단계를 순차적으로 수행하는 추상화된 개념입니다. 파이프라인에 포함될 수 있는 단계로는 데이터 전처리, 특징 추출(feature extraction), 정규화(normalization) 등이 있습니다.   
+
+자세한 내용은 [Pipelines for inference](https://huggingface.co/docs/transformers/pipeline_tutorial) 문서를 참고하세요.
+
+### 파이프라인 병렬화 (PP)
+
+모델을 수직 방향(레이어 단위)으로 여러 GPU에 분할하여 병렬로 처리하는 병렬화 기법입니다. 각 GPU는 모델의 하나 또는 여러 개의 레이어만을 담당하며, 전체 파이프라인의 서로 다른 단계를 병렬로 처리하게 됩니다. 또한 각 GPU는 배치(batch)의 일부 작은 조각만 처리합니다. Pipeline Parallel 방식에 대해 더 알아보려면 [이 문서](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism)를 참고하세요.
+
+### 픽셀 값 (pixel values)
+
+이미지를 수치상으로 표현한 텐서로, 모델에 입력으로 전달됩니다. 이 텐서는 이미지 프로세서를 통해 생성되면, 값은 [`batch_size`, `num_channels`, `height`, `width`] 형태의 차원을 가집니다.
+
+### 풀링 (pooling)
+
+행렬의 특정 차원에서 최댓값이나 평균값을 취하여 더 작은 행렬로 줄이는 연산입니다. 풀링 계층은 주로 합성곱 계층 사이에 위치하여 특징 표현을 다운샘플링 하는 데 사용됩니다.
+
+### 포지션 ID (position IDs)
+
+RNN 모델과 달리 트랜스포머는 각 토큰의 위치 정보를 내부적으로 가지고 있지 않습니다. 따라서 모델은 `position_ids`를 사용하여 각 토큰이 시퀀스 내에서 어느 위치에 있는지를 인식합니다. 이 값은 선택적인 파라미터입니다. 모델에 `position_ids`를 전달하지 않으면, 절대 위치 임베딩 방식으로 자동 생성됩니다. 절대 위치 임베딩은 `[0, config.max_position_embeddings - 1]` 범위 내에서 선택됩니다. 일부 모델은 사인파 형태의 위치 임베딩(sinusoidal position embeddings) 또는 상대 위치 임베딩(relative position embeddings)과 같은 다른 유형의 위치 임베딩을 사용하기도 합니다.
+
+### 전처리 (preprocessing)
+
+머신러닝 모델이 쉽게 처리할 수 있도록 가공되지 않은 데이터를 정제하는 작업입니다. 예를 들어, 텍스트는 일반적으로 토큰화(tokenization) 과정을 거칩니다. 다른 입력 유형에 대한 전처리 방식이 궁금하다면 [Preprocess](https://huggingface.co/docs/transformers/preprocessing) 튜토리얼을 참고해 보세요.
+
+### 사전 학습된 모델 (pretrained model)
+
+일부 데이터(예: 위키피디아 전체)로 사전 학습(pretraining)된 모델입니다. 사전 학습은 자기 지도 학습(self-supervised learning)의 목표를 포함하며, 예를 들어 문장을 읽고 다음 단어를 예측하거나 ([causal language modeling](#causal-language-modeling)) 참고, 일부 단어를 마스킹하고 이를 예측하는 방식([masked language modeling](#masked-language-modeling-mlm))이 있습니다.
+
+음성 및 비전 모델은 고유의 사전 학습 목표를 가지고 있습니다. 예를 들어, Wav2Vec2는 음성 표현 중 "진짜"를 "가짜" 중에서 구분하는 대조 학습(contrastive learning) 방식으로 사전 학습된 음성 모델입니다. 반면, BEiT는 이미지 패치 중 일부를 마스킹하고 이를 예측하는 마스킹 이미지 모델링 방식으로 사전 학습된 비전 모델입니다. 이는 마스킹 언어 모델링과 유사한 방식입니다.
+
+## R
+
+### 순환 신경망 (RNN)
+
+텍스트와 같은 시퀀스 데이터를 처리하기 위해 레이어에 반복 구조(루프)를 사용하는 신경망 모델의 한 종류입니다.
+
+### 표현학습 (representation learning)
+
+머신러닝의 하위 분야로, 원시 데이터로부터 의미 있는 표현을 학습하는 데 중점을 둡니다. 대표적인 기법으로는 단어 임베딩, 오토인코더(autoencoder), 생성적 적대 신경망(GAN) 등이 있습니다.
+
+## S
+
+### 샘플링 속도 (sampling rate)
+
+샘플링 속도는 1초에 추출하는 (오디오 신호) 샘플의 개수를 헤르츠(Hz) 단위로 나타낸 측정값입니다. 이는 음성처럼 연속적인 신호를 디지털화하여 이산적인 형태로 만드는 결과입니다.
+
+### 셀프 어텐션 (self-attention)
+
+입력의 각 요소가 다른 어떤 요소에 주목해야 하는지를 스스로 판단하는 메커니즘입니다. 이는 모델이 문장에서 특정 단어만을 보는 것이 아니라, 다른 단어들과의 관계를 고려하여 어떤 정보에 더 집중해야 할지를 학습하게 합니다.
+
+### 자기지도 학습 (self-supervised learning) 
+
+레이블이 없는 데이터로부터 모델이 스스로 학습 목표를 정의하여 학습하는 머신러닝 기법의 한 종류입니다. [비지도 학습](#unsupervised-learning)이나 [지도 학습](#supervised-learning)과 달리, 학습 과정 자체는 감독 방식 되지만, 라벨이 명시적으로 주어지는 것은 아닙니다.
+
+예시로는 [마스크 언어 모델링](#masked-language-modeling-mlm)이 있으며, 이는 문장의 일부 토큰을 제거한 상태로 모델에 입력하고, 모델이 해당 토큰을 예측하도록 학습하는 방식입니다.
+
+### 준지도 학습 (semi-supervised learning)
+
+소량의 라벨이 달린 데이터와 대량의 라벨이 없는 데이터를 함께 사용하여 모델의 정확도를 높이는 머신러닝 훈련 기법의 넓은 범주입니다. 이는 [지도 학습](#supervised-learning)이나 [비지도 학습](#unsupervised-learning)과는 다른 방식입니다.
+
+준지도 학습 기법의 예로는 "자기 학습(self-training)"이 있습니다. 이 방식은 먼저 라벨이 있는 데이터로 모델을 학습시키고, 그 모델을 사용해 라벨이 없는 데이터에 대한 예측을 수행합니다. 모델이 가장 높은 확신을 가지고 예측한 라벨이 없는 데이터 일부를 라벨이 있는 데이터로 추가하고, 이를 통해 모델을 다시 학습시킵니다.
+
+### 시퀀스 투 시퀀스 (seq2seq)
+
+입력으로부터 새로운 시퀀스를 생성하는 모델입니다. 예를 들어 번역 모델이나 요약 모델이 이에 해당하며, 대표적인 예로는 [Bart](model_doc/bart)나[T5](model_doc/t5) 모델이 있습니다.
+
+### 분할 DDP (Sharded DDP)
+
+[ZeRO](#zero-redundancy-optimizer-zero) 개념을 기반으로 다양한 구현에서 사용되는 다른 이름으로 불립니다.
+
+### 스트라이드 (stride)
+
+[convolution](#convolution) 또는 [pooling](#pooling)에서 스트라이드(stride)는 커널이 행렬 위를 이동하는 간격을 의미합니다. 스트라이드가 1이면 커널이 한 픽셀씩 이동하고, 2이면 두 픽셀씩 이동합니다.
+
+### 지도학습 (supervised learning)
+
+정답이 포함된 라벨링된 데이터를 직접 사용하여 모델의 성능을 개선하는 학습 방식입니다. 학습 중인 모델에 데이터를 입력하고, 예측 결과를 정답과 비교하여 오차를 계산합니다. 모델은 이 오차를 기반으로 가중치를 업데이트하며, 이러한 과정을 반복하여 성능을 최적화합니다.
+
+## T
+
+### 텐서 병렬화 (TP)
+
+여러 GPU에서 훈련하기 위한 병렬화 기법으로, 각 텐서를 여러 덩어리(chunk)로 나눕니다. 따라서 전체 텐서가 단일 GPU에 상주하는 대신, 텐서의 각 조각(shard)이 지정된 GPU에 상주하게 됩니다. 이 조각들은 각각 다른 GPU에서 개별적으로 병렬 처리되며, 처리 단계가 끝날 때 결과가 동기화됩니다. 이러한 분할이 수평 방향으로 일어나기 때문에, 이는 때때로 수평적 병렬화라고 불립니다. Tensor Parallelism에 대해 더 알아보려면 [여기](perf_train_gpu_many#tensor-parallelism)를 참고하세요.
+
+### 토큰 (token)
+
+일반적인 단어 단위이지만, 때에 따라 서브 워드(자주 사용되지 않는 단어는 서브 워드로 분리됨)나 문장 부호도 포함될 수 있는 문장의 구성 요소입니다.
+
+### 토큰 타입 ID (token type IDs)
+
+일부 모델은 문장 쌍 분류나 질의 응답 작업을 수행하는 데 사용됩니다.
+
+<Youtube id="0u3ioSwev3s"/>
+
+이러한 작업에서는 두 개의 서로 다른 시퀀스를 하나의 "input_ids" 항목으로 결합해야 하며, 일반적으로 `[CLS]` 분류용 및 `[SEP]` 구분용과 같은 특수 토큰을 사용하여 처리합니다. 예를 들어, BERT 모델은 두 개의 시퀀스를 다음과 같은 방식으로 구성합니다:
+
+```python
+>>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]
+```
+
+두 개의 시퀀스를 `tokenizer`에 리스트가 아닌 개별 인자로 전달하면, 토크나이저가 자동으로 이러한 문장을 생성해 줍니다. 예시는 다음과 같습니다:
+
+```python
+>>> from transformers import BertTokenizer
+
+>>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-cased")
+>>> sequence_a = "HuggingFace is based in NYC"
+>>> sequence_b = "Where is HuggingFace based?"
+
+>>> encoded_dict = tokenizer(sequence_a, sequence_b)
+>>> decoded = tokenizer.decode(encoded_dict["input_ids"])
+```
+
+결과는 아래와 같습니다:
+
+```python
+>>> print(decoded)
+[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
+```
+
+이 코드는 일부 모델이 두 개의 시퀀스를 어떻게 구분하는지 이해하는 데 충분합니다. 그러나 BERT와 같은 다른 모델은 토큰 타입 ID(또는 세그먼트 ID)를 추가로 사용합니다. 이 ID는 0과 1로 구성된 이진 마스크로, 두 시퀀스를 구분하는 역할을 합니다.
+
+토크나이저는 이 마스크를 "token_type_id" 항목으로 반환합니다:
+
+```python
+>>> encoded_dict["token_type_ids"]
+[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+```
+
+질문에 사용되는 첫 번째 시퀀스인 "context"는 모든 토큰이 `0`으로 표시됩니다. 반면 두 번째 시퀀스인 "question"은 모든 토큰이 `1`로 표시됩니다.
+
+일부 모델(예: [`XLNetModel`])은 `2`로 표시되는 추가 토큰을 사용하기도 합니다.
+
+### 전이학습 (transfer learning)
+
+사전 학습된(pretrained) 모델을 가져와 특정 작업에 맞는 데이터셋에 대해 추가 학습하는 기술입니다. 모델을 처음부터 학습시키는 대신, 기존 모델이 학습한 지식을 출발점으로 삼아 더욱 빠르게 학습할 수 있습니다. 이를 통해 학습 속도를 높이고 필요한 데이터양도 줄일 수 있습니다.
+
+### 트랜스포머 (transformer)
+
+셀프 어텐션 메커니즘을 기반으로 한 딥러닝 모델 아키텍처입니다.
+
+## U
+
+### 비지도 학습 (unsupervised learning)
+
+정답(레이블)이 포함되지 않은 데이터를 이용해 모델을 학습시키는 방식입니다. 비지도 학습은 데이터 분포의 통계적 특성을 활용해 유용한 패턴을 찾아냅니다.
+
+## Z
+
+### Zero Redundancy Optimizer (ZeRO)
+
+[TensorParallel](#tensor-parallelism-tp)과 유사하게 텐서를 샤딩(sharding)하는 병렬 처리 기법이지만, 순전파(forward)나 역전파(backward) 계산 시점에 전체 텐서를 다시 복원한다는 점에서 차이가 있습니다. 따라서 모델 자체를 수정할 필요가 없습니다. 이 방법은 GPU 메모리가 부족할 경우 이를 보완하기 위한 다양한 오프로딩 (offloading) 기법도 지원합니다. 
+ZeRO에 대해 더 알아보려면 [이 문서](perf_train_gpu_many#zero-data-parallelism)를 참고하세요.
--- a/docs/source/ko/quantization/quark.md
+++ b/docs/source/ko/quantization/quark.md
@ -0,0 +1,85 @@
+<!--Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Quark[[quark]]
+
+[Quark](https://quark.docs.amd.com/latest/)는 특정 데이터 타입, 알고리즘, 하드웨어에 구애받지 않도록 설계된 딥러닝 양자화 툴킷입니다. Quark에서는 다양한 전처리 전략, 알고리즘, 데이터 타입을 조합하여 사용할 수 있습니다.
+
+🤗 Transformers를 통해 통합된 PyTorch 지원은 주로 AMD CPU 및 GPU를 대상으로 하며, 주로 평가 목적으로 사용됩니다. 예를 들어, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)를 🤗 Transformers 백엔드와 함께 사용하여 Quark로 양자화된 다양한 모델을 원활하게 평가할 수 있습니다.
+
+Quark에 관심이 있는 사용자는 [문서](https://quark.docs.amd.com/latest/)를 참고하여 모델 양자화를 시작하고 지원되는 오픈 소스 라이브러리에서 사용할 수 있습니다!
+
+Quark는 자체 체크포인트/[설정 포맷](https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test/blob/main/config.json#L26)를 가지고 있지만, 다른 양자화/런타임 구현체 ([AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq), [네이티브 fp8](https://huggingface.co/docs/transformers/quantization/finegrained_fp8))와 호환되는 직렬화 레이아웃으로 모델을 생성하는 것도 지원합니다.
+
+Transformer에서 Quark 양자화 모델을 로드하려면 먼저 라이브러리를 설치해야 합니다:
+
+```bash
+pip install amd-quark
+```
+
+## 지원 매트릭스[[Support matrix]]
+
+Quark를 통해 양자화된 모델은 함께 조합할 수 있는 광범위한 기능을 지원합니다. 구성에 관계없이 모든 양자화된 모델은 `PretrainedModel.from_pretrained`를 통해 원활하게 다시 로드할 수 있습니다.
+
+아래 표는 Quark에서 지원하는 몇 가지 기능을 보여줍니다:
+
+| **기능**                        | **Quark에서 지원하는 항목**                                                                             |   |
+|---------------------------------|-----------------------------------------------------------------------------------------------------------|---|
+| 데이터 타입                     | int8, int4, int2, bfloat16, float16, fp8_e5m2, fp8_e4m3, fp6_e3m2, fp6_e2m3, fp4, OCP MX, MX6, MX9, bfp16 |   |
+| 양자화 전 모델 변환 | SmoothQuant, QuaRot, SpinQuant, AWQ                                                                       |   |
+| 양자화 알고리즘                 | GPTQ                                                                                                      |   |
+| 지원 연산자                     | ``nn.Linear``, ``nn.Conv2d``, ``nn.ConvTranspose2d``, ``nn.Embedding``, ``nn.EmbeddingBag``               |   |
+| 세분성(Granularity)             | per-tensor, per-channel, per-block, per-layer, per-layer type                                             |   |
+| KV 캐시                         | fp8                                                                                                       |   |
+| 활성화 캘리브레이션             | MinMax / Percentile / MSE                                                                                 |   |
+| 양자화 전략                     | weight-only, static, dynamic, with or without output quantization                                         |   |
+
+## Hugging Face Hub의 모델[[Models on Hugging Face Hub]]
+
+Quark 네이티브 직렬화를 사용하는 공개 모델은 https://huggingface.co/models?other=quark 에서 찾을 수 있습니다.
+
+Quark는 [`quant_method="fp8"`을 이용하는 모델](https://huggingface.co/models?other=fp8)과 [`quant_method="awq"`을 사용하는 모델](https://huggingface.co/models?other=awq)도 지원하지만, Transformers는 이러한 모델을 [AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq)를 통해 불러오거나 
+[🤗 Transformers의 네이티브 fp8 지원](https://huggingface.co/docs/transformers/quantization/finegrained_fp8)을 사용합니다.
+
+## Transformers에서 Quark모델 사용하기[[Using Quark models in Transformers]]
+
+다음은 Transformers에서 Quark 모델을 불러오는 방법의 예시입니다:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "EmbeddedLLM/Llama-3.1-8B-Instruct-w_fp8_per_channel_sym"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+model = model.to("cuda")
+
+print(model.model.layers[0].self_attn.q_proj)
+# QParamsLinear(
+#   (weight_quantizer): ScaledRealQuantizer()
+#   (input_quantizer): ScaledRealQuantizer()
+#   (output_quantizer): ScaledRealQuantizer()
+# )
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+inp = tokenizer("Where is a good place to cycle around Tokyo?", return_tensors="pt")
+inp = inp.to("cuda")
+
+res = model.generate(**inp, min_new_tokens=50, max_new_tokens=100)
+
+print(tokenizer.batch_decode(res)[0])
+# <|begin_of_text|>Where is a good place to cycle around Tokyo? There are several places in Tokyo that are suitable for cycling, depending on your skill level and interests. Here are a few suggestions:
+# 1. Yoyogi Park: This park is a popular spot for cycling and has a wide, flat path that's perfect for beginners. You can also visit the Meiji Shrine, a famous Shinto shrine located in the park.
+# 2. Imperial Palace East Garden: This beautiful garden has a large, flat path that's perfect for cycling. You can also visit the
+```
--- a/examples/modular-transformers/configuration_duplicated_method.py
+++ b/examples/modular-transformers/configuration_duplicated_method.py
@ -0,0 +1,216 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from examples/modular-transformers/modular_duplicated_method.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_duplicated_method.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class DuplicatedMethodConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DuplicatedMethodModel`]. It is used to instantiate an DuplicatedMethod
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DuplicatedMethod-7B.
+    e.g. [meta-duplicated_method/DuplicatedMethod-2-7b-hf](https://huggingface.co/meta-duplicated_method/DuplicatedMethod-2-7b-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the DuplicatedMethod model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DuplicatedMethodModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. DuplicatedMethod 1 supports up to 2048 tokens,
+            DuplicatedMethod 2 up to 4096, CodeLlama up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
+            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
+            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'duplicated_method3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'duplicated_method3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'duplicated_method3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'duplicated_method3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
+
+    ```python
+    >>> from transformers import DuplicatedMethodModel, DuplicatedMethodConfig
+
+    >>> # Initializing a DuplicatedMethod duplicated_method-7b style configuration
+    >>> configuration = DuplicatedMethodConfig()
+
+    >>> # Initializing a model from the duplicated_method-7b style configuration
+    >>> model = DuplicatedMethodModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "duplicated_method"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `DuplicatedMethodModel`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        mlp_bias=False,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.mlp_bias = mlp_bias
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return 45
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.vocab_size = value
--- a/examples/modular-transformers/configuration_my_new_model.py
+++ b/examples/modular-transformers/configuration_my_new_model.py
@ -125,8 +125,6 @@ class MyNewModelConfig(PretrainedConfig):
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
-        new_param (`int`, *optional*, defaults to `False`):
-            A fun new parameter
    """

    model_type = "my_new_model"
--- a/examples/modular-transformers/modeling_dummy.py
+++ b/examples/modular-transformers/modeling_dummy.py
@ -1,446 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from examples/modular-transformers/modular_dummy.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_dummy.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional
-
-import torch
-from torch import nn
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
-from ...integrations import use_kernel_forward_from_hub
-from ...masking_utils import create_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast
-from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple, logging
-from .configuration_dummy import DummyConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-@use_kernel_forward_from_hub("RMSNorm")
-class DummyRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        DummyRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-    def extra_repr(self):
-        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-
-
-class DummyRotaryEmbedding(nn.Module):
-    def __init__(self, config: DummyConfig, device=None):
-        super().__init__()
-        # BC: "rope_type" was originally "type"
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
-            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            self.rope_type = "default"
-        self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len = config.max_position_embeddings
-
-        self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
-
-    @torch.no_grad()
-    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-    def forward(self, x, position_ids):
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
-        position_ids_expanded = position_ids[:, None, :].float()
-
-        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos() * self.attention_scaling
-            sin = emb.sin() * self.attention_scaling
-
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-class DummyMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 4]
-    x2 = x[..., x.shape[-1] // 4 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs,
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-        attn_weights = attn_weights + causal_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-class DummyAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: DummyConfig, layer_idx: int):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
-        self.scaling = self.head_dim**-0.5
-        self.attention_dropout = config.attention_dropout
-        self.is_causal = True
-
-        self.q_proj = nn.Linear(
-            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.k_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.v_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.o_proj = nn.Linear(
-            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: Optional[torch.Tensor],
-        past_key_value: Optional[Cache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-
-        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-
-        cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-        if past_key_value is not None:
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            scaling=self.scaling,
-            **kwargs,
-        )
-
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
-
-
-class DummyDecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: DummyConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-
-        self.self_attn = DummyAttention(config=config, layer_idx=layer_idx)
-
-        self.mlp = DummyMLP(config)
-        self.input_layernorm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        return outputs
-
-
-@auto_docstring
-class DummyPreTrainedModel(PreTrainedModel):
-    config_class = DummyConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["DummyDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-    _supports_cache_class = True
-    _supports_quantized_cache = True
-    _supports_static_cache = True
-    _supports_attention_backend = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, DummyRMSNorm):
-            module.weight.data.fill_(1.0)
-
-
-@auto_docstring
-class DummyModel(DummyPreTrainedModel):
-    def __init__(self, config: DummyConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList(
-            [DummyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-        )
-        self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb = DummyRotaryEmbedding(config=config)
-        self.gradient_checkpointing = False
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
-    ) -> BaseModelOutputWithPast:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
-        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
-        if not isinstance(past_key_values, (type(None), Cache)):
-            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache()
-
-        if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-            )
-
-        if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
-
-        causal_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=past_key_values,
-        )
-
-        hidden_states = inputs_embeds
-
-        # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-                **flash_attn_kwargs,
-            )
-
-            hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
--- a/examples/modular-transformers/modeling_global_indexing.py
+++ b/examples/modular-transformers/modeling_global_indexing.py
@ -0,0 +1,169 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from examples/modular-transformers/modular_global_indexing.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_global_indexing.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+from typing import Callable, Optional
+
+import torch
+from torch import nn
+
+from transformers.modeling_utils import AttentionInterface
+
+from ...cache_utils import Cache
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs
+from .configuration_global_indexing import GlobalIndexingConfig
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+def custom_flex(x, **kwargs):
+    """Dummy function."""
+    return x
+
+
+ALL_ATTENTION_FUNCTIONS = AttentionInterface()
+# This indexing statement and associated function should be exported correctly!
+ALL_ATTENTION_FUNCTIONS["flex_attention"] = custom_flex
+
+
+class GlobalIndexingAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: GlobalIndexingConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_value: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_value is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
--- a/examples/modular-transformers/modeling_multimodal1.py
+++ b/examples/modular-transformers/modeling_multimodal1.py
@ -1,446 +0,0 @@
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-#           This file was automatically generated from examples/modular-transformers/modular_multimodal1.py.
-#               Do NOT edit this file manually as any edits will be overwritten by the generation of
-#             the file from the modular. If any change should be done, please apply the change to the
-#                          modular_multimodal1.py file directly. One of our CI enforces this.
-#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
-from typing import Callable, Optional
-
-import torch
-from torch import nn
-
-from ...activations import ACT2FN
-from ...cache_utils import Cache, DynamicCache
-from ...integrations import use_kernel_forward_from_hub
-from ...masking_utils import create_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
-from ...modeling_layers import GradientCheckpointingLayer
-from ...modeling_outputs import BaseModelOutputWithPast
-from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
-from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple, logging
-from .configuration_multimodal1 import Multimodal1TextConfig
-
-
-logger = logging.get_logger(__name__)
-
-
-@use_kernel_forward_from_hub("RMSNorm")
-class Multimodal1TextRMSNorm(nn.Module):
-    def __init__(self, hidden_size, eps=1e-6):
-        """
-        Multimodal1TextRMSNorm is equivalent to T5LayerNorm
-        """
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(hidden_size))
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-    def extra_repr(self):
-        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
-
-
-class Multimodal1TextRotaryEmbedding(nn.Module):
-    def __init__(self, config: Multimodal1TextConfig, device=None):
-        super().__init__()
-        # BC: "rope_type" was originally "type"
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
-            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
-        else:
-            self.rope_type = "default"
-        self.max_seq_len_cached = config.max_position_embeddings
-        self.original_max_seq_len = config.max_position_embeddings
-
-        self.config = config
-        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
-
-        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
-
-    @torch.no_grad()
-    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
-    def forward(self, x, position_ids):
-        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
-        position_ids_expanded = position_ids[:, None, :].float()
-
-        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
-        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
-            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
-            emb = torch.cat((freqs, freqs), dim=-1)
-            cos = emb.cos() * self.attention_scaling
-            sin = emb.sin() * self.attention_scaling
-
-        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
-
-
-class Multimodal1TextMLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.config = config
-        self.hidden_size = config.hidden_size
-        self.intermediate_size = config.intermediate_size
-        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
-        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
-        self.act_fn = ACT2FN[config.hidden_act]
-
-    def forward(self, x):
-        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
-        return down_proj
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
-    """Applies Rotary Position Embedding to the query and key tensors.
-
-    Args:
-        q (`torch.Tensor`): The query tensor.
-        k (`torch.Tensor`): The key tensor.
-        cos (`torch.Tensor`): The cosine part of the rotary embedding.
-        sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor`, *optional*):
-            Deprecated and unused.
-        unsqueeze_dim (`int`, *optional*, defaults to 1):
-            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
-            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
-            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
-            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
-            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
-            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
-    Returns:
-        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
-    """
-    cos = cos.unsqueeze(unsqueeze_dim)
-    sin = sin.unsqueeze(unsqueeze_dim)
-    q_embed = (q * cos) + (rotate_half(q) * sin)
-    k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
-
-
-def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
-    """
-    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
-    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
-    """
-    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
-    if n_rep == 1:
-        return hidden_states
-    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
-    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
-
-
-def eager_attention_forward(
-    module: nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    scaling: float,
-    dropout: float = 0.0,
-    **kwargs,
-):
-    key_states = repeat_kv(key, module.num_key_value_groups)
-    value_states = repeat_kv(value, module.num_key_value_groups)
-
-    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
-    if attention_mask is not None:
-        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
-        attn_weights = attn_weights + causal_mask
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, attn_weights
-
-
-class Multimodal1TextAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper"""
-
-    def __init__(self, config: Multimodal1TextConfig, layer_idx: int):
-        super().__init__()
-        self.config = config
-        self.layer_idx = layer_idx
-        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
-        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
-        self.scaling = self.head_dim**-0.5
-        self.attention_dropout = config.attention_dropout
-        self.is_causal = True
-
-        self.q_proj = nn.Linear(
-            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.k_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.v_proj = nn.Linear(
-            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
-        )
-        self.o_proj = nn.Linear(
-            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
-        )
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        position_embeddings: tuple[torch.Tensor, torch.Tensor],
-        attention_mask: Optional[torch.Tensor],
-        past_key_value: Optional[Cache] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
-        input_shape = hidden_states.shape[:-1]
-        hidden_shape = (*input_shape, -1, self.head_dim)
-
-        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-
-        cos, sin = position_embeddings
-        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-        if past_key_value is not None:
-            # sin and cos are specific to RoPE models; cache_position needed for the static cache
-            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
-            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-
-        attention_interface: Callable = eager_attention_forward
-        if self.config._attn_implementation != "eager":
-            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
-
-        attn_output, attn_weights = attention_interface(
-            self,
-            query_states,
-            key_states,
-            value_states,
-            attention_mask,
-            dropout=0.0 if not self.training else self.attention_dropout,
-            scaling=self.scaling,
-            **kwargs,
-        )
-
-        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
-        attn_output = self.o_proj(attn_output)
-        return attn_output, attn_weights
-
-
-class Multimodal1TextDecoderLayer(GradientCheckpointingLayer):
-    def __init__(self, config: Multimodal1TextConfig, layer_idx: int):
-        super().__init__()
-        self.hidden_size = config.hidden_size
-
-        self.self_attn = Multimodal1TextAttention(config=config, layer_idx=layer_idx)
-
-        self.mlp = Multimodal1TextMLP(config)
-        self.input_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Cache] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = False,
-        cache_position: Optional[torch.LongTensor] = None,
-        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
-        residual = hidden_states
-        hidden_states = self.input_layernorm(hidden_states)
-
-        # Self Attention
-        hidden_states, self_attn_weights = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_value=past_key_value,
-            output_attentions=output_attentions,
-            use_cache=use_cache,
-            cache_position=cache_position,
-            position_embeddings=position_embeddings,
-            **kwargs,
-        )
-        hidden_states = residual + hidden_states
-
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.post_attention_layernorm(hidden_states)
-        hidden_states = self.mlp(hidden_states)
-        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        return outputs
-
-
-@auto_docstring
-class Multimodal1TextPreTrainedModel(PreTrainedModel):
-    config_class = Multimodal1TextConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Multimodal1TextDecoderLayer"]
-    _skip_keys_device_placement = ["past_key_values"]
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_flex_attn = True
-    _supports_cache_class = True
-    _supports_quantized_cache = True
-    _supports_static_cache = True
-    _supports_attention_backend = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, Multimodal1TextRMSNorm):
-            module.weight.data.fill_(1.0)
-
-
-@auto_docstring
-class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
-    def __init__(self, config: Multimodal1TextConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
-        self.vocab_size = config.vocab_size
-
-        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList(
-            [Multimodal1TextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
-        )
-        self.norm = Multimodal1TextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.rotary_emb = Multimodal1TextRotaryEmbedding(config=config)
-        self.gradient_checkpointing = False
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self):
-        return self.embed_tokens
-
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-
-    @can_return_tuple
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.LongTensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[Cache] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        cache_position: Optional[torch.LongTensor] = None,
-        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
-    ) -> BaseModelOutputWithPast:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
-        if (input_ids is None) ^ (inputs_embeds is not None):
-            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
-        # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
-        if not isinstance(past_key_values, (type(None), Cache)):
-            raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.")
-
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-
-        if use_cache and past_key_values is None:
-            past_key_values = DynamicCache()
-
-        if cache_position is None:
-            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
-            cache_position = torch.arange(
-                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
-            )
-
-        if position_ids is None:
-            position_ids = cache_position.unsqueeze(0)
-
-        causal_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=inputs_embeds,
-            attention_mask=attention_mask,
-            cache_position=cache_position,
-            past_key_values=past_key_values,
-        )
-
-        hidden_states = inputs_embeds
-
-        # create position embeddings to be shared across the decoder layers
-        position_embeddings = self.rotary_emb(hidden_states, position_ids)
-
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
-        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask,
-                position_ids=position_ids,
-                past_key_value=past_key_values,
-                output_attentions=output_attentions,
-                use_cache=use_cache,
-                cache_position=cache_position,
-                position_embeddings=position_embeddings,
-                **flash_attn_kwargs,
-            )
-
-            hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
-        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
-        return BaseModelOutputWithPast(
-            last_hidden_state=hidden_states,
-            past_key_values=past_key_values if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-        )
--- a/examples/modular-transformers/modeling_multimodal2.py
+++ b/examples/modular-transformers/modeling_multimodal2.py
@ -289,7 +289,6 @@ class Multimodal2VisionEncoder(nn.Module):
        self.layers = nn.ModuleList([Multimodal2VisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

-    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
@ -455,7 +454,6 @@ class Multimodal2VisionTransformer(nn.Module):
        self.encoder = Multimodal2VisionEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

-    @can_return_tuple
    @auto_docstring
    def forward(
        self,
@ -500,6 +498,7 @@ class Multimodal2VisionPreTrainedModel(PreTrainedModel):
    supports_gradient_checkpointing = True
    _supports_sdpa = True
    _supports_flash_attn_2 = True
+    _supports_flash_attn_3 = True
    _supports_flex_attn = True
    _supports_attention_backend = True

--- a/examples/modular-transformers/modeling_my_new_model2.py
+++ b/examples/modular-transformers/modeling_my_new_model2.py
@ -12,13 +12,13 @@ from torch import nn
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...masking_utils import create_causal_mask
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple, logging
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.generic import check_model_inputs
 from .configuration_my_new_model2 import MyNewModel2Config


@ -65,7 +65,7 @@ class MyNewModel2RotaryEmbedding(nn.Module):
    def __init__(self, config: MyNewModel2Config, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
@ -149,7 +149,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs,
+    **kwargs: Unpack[TransformersKwargs],
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -200,8 +200,8 @@ class MyNewModel2Attention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

@ -254,22 +254,19 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
-        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
-
        # Self Attention
-        hidden_states, self_attn_weights = self.self_attn(
+        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
-            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
@ -282,12 +279,7 @@ class MyNewModel2DecoderLayer(GradientCheckpointingLayer):
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        return outputs
+        return hidden_states


@auto_docstring
@ -298,12 +290,17 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
    _no_split_modules = ["MyNewModel2DecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
+    _supports_flash_attn_3 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": MyNewModel2DecoderLayer,
+        "attentions": MyNewModel2Attention,
+    }

    def _init_weights(self, module):
        std = self.config.initializer_range
@ -343,7 +340,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

-    @can_return_tuple
+    @check_model_inputs
    @auto_docstring
    def forward(
        self,
@ -353,26 +350,12 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
+        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

-        if self.gradient_checkpointing and self.training and use_cache:
-            logger.warning_once(
-                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
-            )
-            use_cache = False
-
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

@ -394,6 +377,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
+            position_ids=position_ids,
        )

        # embed positions
@ -408,42 +392,21 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-
        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-
-            layer_outputs = decoder_layer(
+            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
-                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )
-
-            hidden_states = layer_outputs[0]
-
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-
        hidden_states = self.norm(hidden_states)
-
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
        )


@ -488,8 +451,7 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@ -505,8 +467,7 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
+            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@ -96,6 +96,7 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_flash_attn_2 = True
+    _supports_flash_attn_3 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True
@ -118,6 +119,8 @@ class NewTaskModelPreTrainedModel(PreTrainedModel):
 )
 class NewTaskModelModel(NewTaskModelPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False

    def __init__(self, config: NewTaskModelConfig):
        super().__init__(config)
@ -313,9 +316,11 @@ class NewTaskModelModel(NewTaskModelPreTrainedModel):
                special_image_mask = inputs_embeds == self.get_input_embeddings()(
                    torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
                )
+                special_image_mask = special_image_mask.all(-1)
            else:
-                special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1)
-                special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device)
+                special_image_mask = input_ids == self.config.image_token_id
+
+            special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)

            if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel():
                image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
@ -433,32 +438,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
        num_logits_to_keep: int = 0,
    ) -> Union[tuple, NewTaskModelCausalLMOutputWithPast]:
        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
-
-        Example:
-
-        ```python
-        >>> from PIL import Image
-        >>> import requests
-        >>> from transformers import AutoProcessor, NewTaskModelForNewTask
-
-        >>> model = NewTaskModelForNewTask.from_pretrained("google/new_task_model2-3b-mix-224")
-        >>> processor = AutoProcessor.from_pretrained("google/new_task_model2-3b-mix-224")
-
-        >>> prompt = "Where is the cat standing?"
-        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
-        >>> image = Image.open(requests.get(url, stream=True).raw)
-
-        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
-
-        >>> # Generate
-        >>> generate_ids = model.generate(**inputs,)
-        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        "Where is the cat standing?\nsnow"
-        ```
        Returns:
        """
        vlm_outputs = super().forward(
--- a/examples/modular-transformers/modeling_super.py
+++ b/examples/modular-transformers/modeling_super.py
@ -14,12 +14,12 @@ from transformers.modeling_outputs import CausalLMOutputWithPast
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...integrations import use_kernel_forward_from_hub
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring
+from ...utils.generic import check_model_inputs
 from .configuration_super import SuperConfig


@ -48,7 +48,7 @@ class SuperRotaryEmbedding(nn.Module):
    def __init__(self, config: SuperConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
-        if hasattr(config, "rope_scaling") and config.rope_scaling is not None:
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
@ -148,7 +148,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs,
+    **kwargs: Unpack[TransformersKwargs],
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -199,8 +199,8 @@ class SuperAttention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

@ -253,22 +253,19 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
-        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
-
        # Self Attention
-        hidden_states, self_attn_weights = self.self_attn(
+        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
-            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
@ -281,12 +278,7 @@ class SuperDecoderLayer(GradientCheckpointingLayer):
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states
-
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights,)
-
-        return outputs
+        return hidden_states


@auto_docstring
@ -297,12 +289,17 @@ class SuperPreTrainedModel(PreTrainedModel):
    _no_split_modules = ["SuperDecoderLayer"]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn_2 = True
+    _supports_flash_attn_3 = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_quantized_cache = True
    _supports_static_cache = True
    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": SuperDecoderLayer,
+        "attentions": SuperAttention,
+    }

    def _init_weights(self, module):
        std = self.config.initializer_range
@ -342,7 +339,7 @@ class SuperModel(SuperPreTrainedModel):
    def set_input_embeddings(self, value):
        self.embed_tokens = value

-    @can_return_tuple
+    @check_model_inputs
    @auto_docstring
    def forward(
        self,
--- a/examples/modular-transformers/modeling_switch_function.py
+++ b/examples/modular-transformers/modeling_switch_function.py
@ -11,9 +11,9 @@ import torch
 from torch import nn

 from ...cache_utils import Cache
-from ...modeling_flash_attention_utils import FlashAttentionKwargs
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
 from ...processing_utils import Unpack
+from ...utils import TransformersKwargs
 from .configuration_switch_function import SwitchFunctionConfig


@ -72,7 +72,7 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs,
+    **kwargs: Unpack[TransformersKwargs],
 ):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)
@ -123,8 +123,8 @@ class SwitchFunctionAttention(nn.Module):
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
-        **kwargs: Unpack[FlashAttentionKwargs],
-    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

--- a/examples/modular-transformers/modular_dummy.py
+++ b/examples/modular-transformers/modular_dummy.py
@ -1,15 +0,0 @@
-import torch
-
-from transformers.models.llama.modeling_llama import LlamaModel
-
-
-def rotate_half(x):
-    """Rotates half the hidden dims of the input."""
-    x1 = x[..., : x.shape[-1] // 4]
-    x2 = x[..., x.shape[-1] // 4 :]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-# example where we need some deps and some functions
-class DummyModel(LlamaModel):
-    pass
--- a/examples/modular-transformers/modular_duplicated_method.py
+++ b/examples/modular-transformers/modular_duplicated_method.py
@ -0,0 +1,11 @@
+from transformers.models.llama.configuration_llama import LlamaConfig
+
+
+class DuplicatedMethodConfig(LlamaConfig):
+    @property
+    def vocab_size(self):
+        return 45
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.vocab_size = value
--- a/examples/modular-transformers/modular_global_indexing.py
+++ b/examples/modular-transformers/modular_global_indexing.py
@ -0,0 +1,16 @@
+from transformers.modeling_utils import AttentionInterface
+from transformers.models.llama.modeling_llama import LlamaAttention
+
+
+def custom_flex(x, **kwargs):
+    """Dummy function."""
+    return x
+
+
+ALL_ATTENTION_FUNCTIONS = AttentionInterface()
+# This indexing statement and associated function should be exported correctly!
+ALL_ATTENTION_FUNCTIONS["flex_attention"] = custom_flex
+
+
+class GlobalIndexingAttention(LlamaAttention):
+    pass
--- a/examples/modular-transformers/modular_multimodal1.py
+++ b/examples/modular-transformers/modular_multimodal1.py
@ -1,6 +0,0 @@
-from transformers.models.llama.modeling_llama import LlamaModel
-
-
-# Check that we can correctly change the prefix (here add Text part at the end of the name)
-class Multimodal1TextModel(LlamaModel):
-    pass
--- a/examples/modular-transformers/modular_my_new_model.py
+++ b/examples/modular-transformers/modular_my_new_model.py
@ -2,11 +2,122 @@ from transformers.models.llama.configuration_llama import LlamaConfig


 # Example where we only want to only add a new config argument and new arg doc
-# here there is no `ARG` so we are gonna take parent doc
 class MyNewModelConfig(LlamaConfig):
    r"""
-    new_param (`int`, *optional*, defaults to `False`):
-        A fun new parameter
+    This is the configuration class to store the configuration of a [`MyNewModelModel`]. It is used to instantiate an MyNewModel
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the MyNewModel-7B.
+    e.g. [meta-my_new_model/MyNewModel-2-7b-hf](https://huggingface.co/meta-my_new_model/MyNewModel-2-7b-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the MyNewModel model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MyNewModelModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. MyNewModel 1 supports up to 2048 tokens,
+            MyNewModel 2 up to 4096, CodeLlama up to 16384.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
+            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
+            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'my_new_model3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'my_new_model3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'my_new_model3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'my_new_model3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        mlp_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
+
+    ```python
+    >>> from transformers import MyNewModelModel, MyNewModelConfig
+
+    >>> # Initializing a MyNewModel my_new_model-7b style configuration
+    >>> configuration = MyNewModelConfig()
+
+    >>> # Initializing a model from the my_new_model-7b style configuration
+    >>> model = MyNewModelModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
    """

    def __init__(self, mlp_bias=True, new_param=0, **super_kwargs):
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@ -65,7 +65,7 @@ examples/pytorch/token-classification/run_ner.py \

 Most example scripts should have the first two command line arguments and some have the third one. You can quickly check if a given example supports any of these by passing a `-h` option, e.g.:
 ```bash
-examples/pytorch/token-classification/run_ner.py -h
+token-classification/run_ner.py -h
 ```

 ## Resuming training
@ -110,7 +110,7 @@ classification MNLI task using the `run_glue` script, with 8 GPUs:

 ```bash
 torchrun \
-    --nproc_per_node 8 pytorch/text-classification/run_glue.py \
+    --nproc_per_node 8 text-classification/run_glue.py \
    --model_name_or_path google-bert/bert-large-uncased-whole-word-masking \
    --task_name mnli \
    --do_train \
--- a/examples/pytorch/_tests_requirements.txt
+++ b/examples/pytorch/_tests_requirements.txt
@ -22,6 +22,7 @@ protobuf
 torch
 torchvision
 torchaudio
+torchcodec
 jiwer
 librosa
 evaluate >= 0.2.0
--- a/examples/pytorch/contrastive-image-text/README.md
+++ b/examples/pytorch/contrastive-image-text/README.md
@ -84,7 +84,7 @@ loaded using the pre-trained weights.
 Finally, we can run the example script to train the model:

 ```bash
-python examples/pytorch/contrastive-image-text/run_clip.py \
+python run_clip.py \
    --output_dir ./clip-roberta-finetuned \
    --model_name_or_path ./clip-roberta \
    --data_dir $PWD/data \
--- a/examples/pytorch/multiple-choice/README.md
+++ b/examples/pytorch/multiple-choice/README.md
@ -21,7 +21,7 @@ limitations under the License.
 `run_swag` allows you to fine-tune any model from our [hub](https://huggingface.co/models) (as long as its architecture as a `ForMultipleChoice` version in the library) on the SWAG dataset or your own csv/jsonlines files as long as they are structured the same way. To make it works on another dataset, you will need to tweak the `preprocess_function` inside the script.

 ```bash
-python examples/pytorch/multiple-choice/run_swag.py \
+python run_swag.py \
 --model_name_or_path FacebookAI/roberta-base \
 --do_train \
 --do_eval \
--- a/examples/pytorch/object-detection/requirements.txt
+++ b/examples/pytorch/object-detection/requirements.txt
@ -1,5 +1,5 @@
 albumentations >= 1.4.16
 timm
-datasets
+datasets>=4.0
 torchmetrics
 pycocotools
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@ -399,6 +399,9 @@ def main():
        dataset["validation"] = split["test"]

    # Get dataset categories and prepare mappings for label_name <-> label_id
+    if isinstance(dataset["train"].features["objects"], dict):
+        categories = dataset["train"].features["objects"]["category"].feature.names
+    else:  # (for old versions of `datasets` that used Sequence({...}) of the objects)
        categories = dataset["train"].features["objects"].feature["category"].names
    id2label = dict(enumerate(categories))
    label2id = {v: k for k, v in id2label.items()}
--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@ -460,6 +460,9 @@ def main():
        dataset["validation"] = split["test"]

    # Get dataset categories and prepare mappings for label_name <-> label_id
+    if isinstance(dataset["train"].features["objects"], dict):
+        categories = dataset["train"].features["objects"]["category"].feature.names
+    else:  # (for old versions of `datasets` that used Sequence({...}) of the objects)
        categories = dataset["train"].features["objects"].feature["category"].names
    id2label = dict(enumerate(categories))
    label2id = {v: k for k, v in id2label.items()}
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@ -324,13 +324,12 @@ def main():
        args.model_name_or_path, id2label=id2label, label2id=label2id, trust_remote_code=args.trust_remote_code
    )
    image_processor = AutoImageProcessor.from_pretrained(
-        args.model_name_or_path, trust_remote_code=args.trust_remote_code
+        args.model_name_or_path, trust_remote_code=args.trust_remote_code, do_reduce_labels=args.do_reduce_labels
    )
    model = AutoModelForSemanticSegmentation.from_pretrained(
        args.model_name_or_path,
        config=config,
        trust_remote_code=args.trust_remote_code,
-        do_reduce_labels=args.do_reduce_labels,
    )

    # Define transforms to be applied to each image and target.
--- a/examples/pytorch/summarization/README.md
+++ b/examples/pytorch/summarization/README.md
@ -40,7 +40,7 @@ and you also will find examples of these below.

 Here is an example on a summarization task:
 ```bash
-python examples/pytorch/summarization/run_summarization.py \
+python run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
@ -64,7 +64,7 @@ And here is how you would use it on your own files, after adjusting the values f
 `--train_file`, `--validation_file`, `--text_column` and `--summary_column` to match your setup:

 ```bash
-python examples/pytorch/summarization/run_summarization.py \
+python run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
--- a/examples/pytorch/translation/README.md
+++ b/examples/pytorch/translation/README.md
@ -42,7 +42,7 @@ and you also will find examples of these below.
 Here is an example of a translation fine-tuning with a MarianMT model:

 ```bash
-python examples/pytorch/translation/run_translation.py \
+python run_translation.py \
    --model_name_or_path Helsinki-NLP/opus-mt-en-ro \
    --do_train \
    --do_eval \
@ -62,7 +62,7 @@ MBart and some T5 models require special handling.
 T5 models `google-t5/t5-small`, `google-t5/t5-base`, `google-t5/t5-large`, `google-t5/t5-3b` and `google-t5/t5-11b` must use an additional argument: `--source_prefix "translate {source_lang} to {target_lang}"`. For example:

 ```bash
-python examples/pytorch/translation/run_translation.py \
+python run_translation.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
@ -85,7 +85,7 @@ For the aforementioned group of T5 models it's important to remember that if you
 MBart models require a different format for `--source_lang` and `--target_lang` values, e.g. instead of `en` it expects `en_XX`, for `ro` it expects `ro_RO`. The full MBart specification for language codes can be found [here](https://huggingface.co/facebook/mbart-large-cc25). For example:

 ```bash
-python examples/pytorch/translation/run_translation.py \
+python run_translation.py \
    --model_name_or_path facebook/mbart-large-en-ro  \
    --do_train \
    --do_eval \
@ -104,7 +104,7 @@ And here is how you would use the translation finetuning on your own files, afte
 values for the arguments `--train_file`, `--validation_file` to match your setup:

 ```bash
-python examples/pytorch/translation/run_translation.py \
+python run_translation.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
@ -133,7 +133,7 @@ Here the languages are Romanian (`ro`) and English (`en`).
 If you want to use a pre-processed dataset that leads to high BLEU scores, but for the `en-de` language pair, you can use `--dataset_name stas/wmt14-en-de-pre-processed`, as following:

 ```bash
-python examples/pytorch/translation/run_translation.py \
+python run_translation.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
--- a/setup.py
+++ b/setup.py
@ -52,7 +52,7 @@ To create the package for pypi.
   twine upload dist/* -r testpypi --repository-url=https://test.pypi.org/legacy/

   Check that you can install it in a virtualenv by running:
-   pip install -i https://testpypi.python.org/pypi transformers
+   pip install -i https://test.pypi.org/simple/ transformers

   Check you can run the following commands:
   python -c "from transformers import pipeline; classifier = pipeline('text-classification'); print(classifier('What a nice release'))"
@ -204,6 +204,7 @@ _deps = [
    "opentelemetry-api",
    "opentelemetry-exporter-otlp",
    "opentelemetry-sdk",
+    "mistral-common[opencv]>=1.6.3",
 ]


@ -313,7 +314,7 @@ extras["hub-kernels"] = deps_list("kernels")

 extras["integrations"] = extras["hub-kernels"] + extras["optuna"] + extras["ray"] + extras["sigopt"]

-extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette")
+extras["serving"] = deps_list("pydantic", "uvicorn", "fastapi", "starlette") + extras["torch"]
 extras["audio"] = deps_list(
    "librosa",
    "pyctcdecode",
@ -334,6 +335,7 @@ extras["video"] = deps_list("av")
 extras["num2words"] = deps_list("num2words")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["tiktoken"] = deps_list("tiktoken", "blobfile")
+extras["mistral-common"] = deps_list("mistral-common[opencv]")
 extras["testing"] = (
    deps_list(
        "pytest",
@ -363,6 +365,7 @@ extras["testing"] = (
    )
    + extras["retrieval"]
    + extras["modelcreation"]
+    + extras["mistral-common"]
 )

 extras["deepspeed-testing"] = extras["deepspeed"] + extras["testing"] + extras["optuna"] + extras["sentencepiece"]
@ -384,6 +387,7 @@ extras["all"] = (
    + extras["accelerate"]
    + extras["video"]
    + extras["num2words"]
+    + extras["mistral-common"]
 )


--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -34,6 +34,7 @@ from .utils import (
    is_g2p_en_available,
    is_keras_nlp_available,
    is_librosa_available,
+    is_mistral_common_available,
    is_pretty_midi_available,
    is_scipy_available,
    is_sentencepiece_available,
@ -310,6 +311,18 @@ else:
        "convert_slow_tokenizer",
    ]

+try:
+    if not (is_mistral_common_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from .utils import dummy_mistral_common_objects
+
+    _import_structure["utils.dummy_mistral_common_objects"] = [
+        name for name in dir(dummy_mistral_common_objects) if not name.startswith("_")
+    ]
+else:
+    _import_structure["tokenization_mistral_common"] = ["MistralCommonTokenizer"]
+
 # Vision-specific objects
 try:
    if not is_vision_available():
--- a/src/transformers/audio_utils.py
+++ b/src/transformers/audio_utils.py
@ -1157,9 +1157,7 @@ def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int =
            frame = waveform[i : i + fft_window_size]
            frame_width = frame.shape[0]
            if frame_width < waveform.shape[0]:
-                frame = np.lib.pad(
-                    frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0
-                )
+                frame = np.pad(frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0)
        frames.append(frame)

    frames = np.stack(frames, 0)
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@ -185,17 +185,6 @@ class Cache:
                device = self.value_cache[layer_idx].device
                self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))

-    @property
-    def seen_tokens(self):
-        logger.warning_once(
-            "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
-            "model input instead."
-        )
-        if hasattr(self, "_seen_tokens"):
-            return self._seen_tokens
-        else:
-            return None
-
    def get_mask_sizes(self, cache_position: torch.Tensor, layer_idx: int) -> tuple[int, int]:
        """
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
@ -472,7 +461,6 @@ class DynamicCache(Cache):

    def __init__(self, _distributed_cache_data: Optional[Iterable] = None) -> None:
        super().__init__()
-        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
        self.key_cache: list[torch.Tensor] = []
        self.value_cache: list[torch.Tensor] = []

@ -535,10 +523,6 @@ class DynamicCache(Cache):
        Return:
            A tuple containing the updated key and value states.
        """
-        # Update the number of seen tokens
-        if layer_idx == 0:
-            self._seen_tokens += key_states.shape[-2]
-
        # Update the cache
        if key_states is not None:
            if len(self.key_cache) <= layer_idx:
@ -605,7 +589,6 @@ class DynamicCache(Cache):
        if self.get_seq_length() <= max_length:
            return

-        self._seen_tokens = max_length
        for idx in range(len(self.key_cache)):
            if self.key_cache[idx].numel():
                self.key_cache[idx] = self.key_cache[idx][..., :max_length, :]
@ -617,7 +600,6 @@ class DynamicCache(Cache):
        out = []
        for i in range(0, full_batch_size, split_size):
            current_split = DynamicCache()
-            current_split._seen_tokens = self._seen_tokens
            current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache]
            current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache]
            out.append(current_split)
@ -815,10 +797,6 @@ class OffloadedCache(DynamicCache):
        Return:
            A tuple containing the updated key and value states.
        """
-        # Update the number of seen tokens
-        if layer_idx == 0:
-            self._seen_tokens += key_states.shape[-2]
-
        # Update the cache
        if len(self.key_cache) < layer_idx:
            raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.")
@ -857,6 +835,9 @@ class QuantizedCache(DynamicCache):

    def __init__(self, cache_config: QuantizedCacheConfig) -> None:
        super().__init__()
+
+        # Used only for QuantCache where the seq-length can't be inferred easily from cache contents
+        self._seen_tokens = 0
        self._quantized_key_cache: list[torch.Tensor] = []
        self._quantized_value_cache: list[torch.Tensor] = []

@ -1098,6 +1079,10 @@ class StaticCache(Cache):
            Mapping between the layers and its device. This is required when you are manually initializing the cache
            and the model is split between different gpus. You can know which layers mapped to which device by
            checking the associated device_map: `model.hf_device_map`.
+        tp_size (`Optional[int]`, *optional*):
+            The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
+            if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
+            number of key/value heads will not be adjusted.


    Example:
@ -1130,6 +1115,7 @@ class StaticCache(Cache):
        device: Union[torch.device, str, None] = None,
        dtype: torch.dtype = torch.float32,
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
+        tp_size: Optional[int] = None,
    ) -> None:
        super().__init__()
        self.max_batch_size = max_batch_size
@ -1144,6 +1130,13 @@ class StaticCache(Cache):
            if getattr(config, "num_key_value_heads", None) is None
            else config.num_key_value_heads
        )
+        if tp_size is not None and tp_size > 1:
+            if self.num_key_value_heads % tp_size != 0:
+                raise ValueError(
+                    f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
+                )
+            # If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
+            self.num_key_value_heads //= tp_size

        self.key_cache: list[torch.Tensor] = []
        self.value_cache: list[torch.Tensor] = []
@ -1400,6 +1393,19 @@ class EncoderDecoderCache(Cache):
        for layer_idx in range(len(cross_attention_cache.key_cache)):
            self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0)

+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (
+                self.self_attention_cache.key_cache[layer_idx],
+                self.self_attention_cache.value_cache[layer_idx],
+                self.cross_attention_cache.key_cache[layer_idx],
+                self.cross_attention_cache.value_cache[layer_idx],
+            )
+
    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
@ -1573,6 +1579,10 @@ class HybridCache(Cache):
            Mapping between the layers and its device. This is required when you are manually initializing the cache
            and the model is split between different gpus. You can know which layers mapped to which device by
            checking the associated device_map: `model.hf_device_map`.
+        tp_size (`Optional[int]`, *optional*):
+            The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
+            if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
+            number of key/value heads will not be adjusted.

    Example:

@ -1604,6 +1614,7 @@ class HybridCache(Cache):
        device: Union[torch.device, str, None] = None,
        dtype: torch.dtype = torch.float32,
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
+        tp_size: Optional[int] = None,
    ) -> None:
        super().__init__()
        if not hasattr(config, "sliding_window") or config.sliding_window is None:
@ -1627,6 +1638,13 @@ class HybridCache(Cache):
            if getattr(config, "num_key_value_heads", None) is None
            else config.num_key_value_heads
        )
+        if tp_size is not None and tp_size > 1:
+            if self.num_key_value_heads % tp_size != 0:
+                raise ValueError(
+                    f"Number of key value heads {self.num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
+                )
+            # If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
+            self.num_key_value_heads //= tp_size

        # If the attribute does not exist in the config, fallback to a simple StaticCache
        if hasattr(config, "layer_types"):
@ -2197,6 +2215,10 @@ class OffloadedStaticCache(StaticCache):
            Mapping between the layers and its device. This is required when you are manually initializing the cache
            and the model is split between different gpus. You can know which layers mapped to which device by
            checking the associated device_map: `model.hf_device_map`.
+        tp_size (`Optional[int]`, *optional*):
+            The tensor parallel size of the model. This is used to adjust the number of key/value heads in the cache
+            if the model is using tensor parallelism. If not provided, it defaults to `None`, which means that the
+            number of key/value heads will not be adjusted.

    Example:

@ -2228,6 +2250,7 @@ class OffloadedStaticCache(StaticCache):
        dtype: Optional[torch.dtype] = None,
        offload_device: Union[str, torch.device] = torch.device("cpu"),
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
+        tp_size: Optional[int] = None,
    ) -> None:
        super(Cache, self).__init__()

@ -2251,6 +2274,13 @@ class OffloadedStaticCache(StaticCache):
            if getattr(config, "num_key_value_heads", None) is None
            else config.num_key_value_heads
        )
+        if tp_size is not None and tp_size > 1:
+            if num_key_value_heads % tp_size != 0:
+                raise ValueError(
+                    f"Number of key value heads {num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
+                )
+            # If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
+            num_key_value_heads //= tp_size

        cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim)

@ -2277,10 +2307,6 @@ class OffloadedStaticCache(StaticCache):
            self._device_key_cache.append(key_cache)
            self._device_value_cache.append(value_cache)

-        # For backwards compatibility.
-        # TODO(gante): Remove this.
-        self._seen_tokens = 0
-
        # Create new CUDA stream for parallel prefetching.
        self._prefetch_stream = torch.cuda.Stream() if self.device.type == "cuda" else None

@ -2314,10 +2340,6 @@ class OffloadedStaticCache(StaticCache):
        value_states = value_states.to(self.value_cache[layer_idx].dtype)

        if layer_idx == 0:
-            # Update seen tokens.
-            # TODO(gante): Remove this.
-            self._seen_tokens += key_states.shape[-2]
-
            # Always there.
            k_out = self.key_cache[0]
            v_out = self.value_cache[0]
@ -2371,10 +2393,14 @@ class OffloadedStaticCache(StaticCache):
        return k_out, v_out

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
-        """Returns the sequence length of the cached states that were seen by the model."""
-
-        # TODO(gante): Remove this.
-        return self._seen_tokens
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        is_empty_layer = (
+            len(self.key_cache) == 0  # no cache in any layer
+            or len(self.key_cache) <= layer_idx  # hasn't run a layer with cache after it
+            or not self.key_cache[layer_idx].numel()  # the layer has no cache
+        )
+        layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
+        return layer_seq_length

    def get_max_cache_shape(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states."""
@ -2384,22 +2410,12 @@ class OffloadedStaticCache(StaticCache):
    def reset(self) -> None:
        """Resets the cache values while preserving the objects."""

-        # For backwards compatibility.
-        # TODO(gante): Remove this.
-        self._seen_tokens = 0
-
        # Zero out cache.
        for layer_idx in range(len(self.key_cache)):
            # In-place ops prevent breaking the static address.
            self.key_cache[layer_idx].zero_()
            self.value_cache[layer_idx].zero_()

-    @property
-    def seen_tokens(self) -> int:
-        # For backwards compatibility.
-        # TODO(gante): Remove this.
-        return self._seen_tokens
-
    def _create_key_value_cache_tensors(
        self, shape: tuple[int, ...], device: torch.device
    ) -> tuple[torch.Tensor, torch.Tensor]:
--- a/src/transformers/commands/chat.py
+++ b/src/transformers/commands/chat.py
@ -14,6 +14,7 @@


 import asyncio
+import copy
 import json
 import os
 import platform
@ -333,6 +334,11 @@ class ChatCommand(BaseTransformersCLICommand):
                    )

                args.host, args.port = args.model_name_or_path_or_address.rsplit(":", 1)
+
+                if args.model_name_or_path is None:
+                    raise ValueError(
+                        "When connecting to a server, please specify a model name with the --model_name_or_path flag."
+                    )
            else:
                self.spawn_backend = True
                args.model_name_or_path = args.model_name_or_path_or_address
@ -446,11 +452,13 @@ class ChatCommand(BaseTransformersCLICommand):
            )
        return processed_generate_flags

-    def get_generation_parameterization(self, args: ChatArguments) -> tuple[GenerationConfig, dict]:
+    def get_generation_parameterization(
+        self, args: ChatArguments, model_generation_config: GenerationConfig
+    ) -> tuple[GenerationConfig, dict]:
        """
        Returns a GenerationConfig object holding the generation parameters for the CLI command.
        """
-        # No generation config arg provided -> use base generation config, apply CLI defaults
+        # No generation config arg provided -> use model's default generation config, then apply CLI defaults
        if args.generation_config is not None:
            if ".json" in args.generation_config:  # is a local file
                dirname = os.path.dirname(args.generation_config)
@ -462,7 +470,8 @@ class ChatCommand(BaseTransformersCLICommand):
            # !!!!!!!!!
            # This is a chat session, so we have a few non-standard defaults
            # !!!!!!!!!
-            generation_config = GenerationConfig(do_sample=True, max_new_tokens=256)
+            generation_config = copy.deepcopy(model_generation_config)
+            generation_config.update({"do_sample": True, "max_new_tokens": 256})

        # Finally: parse and apply `generate_flags`
        parsed_generate_flags = self.parse_generate_flags(args.generate_flags)
@ -670,7 +679,8 @@ class ChatCommand(BaseTransformersCLICommand):
        else:
            user = args.user

-        generation_config, model_kwargs = self.get_generation_parameterization(args)
+        model_generation_config = GenerationConfig.from_pretrained(args.model_name_or_path)
+        generation_config, model_kwargs = self.get_generation_parameterization(args, model_generation_config)

        interface = RichInterface(model_name=args.model_name_or_path, user_name=user)
        interface.clear()
@ -710,7 +720,7 @@ class ChatCommand(BaseTransformersCLICommand):
                    stream=True,
                    extra_body={
                        "request_id": request_id,
-                        "generation_config": {**generation_config.to_dict()},
+                        "generation_config": generation_config.to_json_string(),
                        "model": model,
                    },
                )
--- a/src/transformers/commands/serving.py
+++ b/src/transformers/commands/serving.py
@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import copy
 import functools
 import json
 import re
@ -20,18 +21,11 @@ from dataclasses import dataclass, field
 from threading import Thread
 from typing import Any, Optional

-from huggingface_hub import (
-    ChatCompletionStreamOutputChoice,
-    ChatCompletionStreamOutputDelta,
-    ChatCompletionStreamOutputDeltaToolCall,
-    ChatCompletionStreamOutputFunction,
-    ModelInfo,
-    model_info,
-)
+from huggingface_hub import ModelInfo, model_info

 from transformers.utils.import_utils import is_fastapi_available, is_pydantic_available, is_uvicorn_available

-from .. import PreTrainedTokenizerFast, TextIteratorStreamer
+from .. import LogitsProcessorList, PreTrainedTokenizerFast, TextIteratorStreamer
 from ..generation.continuous_batching import ContinuousBatchingManager, RequestStatus
 from ..utils import is_torch_available, logging
 from . import BaseTransformersCLICommand
@ -52,6 +46,7 @@ if is_torch_available():
 if is_pydantic_available() and is_fastapi_available() and is_uvicorn_available():
    import uvicorn
    from fastapi import FastAPI
+    from fastapi.middleware.cors import CORSMiddleware
    from fastapi.responses import JSONResponse, StreamingResponse
    from pydantic import BaseModel

@ -87,6 +82,9 @@ if is_pydantic_available() and is_fastapi_available() and is_uvicorn_available()
        # tool_prompt: Optional[str] = None
        # top_logprobs: Optional[int] = None

+        # transformers-specific request fields
+        generation_config: Optional[str] = None
+

 logger = logging.get_logger(__name__)

@ -111,37 +109,48 @@ def serve_command_factory(args: Namespace):
    return ServeCommand(args)


-def create_generation_config_from_req(req: "ChatCompletionInput") -> "GenerationConfig":
+def create_generation_config_from_req(
+    req: "ChatCompletionInput", model_generation_config: "GenerationConfig", **kwargs
+) -> "GenerationConfig":
    """
-    Creates a generation config from the parameters of the request. Note that we can pass a `GenerationConfig`
-    (serialized into a `dict`) in `extra_body`, for full `generate` parameterization.
+    Creates a generation config from the parameters of the request. If a generation config is passed in the request,
+    it will be used as a baseline for parameterization. Otherwise, we will use the model's default generation config.
+    Other parameters in the request will be applied on top of the baseline.

    Args:
-        req (`ChatCompletionInput`): The request which may optionally contain generation parameters.
+        req (`ChatCompletionInput`):
+            The request which may optionally contain generation parameters.
+        model_generation_config (`GenerationConfig`):
+            The model's default generation config.

    Returns:
        The prepared `GenerationConfig` object.
    """
-    if req.extra_body is not None and "generation_config" in req.extra_body:
-        for key in req.extra_body["generation_config"].keys():
-            if key in ChatCompletionInput.base_field_names.keys():
-                return {"error": "Duplicated key in the root request and in the passed generation config."}
-
-    if req.extra_body is not None and "generation_config" in req.extra_body:
-        generation_config = GenerationConfig(**(req.extra_body["generation_config"]))
+    # If there is a generation config in the request, it is a json string serialization from a `GenerationConfig`
+    # object. For simplicity, flags set here take precedence over all other flags.
+    if req.generation_config is not None:
+        generation_config = GenerationConfig(**json.loads(req.generation_config))
    else:
-        generation_config = GenerationConfig()
+        generation_config = copy.deepcopy(model_generation_config)
+
+    non_standard_kwargs = generation_config.update(**kwargs)
+    # Set extra kwargs that are not in the `GenerationConfig` class (e.g. continuous batching flags)
+    for k, v in non_standard_kwargs.items():
+        if v is not None:
+            setattr(generation_config, k, v)

    if req.frequency_penalty is not None:
-        generation_config.repetition_penalty = req.frequency_penalty
+        generation_config.repetition_penalty = float(req.frequency_penalty)
    if req.logit_bias is not None:
        generation_config.sequence_bias = req.logit_bias
    if req.stop is not None:
        generation_config.stop_strings = req.stop
    if req.temperature is not None:
-        generation_config.temperature = req.temperature
+        generation_config.temperature = float(req.temperature)
+        if float(req.temperature) == 0.0:
+            generation_config.do_sample = False
    if req.top_p is not None:
-        generation_config.top_p = req.top_p
+        generation_config.top_p = float(req.top_p)
    if req.seed is not None:
        torch.manual_seed(req.seed)

@ -202,17 +211,28 @@ class ServeArguments:
    use_bnb_nested_quant: bool = field(default=False, metadata={"help": "Whether to use nested quantization."})

    # Serving settings
-    host: str = field(default="localhost", metadata={"help": "Interface the server will listen to.."})
+    host: str = field(default="localhost", metadata={"help": "Interface the server will listen to."})
    port: int = field(default=8000, metadata={"help": "Port the server will listen to."})

    # Other settings
    log_level: str = field(
        default="info", metadata={"help": "Logging level as a string. Example: 'info' or 'warning'."}
    )
+    enable_cors: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "Whether to enable CORS. Some apps that make requests from external domains (e.g. Cursor) require "
+                "CORS to be enabled."
+            ),
+        },
+    )


 class ServeCommand(BaseTransformersCLICommand):
    loaded_model: Optional[str] = None
+    running_continuous_batching_manager: Optional[ContinuousBatchingManager] = None
+
    model: PreTrainedModel
    tokenizer: PreTrainedTokenizerFast

@ -236,6 +256,7 @@ class ServeCommand(BaseTransformersCLICommand):

        self.args = args
        self.use_continuous_batching = self.args.attn_implementation == "sdpa_paged"
+        self.enable_cors = self.args.enable_cors

        # State: preserves information about the last call and last KV cache, to determine whether we can reuse the KV
        # cache and avoid re-running prefil
@ -250,36 +271,66 @@ class ServeCommand(BaseTransformersCLICommand):

    def build_chunk(
        self,
-        content: str,
        request_id: str,
+        content: Optional[str] = None,
        role: Optional[str] = None,
        finish_reason: Optional[str] = None,
-        tool_calls: Optional[list[ChatCompletionStreamOutputDeltaToolCall]] = None,
+        tool_calls: Optional[list[dict]] = None,
    ) -> str:
+        """
+        Builds a chunk of a streaming response.
+
+        IMPORTANT: The built chunk won't contain empty fields (fields with `None`). Some downstream apps, like Cursor,
+        assume that when the field exists, it has data.
+
+        Args:
+            request_id (`str`):
+                The request ID.
+            content (`str`, *optional*):
+                Content of the response from the model.
+            role (`str`, *optional*):
+                The role of the next content, until a new role is defined.
+            finish_reason (`str`, *optional*):
+                The reason the generation by the model has finished.
+            tool_calls (`list[dict]`, *optional*):
+                Data about the tool calls, when they are triggered.
+
+        Returns:
+            `str`: The built chunk, a string containing a JSON string with the payload.
+        """
        payload = {
            "object": "chat.completion.chunk",
            "id": request_id,
            "created": int(time.time()),
            "model": self.loaded_model,
+            "choices": [{"delta": {}, "index": 0}],
            "system_fingerprint": "",
-            "choices": [
-                ChatCompletionStreamOutputChoice(
-                    delta=ChatCompletionStreamOutputDelta(
-                        role=role,
-                        content=content,
-                        tool_calls=tool_calls,
-                    ),
-                    index=0,
-                    logprobs=None,
-                    finish_reason=finish_reason,
-                ),
-            ],
        }
+        if content is not None:
+            payload["choices"][0]["delta"]["content"] = content
+        if role is not None:
+            payload["choices"][0]["delta"]["role"] = role
+        if tool_calls is not None:
+            payload["choices"][0]["delta"]["tool_calls"] = tool_calls
+        if finish_reason is not None:
+            payload["choices"][0]["finish_reason"] = finish_reason
+
        return f"data: {json.dumps(payload)}\n\n"

    def run(self):
        app = FastAPI()

+        # Some apps that make requests from external domains (e.g. Cursor) require CORS to be enabled. However, for
+        # security purposes, it's disabled by default
+        if self.enable_cors:
+            app.add_middleware(
+                CORSMiddleware,
+                allow_origins=["*"],
+                allow_credentials=True,
+                allow_methods=["*"],
+                allow_headers=["*"],
+            )
+
        if self.use_continuous_batching:
            self.continuous_batching(app)
        else:
@ -315,7 +366,7 @@ class ServeCommand(BaseTransformersCLICommand):
                        {
                            "id": model.id,
                            "object": "model",
-                            "crated": model.created_at.timestamp(),
+                            "created": model.created_at.timestamp(),
                            "owned_by": model.author,
                        }
                        for model in get_text_gen_models()
@ -326,7 +377,18 @@ class ServeCommand(BaseTransformersCLICommand):
        uvicorn.run(app, host=self.args.host, port=self.args.port, log_level=self.args.log_level)

    def continuous_batching(self, app):
-        generation_config = GenerationConfig(
+        @app.post("/v1/chat/completions")
+        def _serve(req: "ChatCompletionInput"):
+            if not req.stream:
+                return {"error": "Only streaming mode is supported."}
+
+            update_model = self.canonicalized_model_name(req.model) != self.loaded_model
+            if update_model:
+                self.model, self.tokenizer = self.load_model_and_tokenizer(req.model, self.args)
+
+            generation_config = create_generation_config_from_req(
+                req,
+                model_generation_config=self.model.generation_config,
                eos_token_id=self.tokenizer.eos_token_id,
                pad_token_id=self.tokenizer.pad_token_id,
                use_cache=False,
@ -337,35 +399,37 @@ class ServeCommand(BaseTransformersCLICommand):
                scheduler="fifo",
            )

-        manager: ContinuousBatchingManager = self.model.init_continuous_batching(
+            if self.running_continuous_batching_manager is None or update_model:
+                self.running_continuous_batching_manager = self.model.init_continuous_batching(
                    generation_config=generation_config, streaming=True
                )
-        manager.start()

-        @app.post("/v1/chat/completions")
-        def _serve(req: "ChatCompletionInput"):
-            if not req.stream:
-                return {"error": "Only streaming mode is supported."}
+                # TODO (Joao, Lysandre): the logits processors should be fixed in continuous batching
+                # and correctly applied in non-cb
+                self.running_continuous_batching_manager.logit_processor = LogitsProcessorList()
+                self.running_continuous_batching_manager.start()

-            update_model = req.model != self.loaded_model
-
-            if update_model:
-                self.model, self.tokenizer = self.load_model_and_tokenizer(req.model, self.args)
-
-            chat = req.messages
-            inputs = self.tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(
-                self.model.device
-            )
-
-            generation_config = create_generation_config_from_req(req)
+            # TODO (Joao, Lysandre): this should also work with tool support
+            inputs = self.tokenizer.apply_chat_template(
+                req.messages, return_tensors="pt", add_generation_prompt=True
+            ).to(self.model.device)

            def stream_response(_inputs):
                try:
-                    max_new_tokens = req.max_tokens or generation_config.max_new_tokens or 256
-                    request_id = manager.add_request(_inputs, request_id=req.request_id, max_new_tokens=max_new_tokens)
+                    max_new_tokens = req.max_tokens or generation_config.max_new_tokens or 1024
+                    request_id = self.running_continuous_batching_manager.add_request(
+                        _inputs, request_id=req.request_id, max_new_tokens=max_new_tokens
+                    )
                    queue_is_flushed = False

-                    for result in manager:
+                    # Emit the assistant role to start the stream. Other chunks won't have a role, as it is implicit
+                    # they come from the assistant.
+                    yield self.build_chunk(request_id, role="assistant")
+
+                    for result in self.running_continuous_batching_manager:
+                        if result.request_id != request_id:
+                            continue
+
                        if req.request_id is not None and not queue_is_flushed:
                            if result.status == RequestStatus.FINISHED:
                                continue
@ -373,12 +437,12 @@ class ServeCommand(BaseTransformersCLICommand):
                                queue_is_flushed = True

                        finish_reason = "stop" if result.status == RequestStatus.FINISHED else None
-                        yield self.build_chunk(result.next_token, request_id=request_id, finish_reason=finish_reason)
-
                        if result.status == RequestStatus.FINISHED:
+                            yield self.build_chunk(request_id, finish_reason=finish_reason)
                            break
+                        else:
+                            yield self.build_chunk(request_id=request_id, content=result.next_token)

-                    yield "data: [DONE]\n\n"
                except Exception as e:
                    logger.error(str(e))
                    yield f'data: {{"error": "{str(e)}"}}'
@ -401,8 +465,8 @@ class ServeCommand(BaseTransformersCLICommand):
        # No cached messages: this is a new request
        if self.last_messages is None:
            req_continues_last_messages = False
-        # The new request has fewer rounds of conversation: this is a new request
-        elif len(self.last_messages) > len(req.messages):
+        # The new request has no new rounds of conversation: this is a new request
+        elif len(self.last_messages) >= len(req.messages):
            req_continues_last_messages = False
        # Otherwise, check that the last messages are a subset of the new request
        else:
@ -417,7 +481,8 @@ class ServeCommand(BaseTransformersCLICommand):
    def generate(self, app):
        @app.post("/v1/chat/completions")
        def _serve(req: "ChatCompletionInput"):
-            update_model = req.model != self.loaded_model
+            logger.debug(f"Received request: {req}")
+            update_model = self.canonicalized_model_name(req.model) != self.loaded_model

            if update_model:
                self.model, self.tokenizer = self.load_model_and_tokenizer(req.model, self.args)
@ -453,8 +518,11 @@ class ServeCommand(BaseTransformersCLICommand):

            generation_streamer = TextIteratorStreamer(self.tokenizer, skip_special_tokens=True, skip_prompt=True)

-            generation_config = create_generation_config_from_req(req)
-            max_new_tokens = req.max_tokens or generation_config.max_new_tokens or 256
+            generation_config = create_generation_config_from_req(
+                req,
+                model_generation_config=self.model.generation_config,
+            )
+            max_new_tokens = req.max_tokens or generation_config.max_new_tokens or 1024
            generation_config.max_new_tokens = max_new_tokens

            last_kv_cache = None
@ -482,7 +550,14 @@ class ServeCommand(BaseTransformersCLICommand):
                    thread.start()
                    tool_state = ToolState()

+                    # Emit the assistant role to start the stream. Other chunks won't have a role, as it is implicit
+                    # they come from the assistant.
+                    logger.debug("Starting model output")
+                    yield self.build_chunk(_request_id, role="assistant")
+
                    for result in streamer:
+                        logger.debug(f"Model output: {result}")
+
                        # ====== TOOL CALL LOGIC ======
                        if tool_model_family is not None:
                            # Start of a tool call: reset state variables, set `inside_tool_call`
@ -493,7 +568,7 @@ class ServeCommand(BaseTransformersCLICommand):
                            # End of tool call: reset `inside_tool_call`, emit a `finish_reason`
                            if result.strip() == _TOOL_CALL_TOKENS[tool_model_family]["end"]:
                                tool_state.reset()
-                                yield self.build_chunk("", _request_id, role=None, finish_reason="tool_calls")
+                                yield self.build_chunk(_request_id, finish_reason="tool_calls")
                                continue

                            # Inside a tool call
@ -509,15 +584,12 @@ class ServeCommand(BaseTransformersCLICommand):
                                    else:
                                        tool_name = tool_name.group(1)
                                    tool_state.has_tool_name_defined = True
-                                    tool = ChatCompletionStreamOutputDeltaToolCall(
-                                        function=ChatCompletionStreamOutputFunction(
-                                            name=tool_name,
-                                            arguments=None,
-                                        ),
-                                        index=0,
-                                        type="function",
-                                        id=_request_id + "_tool_call",  # Only the first tool call delta has an id
-                                    )
+                                    tool = {
+                                        "function": {"name": tool_name},
+                                        "index": 0,
+                                        "type": "function",
+                                        "id": _request_id + "_tool_call",  # Only the first tool call delta has an id
+                                    }

                                # Second step: extract tool arguments. The tool arguments can be seen as a json string
                                # within the tool json string. We emit a delta for the arguments.
@ -537,22 +609,20 @@ class ServeCommand(BaseTransformersCLICommand):
                                    if tool_state.arg_nesting_level < 0:
                                        result = "".join(result.split("}")[:-2]) + "}"  # e.g. "4}}\n" -> "4}"

-                                    tool = ChatCompletionStreamOutputDeltaToolCall(
-                                        function=ChatCompletionStreamOutputFunction(
-                                            arguments=result,
-                                        ),
-                                        index=0,
-                                        type="function",
-                                        id=None,
-                                    )
+                                    tool = {
+                                        "function": {"arguments": result},
+                                        "index": 0,
+                                        "type": "function",
+                                    }

-                                yield self.build_chunk(None, _request_id, role=None, tool_calls=[tool])
+                                yield self.build_chunk(_request_id, tool_calls=[tool])
                                continue
                        # ====== END OF TOOL CALL LOGIC ======

-                        # All non-tool related tokens are emitted as assistant messages
-                        yield self.build_chunk(result, _request_id, role="assistant")
-                    yield self.build_chunk(None, _request_id, role=None, finish_reason="stop")
+                        # All non-tool related tokens are emitted as assistant messages. Empty text is skipped.
+                        if result != "":
+                            yield self.build_chunk(_request_id, content=result)
+                    yield self.build_chunk(_request_id, finish_reason="stop")

                    thread.join()
                except Exception as e:
@ -585,6 +655,11 @@ class ServeCommand(BaseTransformersCLICommand):

        return quantization_config

+    def canonicalized_model_name(self, model_id: str) -> str:
+        if "@" in model_id:
+            return model_id
+        return f"{model_id}@main"
+
    def load_model_and_tokenizer(
        self, model_id_and_revision: str, args: ServeArguments
    ) -> tuple[PreTrainedModel, PreTrainedTokenizerFast]:
@ -615,15 +690,15 @@ class ServeCommand(BaseTransformersCLICommand):

        model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

-        if model.generation_config.max_new_tokens is not None and model.generation_config.max_new_tokens < 256:
-            model.generation_config.max_new_tokens = 256
+        if model.generation_config.max_new_tokens is not None and model.generation_config.max_new_tokens < 1024:
+            model.generation_config.max_new_tokens = 1024

        if getattr(model, "hf_device_map", None) is None:
            model = model.to(args.device)

-        self.loaded_model = model_id_and_revision
+        self.loaded_model = f"{model_id}@{revision}"

-        print("Loaded model", model_id_and_revision)
+        logger.warning(f"Loaded model {self.loaded_model}")
        return model, tokenizer


--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@ -18,7 +18,7 @@ import copy
 import json
 import os
 import warnings
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union

 from packaging import version

@ -39,9 +39,17 @@ from .utils import (
 from .utils.generic import is_timm_config_dict


+if TYPE_CHECKING:
+    import torch
+
+
 logger = logging.get_logger(__name__)


+# type hinting: specifying the type of config class that inherits from PretrainedConfig
+SpecificPretrainedConfigType = TypeVar("SpecificPretrainedConfigType", bound="PretrainedConfig")
+
+
 class PretrainedConfig(PushToHubMixin):
    # no-format
    r"""
@ -104,8 +112,9 @@ class PretrainedConfig(PushToHubMixin):
        is_encoder_decoder (`bool`, *optional*, defaults to `False`):
            Whether the model is used as an encoder/decoder or not.
        is_decoder (`bool`, *optional*, defaults to `False`):
-            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on decoder-only or encoder-only architectures.
-        cross_attention_hidden_size** (`bool`, *optional*):
+            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
+            decoder-only or encoder-only architectures.
+        cross_attention_hidden_size (`bool`, *optional*):
            The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
            setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
        add_cross_attention (`bool`, *optional*, defaults to `False`):
@ -135,7 +144,8 @@ class PretrainedConfig(PushToHubMixin):
            or PyTorch) checkpoint.
        id2label (`dict[int, str]`, *optional*):
            A map from index (for instance prediction index, or target index) to label.
-        label2id (`dict[str, int]`, *optional*): A map from label to index for the model.
+        label2id (`dict[str, int]`, *optional*):
+            A map from label to index for the model.
        num_labels (`int`, *optional*):
            Number of labels to use in the last layer added to the model, typically for a classification task.
        task_specific_params (`dict[str, Any]`, *optional*):
@ -151,12 +161,16 @@ class PretrainedConfig(PushToHubMixin):
            model by default).
        prefix (`str`, *optional*):
            A specific prompt that should be added at the beginning of each text before calling the model.
-        bos_token_id (`int`, *optional*): The id of the _beginning-of-stream_ token.
-        pad_token_id (`int`, *optional*): The id of the _padding_ token.
-        eos_token_id (`int`, *optional*): The id of the _end-of-stream_ token.
+        bos_token_id (`int`, *optional*):
+            The id of the _beginning-of-stream_ token.
+        pad_token_id (`int`, *optional*):
+            The id of the _padding_ token.
+        eos_token_id (`int`, *optional*):
+            The id of the _end-of-stream_ token.
        decoder_start_token_id (`int`, *optional*):
            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
-        sep_token_id (`int`, *optional*): The id of the _separation_ token.
+        sep_token_id (`int`, *optional*):
+            The id of the _separation_ token.

        > PyTorch specific parameters

@ -175,23 +189,11 @@ class PretrainedConfig(PushToHubMixin):

            This attribute is currently not being used during model loading time, but this may change in the future
            versions. But we can already start preparing for the future by saving the dtype with save_pretrained.
-
-        > TensorFlow specific parameters
-
-        use_bfloat16 (`bool`, *optional*, defaults to `False`):
-            Whether or not the model should use BFloat16 scalars (only used by some TensorFlow models).
-        tf_legacy_loss (`bool`, *optional*, defaults to `False`):
-            Whether the model should use legacy TensorFlow losses. Legacy losses have variable output shapes and may
-            not be XLA-compatible. This option is here for backward compatibility and will be removed in Transformers
-            v5.
-        loss_type (`str`, *optional*):
-            The type of loss that the model should use. It should be in `LOSS_MAPPING`'s keys, otherwise the loss will
-            be automatically inferred from the model architecture.
    """

    model_type: str = ""
    base_config_key: str = ""
-    sub_configs: dict[str, "PretrainedConfig"] = {}
+    sub_configs: dict[str, type["PretrainedConfig"]] = {}
    has_no_defaults_at_init: bool = False
    attribute_map: dict[str, str] = {}
    base_model_tp_plan: Optional[dict[str, Any]] = None
@ -208,93 +210,117 @@ class PretrainedConfig(PushToHubMixin):
            key = super().__getattribute__("attribute_map")[key]
        return super().__getattribute__(key)

-    def __init__(self, **kwargs):
-        # Attributes with defaults
-        self.return_dict = kwargs.pop("return_dict", True)
-        self.output_hidden_states = kwargs.pop("output_hidden_states", False)
-        self._output_attentions = kwargs.pop("output_attentions", False)
-        self.torchscript = kwargs.pop("torchscript", False)  # Only used by PyTorch models
-        self.torch_dtype = kwargs.pop("torch_dtype", None)  # Only used by PyTorch models
-        self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
-        self.tf_legacy_loss = kwargs.pop("tf_legacy_loss", False)  # Only used by TensorFlow models
-        self.pruned_heads = kwargs.pop("pruned_heads", {})
-        self.tie_word_embeddings = kwargs.pop(
-            "tie_word_embeddings", True
-        )  # Whether input and output word embeddings should be tied for all MLM, LM and Seq2Seq models.
-        self.chunk_size_feed_forward = kwargs.pop("chunk_size_feed_forward", 0)
+    def __init__(
+        self,
+        *,
+        # All models common arguments
+        output_hidden_states: bool = False,
+        output_attentions: bool = False,
+        return_dict: bool = True,
+        torchscript: bool = False,
+        torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+        # Common arguments
+        pruned_heads: Optional[dict[int, list[int]]] = None,
+        tie_word_embeddings: bool = True,
+        chunk_size_feed_forward: int = 0,
+        is_encoder_decoder: bool = False,
+        is_decoder: bool = False,
+        cross_attention_hidden_size: Optional[int] = None,
+        add_cross_attention: bool = False,
+        tie_encoder_decoder: bool = False,
+        # Fine-tuning task arguments
+        architectures: Optional[list[str]] = None,
+        finetuning_task: Optional[str] = None,
+        id2label: Optional[dict[int, str]] = None,
+        label2id: Optional[dict[str, int]] = None,
+        num_labels: Optional[int] = None,
+        task_specific_params: Optional[dict[str, Any]] = None,
+        problem_type: Optional[str] = None,
+        # Tokenizer kwargs
+        tokenizer_class: Optional[str] = None,
+        prefix: Optional[str] = None,
+        bos_token_id: Optional[int] = None,
+        pad_token_id: Optional[int] = None,
+        eos_token_id: Optional[int] = None,
+        sep_token_id: Optional[int] = None,
+        decoder_start_token_id: Optional[int] = None,
+        **kwargs,
+    ):
+        # Validation for some arguments
+        if label2id is not None and not isinstance(label2id, dict):
+            raise ValueError("Argument label2id should be a dictionary.")
+        if id2label is not None and not isinstance(id2label, dict):
+            raise ValueError("Argument id2label should be a dictionary.")
+        if num_labels is not None and id2label is not None and len(id2label) != num_labels:
+            logger.warning(
+                f"You passed `num_labels={num_labels}` which is incompatible to "
+                f"the `id2label` map of length `{len(id2label)}`."
+            )
+        if problem_type is not None and problem_type not in (
+            "regression",
+            "single_label_classification",
+            "multi_label_classification",
+        ):
+            raise ValueError(
+                f"The config parameter `problem_type` was not understood: received {problem_type} "
+                "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid."
+            )
+        if torch_dtype is not None and isinstance(torch_dtype, str) and is_torch_available():
+            # we will start using self.torch_dtype in v5, but to be consistent with
+            # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object
+            import torch

-        # Is decoder is used in encoder-decoder models to differentiate encoder from decoder
-        self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
-        self.is_decoder = kwargs.pop("is_decoder", False)
-        self.cross_attention_hidden_size = kwargs.pop("cross_attention_hidden_size", None)
-        self.add_cross_attention = kwargs.pop("add_cross_attention", False)
-        self.tie_encoder_decoder = kwargs.pop("tie_encoder_decoder", False)
+            torch_dtype = getattr(torch, torch_dtype)
+
+        # Attributes common for all models
+        self.return_dict = return_dict
+        self.output_hidden_states = output_hidden_states
+        self.torchscript = torchscript
+        self.torch_dtype = torch_dtype
+        self._output_attentions = output_attentions  # has public property
+
+        # Less common kwargs, only used by some models
+        self.pruned_heads = pruned_heads if pruned_heads is not None else {}
+        self.tie_word_embeddings = tie_word_embeddings
+        self.chunk_size_feed_forward = chunk_size_feed_forward
+
+        # Encoder-decoder models attributes
+        self.is_encoder_decoder = is_encoder_decoder
+        self.is_decoder = is_decoder  # used in encoder-decoder models to differentiate encoder from decoder
+        self.cross_attention_hidden_size = cross_attention_hidden_size
+        self.add_cross_attention = add_cross_attention
+        self.tie_encoder_decoder = tie_encoder_decoder
+
+        # Fine-tuning task attributes
+        self.architectures = architectures
+        self.finetuning_task = finetuning_task
+        self.id2label = id2label
+        self.label2id = label2id
+        self.task_specific_params = task_specific_params
+        self.problem_type = problem_type
+
+        if self.id2label is None:
+            self._create_id_label_maps(num_labels if num_labels is not None else 2)
+        else:
+            # Keys are always strings in JSON so convert ids to int here.
+            self.id2label = {int(key): value for key, value in self.id2label.items()}
+
+        # Tokenizer attributes
+        self.tokenizer_class = tokenizer_class
+        self.prefix = prefix
+        self.bos_token_id = bos_token_id
+        self.pad_token_id = pad_token_id
+        self.eos_token_id = eos_token_id
+        self.sep_token_id = sep_token_id
+        self.decoder_start_token_id = decoder_start_token_id

        # Retrocompatibility: Parameters for sequence generation. While we will keep the ability to load these
        # parameters, saving them will be deprecated. In a distant future, we won't need to load them.
        for parameter_name, default_value in self._get_global_generation_defaults().items():
            setattr(self, parameter_name, kwargs.pop(parameter_name, default_value))

-        # Fine-tuning task arguments
-        self.architectures = kwargs.pop("architectures", None)
-        self.finetuning_task = kwargs.pop("finetuning_task", None)
-        self.id2label = kwargs.pop("id2label", None)
-        self.label2id = kwargs.pop("label2id", None)
-        if self.label2id is not None and not isinstance(self.label2id, dict):
-            raise ValueError("Argument label2id should be a dictionary.")
-        if self.id2label is not None:
-            if not isinstance(self.id2label, dict):
-                raise ValueError("Argument id2label should be a dictionary.")
-            num_labels = kwargs.pop("num_labels", None)
-            if num_labels is not None and len(self.id2label) != num_labels:
-                logger.warning(
-                    f"You passed along `num_labels={num_labels}` with an incompatible id to label map: "
-                    f"{self.id2label}. The number of labels will be overwritten to {self.num_labels}."
-                )
-            self.id2label = {int(key): value for key, value in self.id2label.items()}
-            # Keys are always strings in JSON so convert ids to int here.
-        else:
-            self.num_labels = kwargs.pop("num_labels", 2)
-
-        if self.torch_dtype is not None and isinstance(self.torch_dtype, str):
-            # we will start using self.torch_dtype in v5, but to be consistent with
-            # from_pretrained's torch_dtype arg convert it to an actual torch.dtype object
-            if is_torch_available():
-                import torch
-
-                self.torch_dtype = getattr(torch, self.torch_dtype)
-
-        # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
-        self.tokenizer_class = kwargs.pop("tokenizer_class", None)
-        self.prefix = kwargs.pop("prefix", None)
-        self.bos_token_id = kwargs.pop("bos_token_id", None)
-        self.pad_token_id = kwargs.pop("pad_token_id", None)
-        self.eos_token_id = kwargs.pop("eos_token_id", None)
-        self.sep_token_id = kwargs.pop("sep_token_id", None)
-        self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
-
-        # task specific arguments
-        self.task_specific_params = kwargs.pop("task_specific_params", None)
-
-        # regression / multi-label classification
-        self.problem_type = kwargs.pop("problem_type", None)
-        allowed_problem_types = ("regression", "single_label_classification", "multi_label_classification")
-        if self.problem_type is not None and self.problem_type not in allowed_problem_types:
-            raise ValueError(
-                f"The config parameter `problem_type` was not understood: received {self.problem_type} "
-                "but only 'regression', 'single_label_classification' and 'multi_label_classification' are valid."
-            )
-
-        # TPU arguments
-        if kwargs.pop("xla_device", None) is not None:
-            logger.warning(
-                "The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can "
-                "safely remove it from your `config.json` file."
-            )
-
        # Name or path to the pretrained checkpoint
        self._name_or_path = str(kwargs.pop("name_or_path", ""))
-        # Config hash
        self._commit_hash = kwargs.pop("_commit_hash", None)

        # Attention implementation to use, if relevant.
@ -320,8 +346,16 @@ class PretrainedConfig(PushToHubMixin):
                logger.error(f"Can't set {key} with value {value} for {self}")
                raise err

+        # TODO: remove later, deprecated arguments for TF models
+        self.tf_legacy_loss = kwargs.pop("tf_legacy_loss", False)
+        self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
+
+    def _create_id_label_maps(self, num_labels: int):
+        self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
+        self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
+
    @property
-    def name_or_path(self) -> str:
+    def name_or_path(self) -> Optional[str]:
        return getattr(self, "_name_or_path", None)

    @name_or_path.setter
@ -361,9 +395,10 @@ class PretrainedConfig(PushToHubMixin):

    @num_labels.setter
    def num_labels(self, num_labels: int):
-        if not hasattr(self, "id2label") or self.id2label is None or len(self.id2label) != num_labels:
-            self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
-            self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
+        # we do not store `num_labels` attribute in config, but instead
+        # compute it based on the length of the `id2label` map
+        if self.id2label is None or self.num_labels != num_labels:
+            self._create_id_label_maps(num_labels)

    @property
    def _attn_implementation(self):
@ -474,7 +509,7 @@ class PretrainedConfig(PushToHubMixin):

    @classmethod
    def from_pretrained(
-        cls,
+        cls: type[SpecificPretrainedConfigType],
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
@ -482,7 +517,7 @@ class PretrainedConfig(PushToHubMixin):
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
-    ) -> "PretrainedConfig":
+    ) -> SpecificPretrainedConfigType:
        r"""
        Instantiate a [`PretrainedConfig`] (or a derived class) from a pretrained model configuration.

@ -717,7 +752,9 @@ class PretrainedConfig(PushToHubMixin):
        return config_dict, kwargs

    @classmethod
-    def from_dict(cls, config_dict: dict[str, Any], **kwargs) -> "PretrainedConfig":
+    def from_dict(
+        cls: type[SpecificPretrainedConfigType], config_dict: dict[str, Any], **kwargs
+    ) -> SpecificPretrainedConfigType:
        """
        Instantiates a [`PretrainedConfig`] from a Python dictionary of parameters.

@ -778,7 +815,9 @@ class PretrainedConfig(PushToHubMixin):
            return config

    @classmethod
-    def from_json_file(cls, json_file: Union[str, os.PathLike]) -> "PretrainedConfig":
+    def from_json_file(
+        cls: type[SpecificPretrainedConfigType], json_file: Union[str, os.PathLike]
+    ) -> SpecificPretrainedConfigType:
        """
        Instantiates a [`PretrainedConfig`] from the path to a JSON file of parameters.

--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -106,4 +106,5 @@ deps = {
    "opentelemetry-api": "opentelemetry-api",
    "opentelemetry-exporter-otlp": "opentelemetry-exporter-otlp",
    "opentelemetry-sdk": "opentelemetry-sdk",
+    "mistral-common[opencv]": "mistral-common[opencv]>=1.6.3",
 }
--- a/src/transformers/feature_extraction_utils.py
+++ b/src/transformers/feature_extraction_utils.py
@ -20,7 +20,7 @@ import json
 import os
 import warnings
 from collections import UserDict
-from typing import TYPE_CHECKING, Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, TypeVar, Union

 import numpy as np

@ -55,6 +55,9 @@ logger = logging.get_logger(__name__)

 PreTrainedFeatureExtractor = Union["SequenceFeatureExtractor"]  # noqa: F821

+# type hinting: specifying the type of feature extractor class that inherits from FeatureExtractionMixin
+SpecificFeatureExtractorType = TypeVar("SpecificFeatureExtractorType", bound="FeatureExtractionMixin")
+

 class BatchFeature(UserDict):
    r"""
@ -270,7 +273,7 @@ class FeatureExtractionMixin(PushToHubMixin):

    @classmethod
    def from_pretrained(
-        cls,
+        cls: type[SpecificFeatureExtractorType],
        pretrained_model_name_or_path: Union[str, os.PathLike],
        cache_dir: Optional[Union[str, os.PathLike]] = None,
        force_download: bool = False,
@ -278,7 +281,7 @@ class FeatureExtractionMixin(PushToHubMixin):
        token: Optional[Union[str, bool]] = None,
        revision: str = "main",
        **kwargs,
-    ):
+    ) -> SpecificFeatureExtractorType:
        r"""
        Instantiate a type of [`~feature_extraction_utils.FeatureExtractionMixin`] from a feature extractor, *e.g.* a
        derived class of [`SequenceFeatureExtractor`].
--- a/src/transformers/generation/continuous_batching.py
+++ b/src/transformers/generation/continuous_batching.py
@ -126,6 +126,11 @@ class RequestState:
        is_eos = token_id == self.eos_token_id and self.eos_token_id != -1
        is_max_len = self.generated_len() >= self.max_new_tokens

+        # Only add the token if we're not finishing due to max length
+        # (EOS tokens should still be added to the output)
+        if not (is_max_len and not is_eos):
+            self.static_outputs.extend([token_id])
+
        if is_eos or is_max_len:
            self.status = RequestStatus.FINISHED
            return True
@ -157,6 +162,7 @@ class PagedAttentionCache(Cache):
        dtype: torch.dtype = torch.float16,
        layer_device_map: Optional[dict[int, Union[str, torch.device, int]]] = None,
        initial_prompt_shapes: Optional[list[list[int]]] = None,
+        tp_size: Optional[int] = None,
    ) -> None:
        """Initialize a paged attention cache for efficient memory usage.

@ -191,7 +197,16 @@ class PagedAttentionCache(Cache):

        self.block_size = block_size
        self.num_blocks = num_blocks
-        self.cache_shape = (self.num_key_value_heads, num_blocks, self.block_size, self.head_dim)
+        num_key_value_heads = self.num_key_value_heads
+        if tp_size is not None and tp_size > 1:
+            if num_key_value_heads % tp_size != 0:
+                raise ValueError(
+                    f"Number of key value heads {num_key_value_heads} must be divisible by tensor parallel size {tp_size}."
+                )
+            # If the model is using tensor parallelism, we need to adjust the number of heads accordingly.
+            num_key_value_heads //= tp_size
+
+        self.cache_shape = (num_key_value_heads, num_blocks, self.block_size, self.head_dim)

        self.dtype = dtype
        self.device = device
@ -635,7 +650,7 @@ def compute_optimal_blocks(
    memory_per_token = 2 * num_kv_heads * head_dim * dtype_size * num_hidden_layers  # For K and V caches

    # Estimate sequence length requirements
-    tokens_to_generate = getattr(generation_config, "max_new_tokens", 20)
+    tokens_to_generate = getattr(generation_config, "max_new_tokens") or 20

    if median_prefill_length is None and inputs:
        non_empty_inputs = [len(seq) for seq in inputs if seq]
@ -1019,7 +1034,6 @@ class ContinuousBatchProcessor:
                self.metrics.record_ttft_metric(state.created_time, state.request_id)
                state.status = RequestStatus.DECODING
                token = out_tokens[self.logits_indices[i]]
-                state.static_outputs.extend([token])
                state.prompt_ids = [token]
                if state.update_with_token(token):
                    self.metrics.record_request_completion(state.created_time, state.request_id)
@ -1277,6 +1291,7 @@ class ContinuousBatchingManager:
                self.generation_config,
                self.model.device,
                self.model.dtype,
+                tp_size=getattr(self.model, "tp_size"),
            )

            scheduler = None
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@ -656,6 +656,7 @@ class GenerationMixin(ContinuousMixin):
            # If it's not defined, it means the model uses the new general mask API
            if causal_mask_creation_function is None:  # can't be found
                token_type_ids = getattr(model_input, "token_type_ids", None)
+                position_ids = getattr(model_input, position_ids_key, None)
                # Some models may overwrite the general one
                causal_mask_creation_function = getattr(self, "create_masks_for_generate", create_masks_for_generate)
                attention_mask = causal_mask_creation_function(
@ -665,6 +666,7 @@ class GenerationMixin(ContinuousMixin):
                    attention_mask=attention_mask,
                    cache_position=cache_position,
                    past_key_values=past_key_values,
+                    position_ids=position_ids,
                    token_type_ids=token_type_ids,
                )
            else:
@ -733,7 +735,9 @@ class GenerationMixin(ContinuousMixin):
        # - encoder-decoder models should complain if the user attempts to pass `inputs_embeds` and `input_ids`, and
        # pull the former to inputs. It will be used in place of `input_ids` to get the encoder hidden states.
        if input_name == "input_ids" and "inputs_embeds" in model_kwargs:
-            if not self.config.is_encoder_decoder:
+            if model_kwargs["inputs_embeds"] is None:
+                model_kwargs.pop("inputs_embeds")
+            elif not self.config.is_encoder_decoder:
                has_inputs_embeds_forwarding = "inputs_embeds" in set(
                    inspect.signature(self.prepare_inputs_for_generation).parameters.keys()
                )
@ -748,6 +752,7 @@ class GenerationMixin(ContinuousMixin):
                model_kwargs["input_ids"] = self._maybe_initialize_input_ids_for_generation(
                    inputs, bos_token_id, model_kwargs=model_kwargs
                )
+                inputs, input_name = model_kwargs["inputs_embeds"], "inputs_embeds"
            else:
                if inputs is not None:
                    raise ValueError("You passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.")
@ -1773,6 +1778,10 @@ class GenerationMixin(ContinuousMixin):
                    ):
                        modified_values[key] = model_gen_config_value
                        setattr(generation_config, key, model_gen_config_value)
+                # edge case: we may set `temperature=0.0` and `do_sample=False`, but the model defaults to
+                # `do_sample=True`
+                if generation_config.temperature == 0.0:
+                    generation_config.do_sample = False
                if use_model_defaults is None and len(modified_values) > 0:
                    logger.warning_once(
                        f"`generation_config` default values have been modified to match model-specific defaults: "
@ -1954,6 +1963,9 @@ class GenerationMixin(ContinuousMixin):
                "device": device,
                "layer_device_map": layer_device_map,
            }
+            if cache_implementation in ["static", "hybrid", "offloaded_static"]:
+                cache_kwargs.update({"tp_size": self.tp_size})
+
            self._cache = cache_cls(**cache_kwargs)
            if requires_cross_attention_cache:
                encoder_kwargs = cache_kwargs.copy()
@ -1977,6 +1989,7 @@ class GenerationMixin(ContinuousMixin):
            and "zamba" not in self.__class__.__name__.lower()
            and "bamba" not in self.__class__.__name__.lower()
            and "minimax" not in self.__class__.__name__.lower()
+            and "lfm2" not in self.__class__.__name__.lower()
        )

    def _prepare_cache_for_generation(
@ -3755,11 +3768,11 @@ class GenerationMixin(ContinuousMixin):
        return gathered_tensor

    @staticmethod
-    def _beam_search_has_unfinished_sequences(
+    def _check_early_stop_heuristic(
+        is_early_stop_heuristic_unsatisfied: torch.Tensor,
        running_beam_scores: torch.Tensor,
        beam_scores: torch.Tensor,
        is_sent_finished: torch.Tensor,
-        next_token_hits_stopping_criteria: torch.Tensor,
        cur_len: int,
        max_length: int,
        decoder_prompt_len: int,
@ -3767,22 +3780,52 @@ class GenerationMixin(ContinuousMixin):
        length_penalty: float,
    ):
        """
-        Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
+        Determine whether early stopping is possible by checking if the best possible score of running beams
+        could still improve upon the finished ones.
+
+        Mechanism:
+        - Without a length penalty, beam scores typically decrease as more tokens are generated.
+        So, if the *best possible* score from any running beam is already worse than the *worst* finished beam,
+        we can safely stop early.
+        - With a length penalty, scores may increase with longer sequences. In this case, we use heuristics
+        to estimate the best possible score — though this estimate may not always be correct — and stop
+        if no further improvement seems likely.
+
+        We apply different heuristics depending on the value of `early_stopping`:
+        1. `early_stopping == False`:
+        -> Use a heuristic that assumes the best score comes from the current length minus the decoder prompt length.
+        -> See detailed discussion: https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
+
+        2. `early_stopping == "never"`:
+        -> Estimate the best score using either `max_length` or `cur_len`, depending on the sign of `length_penalty`.
+        -> A positive length penalty favors longer sequences, so we use `max_length` in that case.
+
+        NOTE: the canonical beam search implementation can be replicated with `early_stopping="never"` and
+        `length_penalty=0.0`, which are NOT the default flags. The default behavior was empirically found to produce
+        better sequences (prior to 2022), and changing it is BC breaking.
        """
-        # a. Can the open beams improve the top completed scores?
-        # early_stopping == False -> apply heuristic = always get the best score from
-        #   `cur_len - decoder_prompt_len`. See the discussion below for more details.
-        #   https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
-        # early_stopping == "never" -> compute the best score from `max_length` or `cur_len`, depending on the
-        #   sign of `length_penalty`. Positive `length_penalty` favors longer sequences, thus we use
-        #   `max_length` there.
        if early_stopping == "never" and length_penalty > 0.0:
            best_hypothetical_length = max_length - decoder_prompt_len
        else:
            best_hypothetical_length = cur_len - decoder_prompt_len
        best_possible_running_score = running_beam_scores[:, :1] / (best_hypothetical_length**length_penalty)
        worst_finished_score = torch.where(is_sent_finished, torch.min(beam_scores, dim=1, keepdim=True)[0], -1.0e9)
-        improvement_possible = torch.any(best_possible_running_score > worst_finished_score)
+        return is_early_stop_heuristic_unsatisfied & torch.any(
+            best_possible_running_score > worst_finished_score, dim=-1, keepdim=True
+        )
+
+    @staticmethod
+    def _beam_search_has_unfinished_sequences(
+        is_early_stop_heuristic_unsatisfied: torch.Tensor,
+        is_sent_finished: torch.Tensor,
+        next_token_hits_stopping_criteria: torch.Tensor,
+        early_stopping: Union[bool, str],
+    ):
+        """
+        Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
+        """
+        # a. Can the open beams improve the top completed scores?
+        improvement_possible = torch.any(is_early_stop_heuristic_unsatisfied)

        # b. Is there still a beam without fully completed sequences? This is only relevant if early_stopping is
        # enabled, where we want to finish as soon as all beams have a completed sequence.
@ -3878,6 +3921,7 @@ class GenerationMixin(ContinuousMixin):
        topk_log_probs: torch.Tensor,
        beam_indices: torch.Tensor,
        topk_running_beam_indices: torch.Tensor,
+        is_early_stop_heuristic_unsatisfied: torch.Tensor,
        is_sent_finished: torch.Tensor,
        next_token_hits_stopping_criteria: torch.Tensor,
        top_num_beam_mask: torch.Tensor,
@ -3902,6 +3946,9 @@ class GenerationMixin(ContinuousMixin):
        # - make sure no scores can be added anymore if beam is full and early stopping is on
        beams_in_batch_are_full = torch.all(is_sent_finished, axis=-1, keepdims=True) & (early_stopping is True)
        topk_log_probs += beams_in_batch_are_full.to(torch.float32) * -1.0e9
+        # - make sure no scores can be added anymore if improvement is not possible
+        topk_log_probs += (~is_early_stop_heuristic_unsatisfied).to(torch.float32) * -1.0e9
+
        # - make sure still running sequences cannot be chosen as finalized beam
        topk_log_probs += (~did_top_num_beams_just_finished) * -1.0e9

@ -4053,6 +4100,9 @@ class GenerationMixin(ContinuousMixin):
        # per batch, beam-item state bit indicating if sentence has finished.
        is_sent_finished = torch.zeros((batch_size, num_beams), dtype=torch.bool, device=input_ids.device)

+        # per batch state bit indicating if there is a possibility to improve the best finished sentence.
+        is_early_stop_heuristic_unsatisfied = torch.ones((batch_size, 1), dtype=torch.bool, device=input_ids.device)
+
        # per batch, beam-item state bit indicating if there are valid continuations.
        next_token_hits_stopping_criteria = torch.zeros(
            (batch_size, num_beams), dtype=torch.bool, device=input_ids.device
@ -4165,6 +4215,7 @@ class GenerationMixin(ContinuousMixin):
                topk_log_probs=topk_log_probs,
                beam_indices=beam_indices,
                topk_running_beam_indices=topk_running_beam_indices,
+                is_early_stop_heuristic_unsatisfied=is_early_stop_heuristic_unsatisfied,
                is_sent_finished=is_sent_finished,
                next_token_hits_stopping_criteria=next_token_hits_stopping_criteria,
                top_num_beam_mask=top_num_beam_mask,
@ -4186,16 +4237,22 @@ class GenerationMixin(ContinuousMixin):
                )

            cur_len = cur_len + 1
+            is_early_stop_heuristic_unsatisfied = self._check_early_stop_heuristic(
+                is_early_stop_heuristic_unsatisfied=is_early_stop_heuristic_unsatisfied,
+                running_beam_scores=running_beam_scores,
+                beam_scores=beam_scores,
+                is_sent_finished=is_sent_finished,
+                cur_len=cur_len,
+                max_length=max_length,
+                decoder_prompt_len=decoder_prompt_len,
+                early_stopping=early_stopping,
+                length_penalty=length_penalty,
+            )
            this_peer_finished = not self._beam_search_has_unfinished_sequences(
-                running_beam_scores,
-                beam_scores,
+                is_early_stop_heuristic_unsatisfied,
                is_sent_finished,
                next_token_hits_stopping_criteria,
-                cur_len,
-                max_length,
-                decoder_prompt_len,
                early_stopping,
-                length_penalty,
            )

        # 5. prepare outputs
--- a/src/transformers/image_processing_utils_fast.py
+++ b/src/transformers/image_processing_utils_fast.py
@ -13,6 +13,7 @@
 # limitations under the License.

 from collections.abc import Iterable
+from copy import deepcopy
 from functools import lru_cache, partial
 from typing import Any, Optional, TypedDict, Union

@ -229,7 +230,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
            if kwarg is not None:
                setattr(self, key, kwarg)
            else:
-                setattr(self, key, getattr(self, key, None))
+                setattr(self, key, deepcopy(getattr(self, key, None)))

        # get valid kwargs names
        self._valid_kwargs_names = list(self.valid_kwargs.__annotations__.keys())
--- a/src/transformers/integrations/executorch.py
+++ b/src/transformers/integrations/executorch.py
@ -15,7 +15,7 @@ from typing import Callable, Optional

 import torch

-from ..cache_utils import DynamicCache, HybridCache, StaticCache
+from ..cache_utils import DynamicCache, EncoderDecoderCache, HybridCache, StaticCache
 from ..generation.configuration_utils import GenerationConfig
 from ..masking_utils import (
    ALL_MASK_ATTENTION_FUNCTIONS,
@ -548,7 +548,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
        self.lm_head = model.lm_head
        self.config = model.config

-        # Initialize static cache
+        # Initialize static cache for decoder and DynamicCache for encoder
        self.static_cache = StaticCache(
            config=self.config,
            max_batch_size=batch_size,
@ -556,6 +556,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
            device="cpu",
            dtype=torch.float32,
        )
+        self.cache = EncoderDecoderCache(self.static_cache, DynamicCache())

        # Register cache buffers to make them exportable
        for i in range(len(self.static_cache.key_cache)):
@ -567,7 +568,7 @@ class Seq2SeqLMDecoderExportableModuleWithStaticCache(torch.nn.Module):
        outputs = self.decoder(
            input_ids=decoder_input_ids,
            encoder_hidden_states=encoder_hidden_states,
-            past_key_values=self.static_cache,
+            past_key_values=self.cache,
            use_cache=True,
            cache_position=cache_position,
        )
--- a/Show More
+++ b/Show More