Mirror of https://github.com/huggingface/transformers.git, synced 2025-11-03 19:34:35 +08:00

Compare commits: 1 commit, `change_to_...parallel`, SHA1 `8e3fbca4cd`
@ -31,14 +31,6 @@ jobs:
parallelism: 1
steps:
- checkout
- run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
- run: cat github.txt
- run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
- run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
- run: cat elapsed.txt
- run: if [ "$(cat elapsed.txt)" -lt "30" ]; then echo "PR is just opened, wait some actions from GitHub"; sleep 30; fi
- run: 'if grep -q "\"draft\": true," github.txt; then echo "draft mode, skip test!"; circleci-agent step halt; fi'
- run: uv pip install -U -e .
- run: echo 'export "GIT_COMMIT_MESSAGE=$(git show -s --format=%s)"' >> "$BASH_ENV" && source "$BASH_ENV"
- run: mkdir -p test_preparation
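For readability, the inline `python3 -c` step above can be expanded as follows; this is an equivalent sketch of the same computation (reading `github.txt` and diffing `created_at` against `updated_at`), not a change to the CI config:

```python
# Equivalent, expanded form of the inline `python3 -c` elapsed-time step above (illustrative only).
import json
from datetime import datetime

with open("github.txt") as fp:
    data = json.load(fp)

fmt = "%Y-%m-%dT%H:%M:%SZ"
created = datetime.strptime(data["created_at"], fmt)
updated = datetime.strptime(data["updated_at"], fmt)

# Seconds between PR creation and its last update; the CI step treats a small
# value (< 30 s) as "the PR was just opened" and sleeps to let GitHub settle.
print(int((updated - created).total_seconds()))
```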
@ -168,7 +160,7 @@ jobs:
environment:
TRANSFORMERS_IS_CI: yes
PYTEST_TIMEOUT: 120
parallelism: 1
parallelism: 4
steps:
- checkout
- run: uv pip install -e ".[quality]"
@ -177,19 +169,19 @@ jobs:
command: pip freeze | tee installed.txt
- store_artifacts:
path: ~/transformers/installed.txt
- run: python utils/check_copies.py
- run: python utils/check_modular_conversion.py --num_workers 4
- run: python utils/check_table.py
- run: python utils/check_dummies.py
- run: python utils/check_repo.py
- run: python utils/check_inits.py
- run: python utils/check_config_docstrings.py
- run: python utils/check_config_attributes.py
- run: python utils/check_doctest_list.py
- run: make deps_table_check_updated
- run: python utils/update_metadata.py --check-only
- run: python utils/check_docstrings.py
- run: python utils/check_support_list.py
- run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_copies.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "1" ]; then python utils/check_modular_conversion.py --num_workers 4; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/check_table.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_dummies.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_repo.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_inits.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_config_docstrings.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "2" ]; then python utils/check_config_attributes.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_doctest_list.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then make deps_table_check_updated; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/update_metadata.py --check-only; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "3" ]; then python utils/check_docstrings.py; fi
- run: if [ "$CIRCLE_NODE_INDEX" == "0" ]; then python utils/check_support_list.py; fi

workflows:
version: 2
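The per-node dispatch above amounts to a mapping from `CIRCLE_NODE_INDEX` (set by CircleCI to 0..parallelism-1 on each container) to a list of check commands. The following is a hypothetical, stand-alone sketch of that dispatch; the repository does not ship such a helper, and the node assignments are simply copied from the diff:

```python
# Hypothetical helper mirroring the CIRCLE_NODE_INDEX dispatch in the CircleCI config above.
import os
import subprocess

CHECKS_BY_NODE = {
    0: [
        "python utils/check_copies.py",
        "python utils/check_dummies.py",
        "python utils/check_doctest_list.py",
        "make deps_table_check_updated",
        "python utils/check_support_list.py",
    ],
    1: ["python utils/check_modular_conversion.py --num_workers 4"],
    2: [
        "python utils/check_repo.py",
        "python utils/check_inits.py",
        "python utils/check_config_docstrings.py",
        "python utils/check_config_attributes.py",
    ],
    3: [
        "python utils/check_table.py",
        "python utils/update_metadata.py --check-only",
        "python utils/check_docstrings.py",
    ],
}

node = int(os.environ.get("CIRCLE_NODE_INDEX", "0"))
for command in CHECKS_BY_NODE.get(node, []):
    subprocess.run(command, shell=True, check=True)
```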
@ -28,6 +28,8 @@ COMMON_ENV_VARIABLES = {
"TRANSFORMERS_IS_CI": True,
"PYTEST_TIMEOUT": 120,
"RUN_PIPELINE_TESTS": False,
"RUN_PT_TF_CROSS_TESTS": False,
"RUN_PT_FLAX_CROSS_TESTS": False,
}
# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None}
@ -175,6 +177,23 @@ class CircleCIJob:


# JOBS
torch_and_tf_job = CircleCIJob(
"torch_and_tf",
docker_image=[{"image":"huggingface/transformers-torch-tf-light"}],
additional_env={"RUN_PT_TF_CROSS_TESTS": True},
marker="is_pt_tf_cross_test",
pytest_options={"rA": None, "durations": 0},
)

torch_and_flax_job = CircleCIJob(
"torch_and_flax",
additional_env={"RUN_PT_FLAX_CROSS_TESTS": True},
docker_image=[{"image":"huggingface/transformers-torch-jax-light"}],
marker="is_pt_flax_cross_test",
pytest_options={"rA": None, "durations": 0},
)

torch_job = CircleCIJob(
"torch",
docker_image=[{"image": "huggingface/transformers-torch-light"}],
@ -334,7 +353,7 @@ doc_test_job = CircleCIJob(
pytest_num_workers=1,
)

REGULAR_TESTS = [torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
REGULAR_TESTS = [torch_and_tf_job, torch_and_flax_job, torch_job, tf_job, flax_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
EXAMPLES_TESTS = [examples_torch_job, examples_tensorflow_job]
PIPELINE_TESTS = [pipelines_torch_job, pipelines_tf_job]
REPO_UTIL_TESTS = [repo_utils_job]
3 .github/workflows/benchmark.yml (vendored)
@ -64,7 +64,7 @@ jobs:
commit_id=$GITHUB_SHA
fi
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmark/benchmarks_entrypoint.py "$BRANCH_NAME" "$commit_id" "$commit_msg"
python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
env:
HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
# Enable this to see debug logs
@ -73,4 +73,3 @@ jobs:
PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
PGUSER: transformers_benchmarks
PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
6 .github/workflows/build-ci-docker-images.yml (vendored)
@ -26,7 +26,7 @@ jobs:
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "jax-light", "examples-torch", "examples-tf"]
|
||||
file: ["quality", "consistency", "custom-tokenizers", "torch-light", "tf-light", "exotic-models", "torch-tf-light", "torch-jax-light", "jax-light", "examples-torch", "examples-tf"]
|
||||
continue-on-error: true
|
||||
|
||||
steps:
|
||||
@ -34,11 +34,11 @@ jobs:
|
||||
name: Set tag
|
||||
run: |
|
||||
if ${{contains(github.event.head_commit.message, '[build-ci-image]')}}; then
|
||||
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
|
||||
echo "TAG=huggingface/transformers-${{ matrix.file }}:dev" >> "$GITHUB_ENV"
|
||||
echo "setting it to DEV!"
|
||||
else
|
||||
echo "TAG=huggingface/transformers-${{ matrix.file }}" >> "$GITHUB_ENV"
|
||||
|
||||
|
||||
fi
|
||||
-
|
||||
name: Set up Docker Buildx
|
||||
|
||||
22 .github/workflows/change_pr_to_draft.yml (vendored)
@ -1,22 +0,0 @@
|
||||
name: Change PR to draft
|
||||
|
||||
on:
|
||||
pull_request_target:
|
||||
types: [opened, reopened]
|
||||
|
||||
jobs:
|
||||
convert_pr_to_draft:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Convert PR to draft
|
||||
if: github.event.pull_request.draft == false
|
||||
steps:
|
||||
- name: Convert PR to draft
|
||||
shell: bash
|
||||
env:
|
||||
PR_NUMBER: ${{ github.event.number }}
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
REPO: ${{ github.repository }}
|
||||
run: |
|
||||
echo $PR_NUMBER
|
||||
gh pr ready $PR_NUMBER --repo $REPO --undo
|
||||
gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page)."
|
||||
@ -22,6 +22,7 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
|
||||
|
||||
1 .github/workflows/model_jobs.yml (vendored)
@ -30,6 +30,7 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
|
||||
1 .github/workflows/model_jobs_amd.yml (vendored)
@ -30,6 +30,7 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
|
||||
@ -1,68 +0,0 @@
|
||||
# Used to notify core maintainers about new model PR being merged
|
||||
name: New model PR merged notification
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- 'src/transformers/models/*/modeling_*'
|
||||
|
||||
jobs:
|
||||
notify_new_model:
|
||||
name: Notify new model
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
- name: Check new model
|
||||
shell: bash
|
||||
run: |
|
||||
python -m pip install gitpython
|
||||
python -c 'from utils.pr_slow_ci_models import get_new_model; new_model = get_new_model(diff_with_last_commit=True); print(new_model)' | tee output.txt
|
||||
echo "NEW_MODEL=$(tail -n 1 output.txt)" >> $GITHUB_ENV
|
||||
echo "COMMIT_SHA=$(git log -1 --format=%H)" >> $GITHUB_ENV
|
||||
|
||||
- name: print commit sha
|
||||
if: ${{ env.NEW_MODEL != ''}}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "$COMMIT_SHA"
|
||||
|
||||
- name: print new model
|
||||
if: ${{ env.NEW_MODEL != ''}}
|
||||
shell: bash
|
||||
run: |
|
||||
echo "$NEW_MODEL"
|
||||
|
||||
- name: Notify
|
||||
if: ${{ env.NEW_MODEL != ''}}
|
||||
uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
|
||||
with:
|
||||
# Slack channel id, channel name, or user id to post message.
|
||||
# See also: https://api.slack.com/methods/chat.postMessage#channels
|
||||
channel-id: transformers-new-model-notification
|
||||
# For posting a rich message using Block Kit
|
||||
payload: |
|
||||
{
|
||||
"blocks": [
|
||||
{
|
||||
"type": "header",
|
||||
"text": {
|
||||
"type": "plain_text",
|
||||
"text": "New model!",
|
||||
"emoji": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "section",
|
||||
"text": {
|
||||
"type": "mrkdwn",
|
||||
"text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
env:
|
||||
SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
43 .github/workflows/push-important-models.yml (vendored)
@ -7,13 +7,14 @@ on:
|
||||
env:
|
||||
OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA"
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
get_modified_models:
|
||||
@ -24,13 +25,13 @@ jobs:
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Get changed files
|
||||
id: changed-files
|
||||
uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42
|
||||
with:
|
||||
files: src/transformers/models/**
|
||||
|
||||
|
||||
- name: Run step if only the files listed above change
|
||||
if: steps.changed-files.outputs.any_changed == 'true'
|
||||
id: set-matrix
|
||||
@ -59,41 +60,41 @@ jobs:
|
||||
if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
matrix:
|
||||
model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }}
|
||||
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
|
||||
- name: Install locally transformers & other libs
|
||||
run: |
|
||||
apt install sudo
|
||||
sudo -H pip install --upgrade pip
|
||||
sudo -H pip uninstall -y transformers
|
||||
sudo -H pip install -U -e ".[testing]"
|
||||
sudo -H pip uninstall -y transformers
|
||||
sudo -H pip install -U -e ".[testing]"
|
||||
MAX_JOBS=4 pip install flash-attn --no-build-isolation
|
||||
pip install bitsandbytes
|
||||
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
run: pip freeze
|
||||
|
||||
|
||||
- name: Run FA2 tests
|
||||
id: run_fa2_tests
|
||||
run:
|
||||
pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_*
|
||||
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ matrix.model-name }}_fa2_tests
|
||||
path: /transformers/reports/${{ matrix.model-name }}_fa2_tests
|
||||
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
@ -102,13 +103,13 @@ jobs:
|
||||
title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }}
|
||||
status: ${{ steps.run_fa2_tests.conclusion}}
|
||||
slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }}
|
||||
|
||||
|
||||
- name: Run integration tests
|
||||
id: run_integration_tests
|
||||
if: always()
|
||||
run:
|
||||
pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_*
|
||||
|
||||
|
||||
- name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
@ -118,7 +119,7 @@ jobs:
|
||||
|
||||
- name: Post to Slack
|
||||
if: always()
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
uses: huggingface/hf-workflows/.github/actions/post-slack@main
|
||||
with:
|
||||
slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }}
|
||||
title: 🤗 Results of the Integration tests - ${{ matrix.model-name }}
|
||||
|
||||
1 .github/workflows/self-comment-ci.yml (vendored)
@ -22,6 +22,7 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
|
||||
1 .github/workflows/self-push-amd.yml (vendored)
@ -14,6 +14,7 @@ env:
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
|
||||
jobs:
|
||||
|
||||
9 .github/workflows/self-push.yml (vendored)
@ -24,6 +24,7 @@ env:
|
||||
MKL_NUM_THREADS: 8
|
||||
PYTEST_TIMEOUT: 60
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
@ -292,7 +293,7 @@ jobs:
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
@ -405,7 +406,7 @@ jobs:
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
@ -515,7 +516,7 @@ jobs:
|
||||
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
|
||||
- name: Update clone using environment variables
|
||||
working-directory: /workspace/transformers
|
||||
run: |
|
||||
@ -647,6 +648,6 @@ jobs:
|
||||
# `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
|
||||
run: |
|
||||
pip install huggingface_hub
|
||||
pip install slack_sdk
|
||||
pip install slack_sdk
|
||||
pip show slack_sdk
|
||||
python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
|
||||
|
||||
@ -15,7 +15,7 @@ jobs:
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
|
||||
with:
|
||||
job: run_models_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
slack_report_channel: "#transformers-ci-daily-amd"
|
||||
runner: mi250
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi250
|
||||
@ -26,7 +26,7 @@ jobs:
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
|
||||
with:
|
||||
job: run_pipelines_torch_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
slack_report_channel: "#transformers-ci-daily-amd"
|
||||
runner: mi250
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi250
|
||||
@ -37,7 +37,7 @@ jobs:
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
|
||||
with:
|
||||
job: run_examples_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
slack_report_channel: "#transformers-ci-daily-amd"
|
||||
runner: mi250
|
||||
docker: huggingface/transformers-pytorch-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi250
|
||||
@ -48,7 +48,7 @@ jobs:
|
||||
uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main
|
||||
with:
|
||||
job: run_torch_cuda_extensions_gpu
|
||||
slack_report_channel: "#amd-hf-ci"
|
||||
slack_report_channel: "#transformers-ci-daily-amd"
|
||||
runner: mi250
|
||||
docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
|
||||
ci_event: Scheduled CI (AMD) - mi250
|
||||
|
||||
3 .github/workflows/self-scheduled.yml (vendored)
@ -40,6 +40,7 @@ env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
NUM_SLICES: 2
|
||||
|
||||
@ -570,4 +571,4 @@ jobs:
|
||||
with:
|
||||
docker: ${{ inputs.docker }}
|
||||
start_sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
secrets: inherit
|
||||
19 .github/workflows/ssh-runner.yml (vendored)
@ -5,7 +5,7 @@ on:
|
||||
inputs:
|
||||
runner_type:
|
||||
description: 'Type of runner to test (a10 or t4)'
|
||||
required: true
|
||||
required: true
|
||||
docker_image:
|
||||
description: 'Name of the Docker image'
|
||||
required: true
|
||||
@ -15,14 +15,15 @@ on:
|
||||
|
||||
env:
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`.
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
|
||||
jobs:
|
||||
get_runner:
|
||||
@ -77,7 +78,7 @@ jobs:
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
@ -343,6 +343,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

Like the slow tests, there are other environment variables available which are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.

More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
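For instance, the cross-framework flags can be enabled for a single test run; the sketch below sets them programmatically and then invokes pytest with the corresponding marker (the test path is only an example):

```python
# Sketch: enable the cross-framework flags, then run only the marked tests.
# Equivalent to: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -m is_pt_tf_cross_test <path>
import os
import pytest

os.environ["RUN_PT_TF_CROSS_TESTS"] = "1"
os.environ["RUN_PT_FLAX_CROSS_TESTS"] = "1"

# Example path; any test file or directory works here.
pytest.main(["-m", "is_pt_tf_cross_test", "tests/models/bert"])
```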
@ -61,6 +61,7 @@ NOT_DEVICE_TESTS = {
|
||||
"test_load_save_without_tied_weights",
|
||||
"test_tied_weights_keys",
|
||||
"test_model_weights_reload_no_missing_tied_weights",
|
||||
"test_pt_tf_model_equivalence",
|
||||
"test_mismatched_shapes_have_properly_initialized_weights",
|
||||
"test_matched_shapes_have_loaded_weights_when_some_mismatched_shapes_exist",
|
||||
"test_model_is_small",
|
||||
@ -84,6 +85,12 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
|
||||
|
||||
|
||||
def pytest_configure(config):
|
||||
config.addinivalue_line(
|
||||
"markers", "is_pt_tf_cross_test: mark test to run only when PT and TF interactions are tested"
|
||||
)
|
||||
config.addinivalue_line(
|
||||
"markers", "is_pt_flax_cross_test: mark test to run only when PT and FLAX interactions are tested"
|
||||
)
|
||||
config.addinivalue_line("markers", "is_pipeline_test: mark test to run only when pipelines are tested")
|
||||
config.addinivalue_line("markers", "is_staging_test: mark test to run only in the staging environment")
|
||||
config.addinivalue_line("markers", "accelerate_tests: mark test that require accelerate")
|
||||
|
||||
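A test picked up by the `is_pt_tf_cross_test` marker registered above would look roughly like the sketch below (in the actual test suite the `is_pt_tf_cross_test` decorator from `transformers.testing_utils` is typically used, which also checks the `RUN_PT_TF_CROSS_TESTS` environment variable):

```python
# Minimal sketch of a test carrying the marker registered in pytest_configure.
import pytest

@pytest.mark.is_pt_tf_cross_test
def test_pt_tf_output_equivalence():
    # In a real cross test, a PyTorch model and its TensorFlow counterpart would be
    # instantiated here and their outputs compared within a small tolerance.
    assert True
```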
@ -7,6 +7,6 @@ RUN apt-get install -y cmake
|
||||
ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[tf-cpu,sklearn,testing,sentencepiece,tf-speech,vision]"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
|
||||
RUN uv pip install --no-cache-dir "protobuf==3.20.3"
|
||||
RUN pip uninstall -y transformers
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
|
||||
@ -7,5 +7,5 @@ ENV UV_PYTHON=/usr/local/bin/python
|
||||
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
|
||||
RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words]"
|
||||
RUN pip uninstall -y transformers
|
||||
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken]"
|
||||
RUN pip uninstall -y transformers
|
||||
@ -12,7 +12,7 @@ RUN git lfs install
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
|
||||
|
||||
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2.4
|
||||
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
|
||||
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
|
||||
|
||||
|
||||
@ -2,10 +2,10 @@ FROM rocm/dev-ubuntu-22.04:6.2.4
|
||||
LABEL maintainer="Hugging Face"
|
||||
|
||||
ARG DEBIAN_FRONTEND=noninteractive
|
||||
ARG PYTORCH='2.6.0'
|
||||
ARG TORCH_VISION='0.21.0'
|
||||
ARG TORCH_AUDIO='2.6.0'
|
||||
ARG ROCM='6.2.4'
|
||||
ARG PYTORCH='2.5.1'
|
||||
ARG TORCH_VISION='0.20.0'
|
||||
ARG TORCH_AUDIO='2.5.0'
|
||||
ARG ROCM='6.2'
|
||||
|
||||
RUN apt update && \
|
||||
apt install -y --no-install-recommends \
|
||||
@ -16,11 +16,9 @@ RUN apt update && \
|
||||
python-is-python3 \
|
||||
rocrand-dev \
|
||||
rocthrust-dev \
|
||||
rocblas-dev \
|
||||
hipsolver-dev \
|
||||
hipsparse-dev \
|
||||
hipblas-dev \
|
||||
hipblaslt-dev && \
|
||||
rocblas-dev && \
|
||||
apt clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
@ -76,9 +76,6 @@ RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git
|
||||
RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118
|
||||
RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1
|
||||
|
||||
# Add compressed-tensors for quantization testing
|
||||
RUN python3 -m pip install --no-cache-dir compressed-tensors
|
||||
|
||||
# When installing in editable mode, `transformers` is not recognized as a package.
|
||||
# this line must be added in order for python to be aware of transformers.
|
||||
RUN cd transformers && python3 setup.py develop
|
||||
|
||||
@ -673,29 +673,6 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Tensor Parallelism with PyTorch 2">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
tp_config:
|
||||
tp_size: 4
|
||||
distributed_type: TP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
The [`accelerate_launch`](https://huggingface.co/docs/accelerate/package_reference/cli#accelerate-launch) command is the recommended way to launch your training script on a distributed system with Accelerate and [`Trainer`], using the parameters specified in `config_file.yaml`. This file is saved to the Accelerate cache folder and is loaded automatically when you run `accelerate_launch`.
@ -283,6 +283,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t

As with the slow tests, there are other environment variables that are not set by default during testing:

* `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
* `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for the PyTorch + Flax integration.
* `RUN_PT_TF_CROSS_TESTS`: Enables tests for the TensorFlow + PyTorch integration.

Further environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
@ -461,8 +461,6 @@
|
||||
title: Granite
|
||||
- local: model_doc/granitemoe
|
||||
title: GraniteMoe
|
||||
- local: model_doc/granitemoeshared
|
||||
title: GraniteMoeShared
|
||||
- local: model_doc/granitevision
|
||||
title: GraniteVision
|
||||
- local: model_doc/helium
|
||||
@ -965,10 +963,6 @@
|
||||
title: Segment Anything
|
||||
- local: model_doc/siglip
|
||||
title: SigLIP
|
||||
- local: model_doc/siglip2
|
||||
title: SigLIP2
|
||||
- local: model_doc/smolvlm
|
||||
title: SmolVLM
|
||||
- local: model_doc/speech-encoder-decoder
|
||||
title: Speech Encoder Decoder Models
|
||||
- local: model_doc/tapas
|
||||
|
||||
@ -173,7 +173,6 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ |
|
||||
| [Granite](model_doc/granite) | ✅ | ❌ | ❌ |
|
||||
| [GraniteMoeMoe](model_doc/granitemoe) | ✅ | ❌ | ❌ |
|
||||
| [GraniteMoeSharedMoe](model_doc/granitemoeshared) | ✅ | ❌ | ❌ |
|
||||
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
|
||||
| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ |
|
||||
| [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
|
||||
@ -317,8 +316,6 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [SEW](model_doc/sew) | ✅ | ❌ | ❌ |
|
||||
| [SEW-D](model_doc/sew-d) | ✅ | ❌ | ❌ |
|
||||
| [SigLIP](model_doc/siglip) | ✅ | ❌ | ❌ |
|
||||
| [SigLIP2](model_doc/siglip2) | ✅ | ❌ | ❌ |
|
||||
| [SmolVLM](model_doc/smolvlm) | ✅ | ❌ | ❌ |
|
||||
| [Speech Encoder decoder](model_doc/speech-encoder-decoder) | ✅ | ❌ | ✅ |
|
||||
| [Speech2Text](model_doc/speech_to_text) | ✅ | ✅ | ❌ |
|
||||
| [SpeechT5](model_doc/speecht5) | ✅ | ❌ | ❌ |
|
||||
|
||||
@ -55,7 +55,7 @@ To give some examples of how much VRAM it roughly takes to load a model in bfloa
|
||||
|
||||
As of writing this document, the largest GPU chip on the market is the A100 & H100 offering 80GB of VRAM. Most of the models listed before require more than 80GB just to be loaded and therefore necessarily require [tensor parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#tensor-parallelism) and/or [pipeline parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
|
||||
|
||||
🤗 Transformers now supports tensor parallelism for supported models having `base_tp_plan` in their respecitve config classes. Learn more about Tensor Parallelism [here](perf_train_gpu_many#tensor-parallelism). Furthermore, if you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
|
||||
🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
|
||||
|
||||
Naive pipeline parallelism is supported out of the box. For this, simply load the model with `device="auto"` which will automatically place the different layers on the available GPUs as explained [here](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference).
|
||||
Note, however that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
|
||||
|
||||
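In practice, the naive pipeline parallelism described above is obtained through the `device_map="auto"` argument of `from_pretrained`; a minimal sketch, where the checkpoint name is only an example and sufficient GPU memory is assumed:

```python
# Sketch: let Accelerate spread the layers over the available GPUs (naive pipeline parallelism).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "bigscience/bloom"  # example checkpoint; any large causal LM is loaded the same way

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",  # layers are placed on the available GPUs (and CPU, if needed) automatically
)

inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=20)[0]))
```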
@ -1,66 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# GraniteMoeShared
|
||||
|
||||
## Overview
|
||||
|
||||
|
||||
The GraniteMoe model was proposed in [Power Scheduler: A Batch Size and Token Number Agnostic Learning Rate Scheduler](https://arxiv.org/abs/2408.13359) by Yikang Shen, Matthew Stallone, Mayank Mishra, Gaoyuan Zhang, Shawn Tan, Aditya Prasad, Adriana Meza Soria, David D. Cox and Rameswar Panda.
|
||||
|
||||
Additionally, the GraniteMoeSharedModel class adds shared experts to the MoE architecture.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_path = "ibm-research/moe-7b-1b-active-shared-experts"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
||||
|
||||
# drop device_map if running on CPU
|
||||
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
|
||||
model.eval()
|
||||
|
||||
# change input text as desired
|
||||
prompt = "Write a code to find the maximum value in a list of numbers."
|
||||
|
||||
# tokenize the text
|
||||
input_tokens = tokenizer(prompt, return_tensors="pt")
|
||||
# generate output tokens
|
||||
output = model.generate(**input_tokens, max_new_tokens=100)
|
||||
# decode output tokens into text
|
||||
output = tokenizer.batch_decode(output)
|
||||
# loop over the batch to print, in this example the batch size is 1
|
||||
for i in output:
|
||||
print(i)
|
||||
```
|
||||
|
||||
This HF implementation is contributed by [Mayank Mishra](https://huggingface.co/mayank-mishra), [Shawn Tan](https://huggingface.co/shawntan) and [Sukriti Sharma](https://huggingface.co/SukritiSharma).
|
||||
|
||||
|
||||
## GraniteMoeSharedConfig
|
||||
|
||||
[[autodoc]] GraniteMoeSharedConfig
|
||||
|
||||
## GraniteMoeSharedModel
|
||||
|
||||
[[autodoc]] GraniteMoeSharedModel
|
||||
- forward
|
||||
|
||||
## GraniteMoeSharedForCausalLM
|
||||
|
||||
[[autodoc]] GraniteMoeSharedForCausalLM
|
||||
- forward
|
||||
@ -59,7 +59,7 @@ chat = [
|
||||
{
|
||||
"role": "user", "content": [
|
||||
{"type": "text", "content": "Can this animal"},
|
||||
{"type": "image", "url": "https://picsum.photos/id/237/200/300"},
|
||||
{"type": "image", "ur": "https://picsum.photos/id/237/200/300"},
|
||||
{"type": "text", "content": "live here?"},
|
||||
{"type": "image", "url": "https://picsum.photos/seed/picsum/200/300"}
|
||||
]
|
||||
|
||||
@ -1,276 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# SigLIP2
|
||||
|
||||
## Overview
|
||||
|
||||
The SigLIP2 model was proposed in [SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features](https://huggingface.co/papers/2502.14786) by Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin,
|
||||
Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, Olivier Hénaff, Jeremiah Harmsen,
|
||||
Andreas Steiner and Xiaohua Zhai.
|
||||
|
||||
The model comes in two variants
|
||||
|
||||
1) FixRes - model works with fixed resolution images (backward compatible with SigLIP v1)
|
||||
2) NaFlex - model works with variable image aspect ratios and resolutions (SigLIP2 in `transformers`)
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We introduce SigLIP 2, a family of new multilingual vision-language encoders that build on the success
|
||||
of the original SigLIP. In this second iteration, we extend the original image-text training objective with
|
||||
several prior, independently developed techniques into a unified recipe—this includes decoder-based
|
||||
pretraining, self-supervised losses (self-distillation, masked prediction) and online data curation. With
|
||||
these changes, SigLIP 2 models outperform their SigLIP counterparts at all model scales in core capabilities,
|
||||
including zero-shot classification (best SigLIP 2 ViT-g/16 achieves 85.0% ImageNet zero-shot
|
||||
accuracy), image-text retrieval, and transfer performance when extracting visual representations for
|
||||
Vision-Language Models (VLMs). Furthermore, the new training recipe leads to significant improvements
|
||||
on localization and dense prediction tasks. We also train variants which support multiple resolutions
|
||||
and preserve the input’s native aspect ratio. Finally, we train on a more diverse data-mixture that
|
||||
includes de-biasing techniques, leading to much better multilingual understanding and improved fair-
|
||||
ness. To provide users with the ability to trade-off inference cost with performance, we release model
|
||||
checkpoints at four sizes (ViT-B/86M, L/303M, So400m/400M, and g/1B).*
|
||||
|
||||
## Usage tips
|
||||
|
||||
- Usage of SigLIP2 is similar to [SigLIP](siglip) and [CLIP](clip). The main difference from CLIP is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
|
||||
- Training is supported but does not use `torch.distributed` utilities, which may limit the scalability of batch size. However, DDP and FSDP work on a single-node multi-GPU setup.
|
||||
- When using the standalone [`GemmaTokenizerFast`] make sure to pass `padding="max_length"` and `max_length=64` as that's how the model was trained.
|
||||
- The model was trained with *lowercased* text, so make sure to apply the same preprocessing to your text labels.
|
||||
- To get the same results as the pipeline, a prompt template of "this is a photo of {label}" should be used.
|
||||
- The NaFlex variant supports processing images at higher resolutions by adjusting the `max_num_patches` parameter in the `Processor`. The default value is `max_num_patches=256`. Increasing `max_num_patches` to 1024 (4x) will approximately double processed image height and width, while preserving the aspect ratio.
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip2_metrics_table.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
This model was contributed by [qubvel](https://huggingface.co/qubvel-hf).
|
||||
The original code can be found [here](https://github.com/google-research/big_vision/tree/main).
|
||||
|
||||
## Usage example
|
||||
|
||||
There are 2 main ways to use SigLIP2: either using the pipeline API, which abstracts away all the complexity for you, or by using the `Siglip2Model` class yourself.
|
||||
|
||||
### FixRes variant
|
||||
|
||||
**Pipeline API**
|
||||
|
||||
The pipeline allows to use the model in a few lines of code:
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
|
||||
>>> # load pipe
|
||||
>>> image_classifier = pipeline(
|
||||
... task="zero-shot-image-classification",
|
||||
... model="google/siglip2-base-patch16-224",
|
||||
... )
|
||||
|
||||
>>> # load image
|
||||
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> # inference
|
||||
>>> candidate_labels = ["2 cats", "a plane", "a remote"]
|
||||
>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
|
||||
>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
|
||||
>>> print(outputs)
|
||||
[{'score': 0.1499, 'label': '2 cats'}, {'score': 0.0008, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
|
||||
```
|
||||
|
||||
**Using the model yourself**
|
||||
|
||||
If you want to do the pre- and postprocessing yourself, here's how to do that:
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> from transformers import AutoProcessor, AutoModel
|
||||
>>> import torch
|
||||
|
||||
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
|
||||
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> candidate_labels = ["2 cats", "2 dogs"]
|
||||
# follows the pipeline prompt template to get same results
|
||||
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
|
||||
|
||||
# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
|
||||
>>> inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> logits_per_image = outputs.logits_per_image
|
||||
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
||||
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
|
||||
15.0% that image 0 is '2 cats'
|
||||
```
|
||||
|
||||
### NaFlex variant
|
||||
|
||||
NaFlex combines ideas from FlexiViT, i.e. supporting multiple, predefined sequence lengths with a single ViT model, and NaViT, namely processing images at their native aspect ratio. This enables processing different types of images at appropriate resolution, e.g. using a larger resolution to process document images, while at the same time minimizing the impact of aspect ratio distortion on certain inference tasks, e.g. on OCR.

Given a patch size and target sequence length, NaFlex preprocesses the data by first resizing the input image such that the height and width after resizing are multiples of the patch size, while

1. keeping the aspect ratio distortion as small as possible
2. producing a sequence length of at most the desired target sequence length (`max_num_patches`)

The resulting distortion in width and height is at most `(patch_size - 1) / width` and `(patch_size - 1) / height`, respectively, which tends to be small for common resolutions and aspect ratios. After resizing, the image is split into a sequence of patches, and a mask with padding information is added.
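As a rough illustration of the arithmetic described above, the following is a simplified sketch, not the exact resizing algorithm of `Siglip2ImageProcessor`:

```python
# Simplified sketch of NaFlex-style resizing: pick a size whose sides are multiples of
# the patch size and whose patch count stays within max_num_patches.
import math

def naflex_target_size(width, height, patch_size=16, max_num_patches=256):
    # Scale the image so that roughly max_num_patches patches cover it.
    scale = math.sqrt(max_num_patches * patch_size**2 / (width * height))
    # Round each side to a multiple of the patch size (at least one patch per side).
    new_w = max(patch_size, round(width * scale / patch_size) * patch_size)
    new_h = max(patch_size, round(height * scale / patch_size) * patch_size)
    # Shrink if rounding pushed us above the patch budget.
    while (new_w // patch_size) * (new_h // patch_size) > max_num_patches:
        if new_w >= new_h:
            new_w -= patch_size
        else:
            new_h -= patch_size
    return new_w, new_h

print(naflex_target_size(640, 480))  # (288, 224) -> 18 x 14 = 252 patches <= 256
```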
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> from transformers import AutoProcessor, AutoModel
|
||||
>>> import torch
|
||||
|
||||
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex")
|
||||
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> candidate_labels = ["2 cats", "2 dogs"]
|
||||
# follows the pipeline prompt template to get same results
|
||||
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
|
||||
|
||||
# default value for `max_num_patches` is 256, but you can increase resulted image resolution providing
|
||||
# higher values e.g. `max_num_patches=512`
|
||||
>>> inputs = processor(text=texts, images=image, max_num_patches=256, return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> logits_per_image = outputs.logits_per_image
|
||||
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
||||
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
|
||||
21.1% that image 0 is '2 cats'
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP2.
|
||||
|
||||
- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification)
|
||||
- Demo notebook for SigLIP2 can be found [here](https://github.com/qubvel/transformers-notebooks/tree/master/notebooks/SigLIP2_inference.ipynb). 🌎
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
|
||||
## Combining SigLIP2 and Flash Attention 2
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2.
|
||||
|
||||
```bash
|
||||
pip install -U flash-attn --no-build-isolation
|
||||
```
|
||||
|
||||
Also make sure that your hardware is compatible with Flash Attention 2; read more about it in the official documentation of the flash-attn repository. Make sure as well to load your model in half-precision (e.g. `torch.float16`).
|
||||
|
||||
To load and run a model using Flash Attention 2, refer to the snippet below:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> import requests
|
||||
>>> from PIL import Image
|
||||
>>> from transformers import AutoProcessor, AutoModel
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModel.from_pretrained(
|
||||
... "google/siglip2-so400m-patch14-384",
|
||||
... attn_implementation="flash_attention_2",
|
||||
... torch_dtype=torch.float16,
|
||||
... device_map=device,
|
||||
... )
|
||||
>>> processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")
|
||||
|
||||
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> candidate_labels = ["2 cats", "2 dogs"]
|
||||
# follows the pipeline prompt template to get same results
|
||||
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
|
||||
# important: we pass `padding=max_length` since the model was trained with this
|
||||
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... with torch.autocast(device):
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> logits_per_image = outputs.logits_per_image
|
||||
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
|
||||
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
|
||||
19.8% that image 0 is '2 cats'
|
||||
```
|
||||
|
||||
## Siglip2Config
|
||||
|
||||
[[autodoc]] Siglip2Config
|
||||
|
||||
## Siglip2TextConfig
|
||||
|
||||
[[autodoc]] Siglip2TextConfig
|
||||
|
||||
## Siglip2VisionConfig
|
||||
|
||||
[[autodoc]] Siglip2VisionConfig
|
||||
|
||||
## Siglip2ImageProcessor
|
||||
|
||||
[[autodoc]] Siglip2ImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Siglip2ImageProcessorFast
|
||||
|
||||
[[autodoc]] Siglip2ImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## Siglip2Processor
|
||||
|
||||
[[autodoc]] Siglip2Processor
|
||||
|
||||
## Siglip2Model
|
||||
|
||||
[[autodoc]] Siglip2Model
|
||||
- forward
|
||||
- get_text_features
|
||||
- get_image_features
|
||||
|
||||
## Siglip2TextModel
|
||||
|
||||
[[autodoc]] Siglip2TextModel
|
||||
- forward
|
||||
|
||||
## Siglip2VisionModel
|
||||
|
||||
[[autodoc]] Siglip2VisionModel
|
||||
- forward
|
||||
|
||||
## Siglip2ForImageClassification
|
||||
|
||||
[[autodoc]] Siglip2ForImageClassification
|
||||
- forward
|
||||
@ -1,197 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# SmolVLM
|
||||
|
||||
## Overview
|
||||
SmolVLM2 is an adaptation of the Idefics3 model with two main differences:
|
||||
|
||||
- It uses SmolLM2 for the text model.
|
||||
- It supports multi-image and video inputs
|
||||
|
||||
## Usage tips
|
||||
|
||||
Input images are processed either by upsampling (if resizing is enabled) or at their original resolution. The resizing behavior depends on two parameters: do_resize and size.
|
||||
|
||||
Videos should not be upsampled.
|
||||
|
||||
If `do_resize` is set to `True`, the model resizes images so that the longest edge is 4*512 pixels by default.
|
||||
The default resizing behavior can be customized by passing a dictionary to the `size` parameter. For example, `{"longest_edge": 4 * 512}` is the default, but you can change it to a different value if needed.
|
||||
|
||||
Here’s how to control resizing and set a custom size:
|
||||
```python
|
||||
image_processor = SmolVLMImageProcessor(do_resize=True, size={"longest_edge": 2 * 512}, max_image_size=512)
|
||||
```
|
||||
|
||||
Additionally, the `max_image_size` parameter, which controls the size of each square patch the image is decomposed into, is set to 512 by default but can be adjusted as needed. After resizing (if applicable), the image processor decomposes the images into square patches based on the `max_image_size` parameter.
|
||||
|
||||
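To make the interaction between `size` and `max_image_size` concrete, here is a small back-of-the-envelope sketch; the numbers assume the settings from the snippet above (longest edge resized to 2 * 512, `max_image_size=512`) and an example input resolution:

```python
# Rough patch-count arithmetic for the image processor settings shown above.
import math

longest_edge = 2 * 512   # from size={"longest_edge": 2 * 512}
max_image_size = 512     # side length of each square patch

# Example input: a 2000 x 1000 image is first resized so its longest edge is 1024,
# preserving the aspect ratio -> roughly 1024 x 512.
width, height = 1024, 512

cols = math.ceil(width / max_image_size)   # 2
rows = math.ceil(height / max_image_size)  # 1
print(rows * cols, "square patches of side", max_image_size)
```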
This model was contributed by [orrzohar](https://huggingface.co/orrzohar).
|
||||
|
||||
|
||||
|
||||
## Usage example
|
||||
|
||||
### Single Media inference
|
||||
|
||||
The model can accept both images and videos as input, but you should use only one of the modalities at a time. Here's an example code for that.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
"HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content":[
|
||||
{"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
|
||||
{"type": "text", "text": "Describe this image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
conversation,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device, dtype=torch.bfloat16)
|
||||
|
||||
output_ids = model.generate(**inputs, max_new_tokens=128)
|
||||
generated_texts = processor.batch_decode(output_ids, skip_special_tokens=True)
|
||||
print(generated_texts)
|
||||
|
||||
|
||||
# Video
|
||||
conversation = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "video", "path": "/path/to/video.mp4"},
|
||||
{"type": "text", "text": "Describe this video in detail"}
|
||||
]
|
||||
},
|
||||
]
|
||||
|
||||
inputs = processor.apply_chat_template(
|
||||
conversation,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device, dtype=torch.bfloat16)
|
||||
|
||||
generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=100)
|
||||
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
print(generated_texts[0])
|
||||
```
|
||||
|
||||
### Batch Mixed Media Inference
|
||||
|
||||
The model can batch inputs composed of several images/videos and text. Here is an example.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-256M-Video-Instruct")
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
"HuggingFaceTB/SmolVLM2-256M-Video-Instruct",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
# Conversation for the first image
|
||||
conversation1 = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "path": "/path/to/image.jpg"},
|
||||
{"type": "text", "text": "Describe this image."}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
# Conversation with two images
|
||||
conversation2 = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "path": "/path/to/image.jpg"},
|
||||
{"type": "image", "path": "/path/to/image.jpg"},
|
||||
{"type": "text", "text": "What is written in the pictures?"}
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
# Conversation with pure text
|
||||
conversation3 = [
|
||||
{"role": "user","content": "who are you?"}
|
||||
]
|
||||
|
||||
|
||||
conversations = [conversation1, conversation2, conversation3]
|
||||
inputs = processor.apply_chat_template(
|
||||
conversation,
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
return_tensors="pt",
|
||||
).to(model.device, dtype=torch.bfloat16)
|
||||
|
||||
generated_ids = model.generate(**inputs, do_sample=False, max_new_tokens=100)
|
||||
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
print(generated_texts[0])
|
||||
```
|
||||
|
||||
## SmolVLMConfig
|
||||
|
||||
[[autodoc]] SmolVLMConfig
|
||||
|
||||
## SmolVLMVisionConfig
|
||||
|
||||
[[autodoc]] SmolVLMVisionConfig
|
||||
|
||||
## Idefics3VisionTransformer
|
||||
|
||||
[[autodoc]] SmolVLMVisionTransformer
|
||||
|
||||
## SmolVLMModel
|
||||
|
||||
[[autodoc]] SmolVLMModel
|
||||
- forward
|
||||
|
||||
## SmolVLMForConditionalGeneration
|
||||
|
||||
[[autodoc]] SmolVLMForConditionalGeneration
|
||||
- forward
|
||||
|
||||
|
||||
## SmolVLMImageProcessor
|
||||
[[autodoc]] SmolVLMImageProcessor
|
||||
- preprocess
|
||||
|
||||
|
||||
## SmolVLMProcessor
|
||||
[[autodoc]] SmolVLMProcessor
|
||||
- __call__
|
||||
@ -60,7 +60,6 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
|
||||
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
|
||||
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
|
||||
* [GraniteMoeShared](https://huggingface.co/docs/transformers/model_doc/granitemoeshared#transformers.GraniteMoeSharedModel)
|
||||
* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
|
||||
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
|
||||
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
|
||||
@ -95,7 +94,6 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel)
|
||||
* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
|
||||
* [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
|
||||
* [SmolVLM](https://huggingface.co/docs/transformers/model_doc/smolvlm#transformers.SmolVLMModel)
|
||||
* [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
|
||||
* [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder)
|
||||
* [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
|
||||
@ -111,7 +109,6 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
|
||||
* [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
|
||||
* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
|
||||
* [SigLIP2](https://huggingface.co/docs/transformers/model_doc/siglip2)
|
||||
* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
|
||||
* [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
|
||||
* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)
|
||||
@ -269,7 +266,6 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
|
||||
* [I-JEPA](https://huggingface.co/docs/transformers/model_doc/ijepa#transformers.IJepaModel)
|
||||
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
|
||||
* [GraniteMoeShared](https://huggingface.co/docs/transformers/model_doc/granitemoeshared#transformers.GraniteMoeSharedModel)
|
||||
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
|
||||
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
|
||||
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
|
||||
@ -303,7 +299,6 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
|
||||
* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
|
||||
* [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
|
||||
* [SmolVLM](https://huggingface.co/docs/transformers/model_doc/smolvlm#transformers.SmolVLMModel)
|
||||
* [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model)
|
||||
* [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder)
|
||||
* [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
|
||||
@ -311,7 +306,6 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel)
|
||||
* [Sew](https://huggingface.co/docs/transformers/main/en/model_doc/sew#transformers.SEWModel)
|
||||
* [SigLIP](https://huggingface.co/docs/transformers/model_doc/siglip)
|
||||
* [SigLIP2](https://huggingface.co/docs/transformers/model_doc/siglip2)
|
||||
* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
|
||||
* [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model)
|
||||
* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
|
||||
|
||||
@ -450,13 +450,12 @@ Implementations:
|
||||
- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
|
||||
- [OSLO](https://github.com/tunib-ai/oslo) has a tensor parallelism implementation based on Transformers.
|
||||
- [`transformers` integration](main_classes/trainer) - tensor parallelism is available through the `tp_size` attribute for models that define a `base_tp_plan`. See the [example usage](perf_infer_gpu_multi) page and the sketch below.
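
Below is a minimal sketch of that integration. It assumes a recent Transformers release where `from_pretrained` accepts `tp_plan="auto"` and a checkpoint whose config defines a `base_tp_plan`; the exact entry point (`tp_size` on the training side vs. `tp_plan` at load time) can differ between versions, so treat this as illustrative rather than canonical.

```python
# tp_inference.py - sketch only; assumes a recent Transformers release with
# `tp_plan="auto"` support and a checkpoint whose config defines a `base_tp_plan`.
# Launch with: torchrun --nproc-per-node 4 tp_inference.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # hypothetical choice; any model with a base_tp_plan works

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Each weight matrix listed in the model's base_tp_plan is sharded across the
# processes started by torchrun (column-wise or row-wise, as the plan specifies).
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")

inputs = tokenizer("Tensor parallelism splits each layer across GPUs.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```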
|
||||
|
||||
SageMaker combines TP with DP for more efficient processing.
|
||||
|
||||
🤗 Transformers status:
|
||||
- core: uses PyTorch 2 APIs to support tensor parallelism for models that define a `base_tp_plan` in their respective config classes.
|
||||
- Alternatively, you can also try [parallelformers](https://github.com/tunib-ai/parallelformers), which provides this support for most of our models. Training with TP is also supported natively in Transformers.
|
||||
- core: not yet implemented in the core
|
||||
- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
|
||||
- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode; see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)
|
||||
|
||||
🤗 Accelerate integrates with [TP from Megatron-LM](https://huggingface.co/docs/accelerate/v0.23.0/en/usage_guides/megatron_lm).
|
||||
@ -536,7 +535,7 @@ Important papers:
|
||||
- [Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model](
|
||||
https://arxiv.org/abs/2201.11990)
|
||||
|
||||
🤗 Transformers status: not yet implemented, since we have no PP.
|
||||
🤗 Transformers status: not yet implemented, since we have no PP and TP.
|
||||
|
||||
## FlexFlow
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@ Use the table below to help you decide which quantization method to use.
|
||||
| [HQQ](./hqq.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
|
||||
| [optimum-quanto](./quanto.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
|
||||
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
|
||||
| [torchao](./torchao.md) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
|
||||
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | 🟡 <sub>5</sub> | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
|
||||
| [VPTQ](./vptq.md) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
|
||||
| [SpQR](./spqr.md) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
|
||||
| [FINEGRAINED_FP8](./finegrained_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
|
||||
|
||||
@ -22,14 +22,6 @@ pip install --upgrade torch torchao transformers
|
||||
|
||||
By default, the weights are loaded in full precision (torch.float32) regardless of the data type the weights are actually stored in, such as torch.float16. Set `torch_dtype="auto"` to load the weights in the data type defined in a model's `config.json` file, which automatically selects the most memory-efficient data type.
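
For example, a minimal sketch using the same checkpoint as the examples below:

```py
from transformers import AutoModelForCausalLM

model_name = "meta-llama/Meta-Llama-3-8B"

# Without torch_dtype="auto" the weights would be upcast to torch.float32 on load.
# With it, they are loaded in the dtype recorded in config.json (bfloat16 for this checkpoint),
# roughly halving the memory footprint.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
print(model.dtype)
```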
|
||||
|
||||
|
||||
## Manually Choose Quantization Types and Settings
|
||||
|
||||
`torchao` provides many commonly used quantization types, covering different dtypes such as int4 and float8 and different flavors such as weight-only and dynamic quantization. Currently only `int4_weight_only`, `int8_weight_only` and `int8_dynamic_activation_int8_weight` are integrated into Hugging Face Transformers, but more can be added when needed.
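
For reference, the three currently integrated settings map to `TorchAoConfig` roughly as follows (`group_size` only applies to the int4 weight-only case):

```py
from transformers import TorchAoConfig

int4_weight_only_config = TorchAoConfig("int4_weight_only", group_size=128)
int8_weight_only_config = TorchAoConfig("int8_weight_only")
int8_dynamic_config = TorchAoConfig("int8_dynamic_activation_int8_weight")
```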
|
||||
If you want to run the following code on CPU (even when a GPU is available), change to `device_map="cpu"` and use `quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())`, where `Int4CPULayout` comes from `from torchao.dtypes import Int4CPULayout` and is only available in torchao 0.8.0 and later. A sketch of this CPU variant follows below.
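
A minimal sketch of that CPU path (assumes torchao 0.8.0 or later is installed; the GPU example further below is otherwise unchanged):

```py
import torch
from torchao.dtypes import Int4CPULayout  # requires torchao >= 0.8.0
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B"
# int4 weight-only quantization with the CPU layout
quantization_config = TorchAoConfig("int4_weight_only", group_size=128, layout=Int4CPULayout())
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="cpu", quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt")
output = quantized_model.generate(**input_ids, max_new_tokens=10)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```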
|
||||
|
||||
Users can manually specify the quantization types and settings they want to use:
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
@ -40,67 +32,31 @@ model_name = "meta-llama/Meta-Llama-3-8B"
|
||||
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to(quantized_model.device)
|
||||
|
||||
# auto-compile the quantized model with `cache_implementation="static"` to get speedup
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
||||
# benchmark the performance
|
||||
from torch._inductor.utils import do_bench_using_profiling
|
||||
from typing import Callable
|
||||
|
||||
def benchmark_fn(func: Callable, *args, **kwargs) -> float:
|
||||
"""Thin wrapper around do_bench_using_profiling"""
|
||||
no_args = lambda: func(*args, **kwargs)
|
||||
time = do_bench_using_profiling(no_args)
|
||||
return time * 1e3
|
||||
|
||||
MAX_NEW_TOKENS = 1000
|
||||
print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
||||
|
||||
bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.bfloat16)
|
||||
output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile
|
||||
print("bf16 model:", benchmark_fn(bf16_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
||||
|
||||
```
|
||||
|
||||
## Automatically Select Quantization Types
|
||||
|
||||
`torchao` also provides an `autoquant` feature that automatically chooses a quantization type for quantizable layers (such as linear layers) based on microbenchmarks of quantizing and compiling a single linear layer.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model_name = "meta-llama/Meta-Llama-3-8B"
|
||||
quantization_config = TorchAoConfig("autoquant", min_sqnr=None)
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", quantization_config=quantization_config)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
input_text = "What are we having for dinner?"
|
||||
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
# auto-compile the quantized model with `cache_implementation="static"` to get speedup
|
||||
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
|
||||
# Due to some implementation details, we explicitly call this for now; we may refactor our code and remove this in the future
|
||||
quantized_model.finalize_autoquant()
|
||||
print(tokenizer.decode(output[0], skip_special_tokens=True))
|
||||
|
||||
# benchmark the performance
|
||||
from torch._inductor.utils import do_bench_using_profiling
|
||||
from typing import Callable
|
||||
import torch.utils.benchmark as benchmark
|
||||
|
||||
def benchmark_fn(func: Callable, *args, **kwargs) -> float:
|
||||
"""Thin wrapper around do_bench_using_profiling"""
|
||||
no_args = lambda: func(*args, **kwargs)
|
||||
time = do_bench_using_profiling(no_args)
|
||||
return time * 1e3
|
||||
def benchmark_fn(f, *args, **kwargs):
|
||||
# Manual warmup
|
||||
for _ in range(5):
|
||||
f(*args, **kwargs)
|
||||
|
||||
t0 = benchmark.Timer(
|
||||
stmt="f(*args, **kwargs)",
|
||||
globals={"args": args, "kwargs": kwargs, "f": f},
|
||||
num_threads=torch.get_num_threads(),
|
||||
)
|
||||
return f"{(t0.blocked_autorange().mean):.3f}"
|
||||
|
||||
MAX_NEW_TOKENS = 1000
|
||||
print("autoquantized model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
||||
print("int4wo-128 model:", benchmark_fn(quantized_model.generate, **input_ids, max_new_tokens=MAX_NEW_TOKENS, cache_implementation="static"))
|
||||
|
||||
bf16_model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16)
|
||||
output = bf16_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static") # auto-compile
|
||||
@ -124,7 +80,7 @@ quantized_model.save_pretrained(output_dir, safe_serialization=False)
|
||||
|
||||
# load quantized model
|
||||
ckpt_id = "llama3-8b-int4wo-128" # or huggingface hub model id
|
||||
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="auto")
|
||||
loaded_quantized_model = AutoModelForCausalLM.from_pretrained(ckpt_id, device_map="cuda")
|
||||
|
||||
|
||||
# confirm the speedup
|
||||
|
||||
@ -799,29 +799,6 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Tensor Parallelism with PyTorch 2">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
tp_config:
|
||||
tp_size: 4
|
||||
distributed_type: TP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
@ -361,30 +361,6 @@ use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
|
||||
<hfoption id="Tensor Parallelism with PyTorch 2">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
tp_config:
|
||||
tp_size: 4
|
||||
distributed_type: TP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
@ -85,7 +85,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
3. Provide a *code snippet* that shows how the feature is used.
4. If there is a paper related to the feature, include a link to it.
|
||||
|
||||
If the issue is well written, about 80% of the work is already done by the time it is created.
If the issue is well written, about 80% of the work is already done by the time it is created.
|
||||
|
||||
We also provide [templates](https://github.com/huggingface/transformers/tree/main/templates) to help you file an issue.
|
||||
|
||||
@ -140,7 +140,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
```
|
||||
|
||||
If 🤗 Transformers is already installed in your virtual environment, remove it with `pip uninstall transformers` before reinstalling it with the `-e` flag.
|
||||
|
||||
|
||||
Depending on your operating system, and as the number of optional dependencies of 🤗 Transformers grows, this command may fail. If that happens, install the deep learning framework you want to use (PyTorch, TensorFlow, and/or Flax) first, then run:
|
||||
|
||||
```bash
|
||||
@ -188,7 +188,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
To learn more about these checks and how to fix related issues, check out the [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

If you modify documentation under the `docs/source` directory, make sure it can still be built. This check also runs in CI when you open a pull request. To run the check locally, install the documentation builder:
|
||||
|
||||
|
||||
```bash
|
||||
pip install ".[docs]"
|
||||
```
|
||||
@ -216,7 +216,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
git fetch upstream
|
||||
git rebase upstream/main
|
||||
```
|
||||
|
||||
|
||||
Push your changes to your branch:
|
||||
|
||||
```bash
|
||||
@ -238,7 +238,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
☐ If you are adding a new feature, also add tests for it.<br>
- If you are adding a new model, use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to enable the common tests.
- If you are adding a new `@slow` test, make sure it passes with `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
- If you are adding a new tokenizer, write tests and make sure they pass with `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`.
- If you are adding a new tokenizer, write tests and make sure they pass with `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py`.
- CircleCI does not run the slow tests, but GitHub Actions runs them every night!<br>
|
||||
|
||||
☐ All public methods must have informative docstrings (see [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py) for an example).<br>
|
||||
@ -282,6 +282,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
|
||||
|
||||
Like the slow tests, there are other environment variables that are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables PyTorch + Flax integration tests.
- `RUN_PT_TF_CROSS_TESTS`: Enables TensorFlow + PyTorch integration tests.
|
||||
|
||||
More environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
|
||||
|
||||
|
||||
@ -548,29 +548,6 @@ tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Tensor Parallelism with PyTorch 2">
|
||||
|
||||
```yml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
tp_config:
|
||||
tp_size: 4
|
||||
distributed_type: TP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'no'
|
||||
num_machines: 1
|
||||
num_processes: 4
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
@ -33,7 +33,7 @@ limitations under the License.
|
||||
* Implement new models.
* Contribute to the examples or the documentation.
|
||||
|
||||
If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/transformers/contribute) list. It lists beginner-friendly open issues and helps you get started with contributing to open source. Just leave a comment on the issue you would like to work on.
If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/transformers/contribute) list. It lists beginner-friendly open issues and helps you get started with contributing to open source. Just leave a comment on the issue you would like to work on.
|
||||
|
||||
If you want something a bit more challenging, you can also check the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general, if you feel you know what to do, go for it and we will help you get there! 🚀
|
||||
|
||||
@ -139,7 +139,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
```
|
||||
|
||||
If 🤗 Transformers is already installed in your virtual environment, uninstall it first with `pip uninstall transformers`, then reinstall it in editable mode with the `-e` flag.
|
||||
|
||||
|
||||
Depending on your operating system, and as the number of optional dependencies of Transformers grows, this command may fail. If that happens, make sure the deep learning framework you want to use (PyTorch, TensorFlow, or Flax) is installed, then run:
|
||||
|
||||
```bash
|
||||
@ -187,7 +187,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
For more information about these checks and how to fix related issues, read the [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

If you modify documentation under the `docs/source` directory, make sure it can still be built. This check also runs in CI when you open a PR. To check locally, make sure the documentation builder is installed:
|
||||
|
||||
|
||||
```bash
|
||||
pip install ".[docs]"
|
||||
```
|
||||
@ -281,6 +281,8 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/t
|
||||
|
||||
Like the slow tests, there are other environment variables that are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables PyTorch + Flax integration tests.
- `RUN_PT_TF_CROSS_TESTS`: Enables TensorFlow + PyTorch integration tests.
|
||||
|
||||
More environment variables and additional information can be found in [testing_utils.py](src/transformers/testing_utils.py).
|
||||
|
||||
|
||||
@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -140,11 +140,6 @@ class MyNewModelConfig(PretrainedConfig):
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@ -43,11 +43,6 @@ class MyNewModel2Config(PretrainedConfig):
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@ -79,20 +79,6 @@ class NewModelConfig(PretrainedConfig):
|
||||
|
||||
model_type = "new_model"
|
||||
keys_to_ignore_at_inference = ["past_key_values"]
|
||||
base_model_tp_plan = {
|
||||
"layers.*.self_attn.q_proj": "colwise",
|
||||
"layers.*.self_attn.k_proj": "colwise",
|
||||
"layers.*.self_attn.v_proj": "colwise",
|
||||
"layers.*.self_attn.o_proj": "rowwise",
|
||||
"layers.*.mlp.gate_proj": "colwise",
|
||||
"layers.*.mlp.up_proj": "colwise",
|
||||
"layers.*.mlp.down_proj": "rowwise",
|
||||
}
|
||||
base_model_pp_plan = {
|
||||
"embed_tokens": (["input_ids"], ["inputs_embeds"]),
|
||||
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
|
||||
"norm": (["hidden_states"], ["hidden_states"]),
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
||||
@ -19,7 +19,7 @@ from ...image_utils import (
|
||||
PILImageResampling,
|
||||
infer_channel_dimension_format,
|
||||
is_scaled_image,
|
||||
make_flat_list_of_images,
|
||||
make_list_of_images,
|
||||
to_numpy_array,
|
||||
valid_images,
|
||||
validate_preprocess_arguments,
|
||||
@ -221,7 +221,8 @@ class ImgprocModelImageProcessor(BaseImageProcessor):
|
||||
|
||||
size = size if size is not None else self.size
|
||||
size = get_size_dict(size, default_to_square=False)
|
||||
images = make_flat_list_of_images(images)
|
||||
|
||||
images = make_list_of_images(images)
|
||||
|
||||
if not valid_images(images):
|
||||
raise ValueError(
|
||||
|
||||
@ -356,7 +356,6 @@ class DummyPreTrainedModel(PreTrainedModel):
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
@ -699,9 +698,7 @@ class DummyModel(DummyPreTrainedModel):
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
|
||||
@ -356,7 +356,6 @@ class Multimodal1TextPreTrainedModel(PreTrainedModel):
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
@ -699,9 +698,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
|
||||
@ -356,7 +356,6 @@ class MyNewModel2PreTrainedModel(PreTrainedModel):
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
@ -492,7 +491,6 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs, # NOOP kwarg for now
|
||||
) -> Union[Tuple, BaseModelOutputWithPast]:
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
@ -705,9 +703,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
@ -791,20 +787,17 @@ class MyNewModel2ForSequenceClassification(MyNewModel2PreTrainedModel):
|
||||
if self.config.pad_token_id is None and batch_size != 1:
|
||||
raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
|
||||
if self.config.pad_token_id is None:
|
||||
last_non_pad_token = -1
|
||||
elif input_ids is not None:
|
||||
# To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
|
||||
non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
|
||||
token_indices = torch.arange(input_ids.shape[-1], device=logits.device)
|
||||
last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
|
||||
sequence_lengths = -1
|
||||
else:
|
||||
last_non_pad_token = -1
|
||||
logger.warning_once(
|
||||
f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
|
||||
"unexpected if using padding tokens in conjunction with `inputs_embeds.`"
|
||||
)
|
||||
if input_ids is not None:
|
||||
# if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
|
||||
sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
|
||||
sequence_lengths = sequence_lengths % input_ids.shape[-1]
|
||||
sequence_lengths = sequence_lengths.to(logits.device)
|
||||
else:
|
||||
sequence_lengths = -1
|
||||
|
||||
pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
|
||||
pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
|
||||
@ -19,7 +19,6 @@ from ...utils import (
|
||||
add_start_docstrings_to_model_forward,
|
||||
replace_return_docstrings,
|
||||
)
|
||||
from ...utils.deprecation import deprecate_kwarg
|
||||
from ..auto import AutoModel, AutoModelForCausalLM
|
||||
from .configuration_new_task_model import NewTaskModelConfig
|
||||
|
||||
@ -255,7 +254,8 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
token_type_ids,
|
||||
past_key_values,
|
||||
cache_position,
|
||||
input_tensor,
|
||||
input_ids=None,
|
||||
inputs_embeds=None,
|
||||
is_training: bool = False,
|
||||
):
|
||||
if self.config.text_config._attn_implementation == "flash_attention_2":
|
||||
@ -265,7 +265,8 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
min_dtype = torch.finfo(self.dtype).min
|
||||
inputs_lead_dim, sequence_length = input_tensor.shape[:2]
|
||||
inputs_lead_dim = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0]
|
||||
sequence_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_cache_shape()
|
||||
elif isinstance(past_key_values, HybridCache):
|
||||
@ -296,20 +297,16 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
|
||||
# First unmask prefix tokens during training
|
||||
if is_training:
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
|
||||
)
|
||||
|
||||
# Then apply padding mask (will mask pad tokens)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
|
||||
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
|
||||
if is_training:
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
|
||||
)
|
||||
return causal_mask
|
||||
|
||||
def get_image_features(self, pixel_values: torch.FloatTensor):
|
||||
@ -328,7 +325,6 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
image_features = image_features / (self.config.text_config.hidden_size**0.5)
|
||||
return image_features
|
||||
|
||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||
@add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
@ -349,17 +345,16 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
num_logits_to_keep: int = 0,
|
||||
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]:
|
||||
r"""
|
||||
Args:
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
||||
|
||||
logits_to_keep (`int` or `torch.Tensor`, *optional*):
|
||||
If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
|
||||
num_logits_to_keep (`int`, *optional*):
|
||||
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
|
||||
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
|
||||
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
|
||||
If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
|
||||
This is useful when using packed tensor format (single dimension for batch and sequence length).
|
||||
|
||||
Returns:
|
||||
|
||||
@ -423,7 +418,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
use_cache=True,
|
||||
logits_to_keep=None,
|
||||
num_logits_to_keep=None,
|
||||
labels=None,
|
||||
**kwargs,
|
||||
):
|
||||
@ -436,7 +431,7 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
use_cache=use_cache,
|
||||
logits_to_keep=logits_to_keep,
|
||||
num_logits_to_keep=num_logits_to_keep,
|
||||
token_type_ids=token_type_ids,
|
||||
**kwargs,
|
||||
)
|
||||
@ -450,12 +445,10 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
model_inputs["pixel_values"] = pixel_values
|
||||
is_training = token_type_ids is not None and labels is not None
|
||||
if cache_position[0] == 0 and isinstance(past_key_values, HybridCache):
|
||||
input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
|
||||
causal_mask = self._update_causal_mask(
|
||||
attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
|
||||
attention_mask, token_type_ids, past_key_values, cache_position, input_ids, inputs_embeds, is_training
|
||||
)
|
||||
model_inputs["attention_mask"] = causal_mask
|
||||
|
||||
return model_inputs
|
||||
|
||||
def resize_token_embeddings(
|
||||
|
||||
@ -356,7 +356,6 @@ class SuperPreTrainedModel(PreTrainedModel):
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_attention_backend = True
|
||||
|
||||
def _init_weights(self, module):
|
||||
std = self.config.initializer_range
|
||||
@ -621,9 +620,7 @@ class SuperModel(SuperPreTrainedModel):
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
|
||||
causal_mask.device
|
||||
)
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
|
||||
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
# You should update this to your particular problem to have better documentation of `model_type`
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.50.0.dev0")
|
||||
check_min_version("4.49.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
@ -1,78 +0,0 @@
|
||||
import json
|
||||
from typing import Any, Dict
|
||||
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
from transformers.quantizers import HfQuantizer, register_quantization_config, register_quantizer
|
||||
from transformers.utils.quantization_config import QuantizationConfigMixin
|
||||
|
||||
|
||||
@register_quantization_config("custom")
|
||||
class CustomConfig(QuantizationConfigMixin):
|
||||
def __init__(self):
|
||||
self.quant_method = "custom"
|
||||
self.bits = 8
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
output = {
|
||||
"num_bits": self.bits,
|
||||
}
|
||||
return output
|
||||
|
||||
def __repr__(self):
|
||||
config_dict = self.to_dict()
|
||||
return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"
|
||||
|
||||
def to_diff_dict(self) -> Dict[str, Any]:
|
||||
config_dict = self.to_dict()
|
||||
|
||||
default_config_dict = CustomConfig().to_dict()
|
||||
|
||||
serializable_config_dict = {}
|
||||
|
||||
for key, value in config_dict.items():
|
||||
if value != default_config_dict[key]:
|
||||
serializable_config_dict[key] = value
|
||||
|
||||
return serializable_config_dict
|
||||
|
||||
|
||||
@register_quantizer("custom")
|
||||
class CustomQuantizer(HfQuantizer):
|
||||
def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
|
||||
super().__init__(quantization_config, **kwargs)
|
||||
self.quantization_config = quantization_config
|
||||
self.scale_map = {}
|
||||
self.device = kwargs.get("device", "cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.torch_dtype = kwargs.get("torch_dtype", torch.float32)
|
||||
|
||||
def _process_model_before_weight_loading(self, model, **kwargs):
|
||||
return True
|
||||
|
||||
def _process_model_after_weight_loading(self, model, **kwargs):
|
||||
return True
|
||||
|
||||
def is_serializable(self) -> bool:
|
||||
return True
|
||||
|
||||
def is_trainable(self) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
"facebook/opt-350m", quantization_config=CustomConfig(), torch_dtype="auto"
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
input_text = "once there is"
|
||||
inputs = tokenizer(input_text, return_tensors="pt")
|
||||
output = model_8bit.generate(
|
||||
**inputs,
|
||||
max_length=100,
|
||||
num_return_sequences=1,
|
||||
no_repeat_ngram_size=2,
|
||||
)
|
||||
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
|
||||
|
||||
print(generated_text)
|
||||
@ -1,257 +0,0 @@
import json
from typing import Any, Dict, List, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from accelerate import init_empty_weights
from huggingface_hub import HfApi

from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.quantizers import HfQuantizer, get_module_from_name, register_quantization_config, register_quantizer
from transformers.utils.quantization_config import QuantizationConfigMixin


# Implement INT8 Symmetric Linear layer
class Int8SymmetricLinear(torch.nn.Module):
    def __init__(self, in_features, out_features, bias, dtype=torch.float32):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features

        self.register_buffer("weight", torch.zeros((out_features, in_features), dtype=torch.int8))
        self.register_buffer("weight_scale", torch.zeros((out_features, 1), dtype=dtype))

        if bias:
            self.register_buffer("bias", torch.zeros((self.out_features), dtype=dtype))
        else:
            self.bias = None

    def forward(self, x):
        dequant_weight = self.weight * self.weight_scale
        output = F.linear(x, dequant_weight)
        if self.bias is not None:
            output = output + self.bias
        return output


# Function to replace standard linear layers with INT8 symmetric quantized layers
def _replace_with_int8_symmetric_linear(
    model,
    modules_to_not_convert=None,
    current_key_name=None,
    quantization_config=None,
    has_been_replaced=False,
    pre_quantized=False,
):
    """
    Recursively replaces nn.Linear modules with Int8SymmetricLinear modules.
    """
    if current_key_name is None:
        current_key_name = []

    for name, module in model.named_children():
        current_key_name.append(name)

        if (isinstance(module, nn.Linear)) and name not in modules_to_not_convert:
            # Check if the current key is not in the `modules_to_not_convert`
            current_key_name_str = ".".join(current_key_name)
            if not any(
                (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert
            ):
                with init_empty_weights(include_buffers=True):
                    in_features = module.in_features
                    out_features = module.out_features
                    model._modules[name] = Int8SymmetricLinear(
                        in_features, out_features, module.bias is not None, dtype=module.weight.dtype
                    )
                    has_been_replaced = True
                    model._modules[name].requires_grad_(False)

        if len(list(module.children())) > 0:
            _, has_been_replaced = _replace_with_int8_symmetric_linear(
                module,
                modules_to_not_convert,
                current_key_name,
                quantization_config,
                has_been_replaced=has_been_replaced,
                pre_quantized=pre_quantized,
            )
        # Remove the last key for recursion
        current_key_name.pop(-1)
    return model, has_been_replaced


def replace_with_int8_symmetric_linear(
    model, modules_to_not_convert=None, current_key_name=None, quantization_config=None, pre_quantized=False
):
    """
    Main function to replace model layers with INT8 symmetric quantized versions.
    """
    modules_to_not_convert = ["lm_head"] if modules_to_not_convert is None else modules_to_not_convert

    if quantization_config.modules_to_not_convert is not None:
        modules_to_not_convert.extend(quantization_config.modules_to_not_convert)
    modules_to_not_convert = list(set(modules_to_not_convert))

    model, has_been_replaced = _replace_with_int8_symmetric_linear(
        model, modules_to_not_convert, current_key_name, quantization_config, pre_quantized=pre_quantized
    )

    if not has_been_replaced:
        raise ValueError(
            "You are loading your model using INT8 symmetric quantization but no linear modules were found in your model."
        )

    return model


@register_quantization_config("int8_symmetric")
class Int8SymmetricConfig(QuantizationConfigMixin):
    """
    Configuration for INT8 symmetric quantization.
    """

    def __init__(self, modules_to_not_convert: Optional[List[str]] = None, **kwargs):
        self.quant_method = "int8_symmetric"
        self.modules_to_not_convert = modules_to_not_convert

    def __repr__(self):
        config_dict = self.to_dict()
        return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n"

    def to_diff_dict(self) -> Dict[str, Any]:
        config_dict = self.to_dict()
        default_config_dict = Int8SymmetricConfig().to_dict()

        serializable_config_dict = {}
        for key, value in config_dict.items():
            if value != default_config_dict[key]:
                serializable_config_dict[key] = value

        return serializable_config_dict


@register_quantizer("int8_symmetric")
class Int8SymmetricQuantizer(HfQuantizer):
    """
    Implementation of INT8 symmetric quantization.
    """

    requires_calibration = False
    requires_parameters_quantization = True

    def __init__(self, quantization_config: QuantizationConfigMixin, **kwargs):
        super().__init__(quantization_config, **kwargs)
        self.quantization_config = quantization_config

    def _process_model_before_weight_loading(self, model, **kwargs):
        """
        Replace model's linear layers with quantized versions before loading weights.
        """
        self.modules_to_not_convert = self.quantization_config.modules_to_not_convert

        model = replace_with_int8_symmetric_linear(
            model,
            modules_to_not_convert=self.modules_to_not_convert,
            quantization_config=self.quantization_config,
            pre_quantized=self.pre_quantized,
        )

    def check_quantized_param(
        self,
        model,
        param_value: "torch.Tensor",
        param_name: str,
        state_dict: Dict[str, Any],
        **kwargs,
    ):
        module, tensor_name = get_module_from_name(model, param_name)

        if isinstance(module, Int8SymmetricLinear):
            if self.pre_quantized or tensor_name == "bias":
                if tensor_name == "weight" and param_value.dtype != torch.int8:
                    raise ValueError("Expect quantized weights but got an unquantized weight")
                return False
            else:
                if tensor_name == "weight_scale":
                    raise ValueError("Expect unquantized weights but got a quantized weight_scale")
                return True
        return False

    def create_quantized_param(
        self,
        model,
        param_value: "torch.Tensor",
        param_name: str,
        target_device: "torch.device",
        state_dict: Dict[str, Any],
        unexpected_keys: Optional[List[str]] = None,
    ):
        """
        Quantizes weights to INT8 symmetric format.
        """
        abs_max_per_row = torch.max(torch.abs(param_value), dim=1, keepdim=True)[0].clamp(min=1e-5)

        weight_scale = abs_max_per_row / 127.0

        weight_quantized = torch.round(param_value / weight_scale).clamp(-128, 127).to(torch.int8)

        module, tensor_name = get_module_from_name(model, param_name)
        module._buffers[tensor_name] = weight_quantized.to(target_device)
        module._buffers["weight_scale"] = weight_scale.to(target_device)

    def update_missing_keys(self, model, missing_keys: List[str], prefix: str) -> List[str]:
        not_missing_keys = []
        for name, module in model.named_modules():
            if isinstance(module, Int8SymmetricLinear):
                for missing in missing_keys:
                    if (
                        (name in missing or name in f"{prefix}.{missing}")
                        and not missing.endswith(".weight")
                        and not missing.endswith(".bias")
                    ):
                        not_missing_keys.append(missing)
        return [k for k in missing_keys if k not in not_missing_keys]

    def _process_model_after_weight_loading(self, model, **kwargs):
        """
        Post-processing after weights are loaded.
        """
        return True

    def is_serializable(self, safe_serialization=None):
        return True

    @property
    def is_trainable(self) -> bool:
        return False


# Example usage
if __name__ == "__main__":
    model_int8 = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-3.2-1B", quantization_config=Int8SymmetricConfig(), torch_dtype=torch.float, device_map="cpu"
    )

    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
    input_text = "once there is"
    inputs = tokenizer(input_text, return_tensors="pt").to("cpu")
    output = model_int8.generate(
        **inputs,
        max_length=100,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    print(generated_text)

    # Save and upload to HUB
    output_model_dir = "Llama-3.2-1B-INT8-CUSTOM"
    model_int8.save_pretrained(output_model_dir)
    tokenizer.save_pretrained(output_model_dir)
    api = HfApi()
    repo_id = "medmekk/Llama-3.2-1B-INT8-CUSTOM"
    api.create_repo(repo_id, private=False)
    api.upload_folder(folder_path=output_model_dir, repo_id=repo_id, repo_type="model")
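The per-row symmetric INT8 math in `create_quantized_param` and the matching dequantization in `Int8SymmetricLinear.forward` can be sanity-checked in isolation. The following sketch reuses the same formulas on a toy tensor (the shape, seed, and variable names are arbitrary assumptions, and only PyTorch is required):

```python
# Standalone round-trip check of the symmetric INT8 scheme used above:
# per-row abs-max / 127 scale, round-and-clamp to int8, then dequantize.
import torch

torch.manual_seed(0)
weight = torch.randn(4, 8)  # toy weight standing in for an nn.Linear weight

abs_max_per_row = torch.max(torch.abs(weight), dim=1, keepdim=True)[0].clamp(min=1e-5)
weight_scale = abs_max_per_row / 127.0
weight_q = torch.round(weight / weight_scale).clamp(-128, 127).to(torch.int8)

dequant = weight_q * weight_scale  # same dequantization as Int8SymmetricLinear.forward
print("max abs error:", (weight - dequant).abs().max().item())  # roughly scale / 2 per element
```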
@ -1,5 +1,5 @@
datasets==2.3.2
transformers==4.48.0
transformers==4.38.0
wandb==0.13.1
evaluate==0.2.2
scikit-learn==1.5.0
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version(
    "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@ -50,7 +50,7 @@ from transformers.utils import check_min_version, send_example_telemetry

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.50.0.dev0")
check_min_version("4.49.0.dev0")

logger = logging.getLogger(__name__)
Some files were not shown because too many files have changed in this diff.