Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-01 01:24:35 +08:00)

Compare commits (7 commits)
| Author | SHA1 | Date |
|---|---|---|
| | ce334e82c9 | |
| | 02aba5d63d | |
| | 9c4b1c0a65 | |
| | 4cbb53cd31 | |
| | 8dfe70e808 | |
| | 6c087019d3 | |
| | bacc7db5ac | |

.github/workflows/benchmark.yml (vendored, 2 changes)
							| @ -28,7 +28,7 @@ jobs: | ||||
|       (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )|| | ||||
|       (github.event_name == 'push' && github.ref == 'refs/heads/main') | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       image: huggingface/transformers-pytorch-gpu | ||||
|       options: --gpus all --privileged --ipc host | ||||
|     steps: | ||||
|       - name: Get repo | ||||
|  | ||||
| @ -9,7 +9,7 @@ jobs: | ||||
|     uses: ./.github/workflows/benchmark_v2.yml | ||||
|     with: | ||||
|       runner: aws-g5-4xlarge-cache-use1-public-80 | ||||
|       container_image: huggingface/transformers-all-latest-gpu | ||||
|       container_image: huggingface/transformers-pytorch-gpu | ||||
|       container_options: --gpus all --privileged --ipc host --shm-size "16gb" | ||||
|       commit_sha: ${{ github.sha }} | ||||
|       run_id: ${{ github.run_id }} | ||||
|  | ||||
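The first hunk above switches the benchmark job's container image from `transformers-all-latest-gpu` to `transformers-pytorch-gpu`; the second hunk makes the same swap where the image is forwarded to the reusable workflow `.github/workflows/benchmark_v2.yml`. As a minimal sketch of that wiring, the called workflow would declare matching `workflow_call` inputs roughly as below. The input names mirror the caller shown in the diff; the job body is hypothetical and is not the actual content of benchmark_v2.yml.

```yaml
# Hypothetical sketch of the receiving side of the workflow_call above.
# Input names follow the caller in this diff; the job body is illustrative only.
on:
  workflow_call:
    inputs:
      runner:
        required: true
        type: string
      container_image:
        required: true
        type: string
      container_options:
        required: false
        type: string
      commit_sha:
        required: false
        type: string
      run_id:
        required: false
        type: string

jobs:
  benchmark:
    runs-on:
      group: ${{ inputs.runner }}
    container:
      image: ${{ inputs.container_image }}     # e.g. huggingface/transformers-pytorch-gpu
      options: ${{ inputs.container_options }}
    steps:
      - name: Show which image the caller selected
        run: echo "Benchmarking ${{ inputs.commit_sha }} inside ${{ inputs.container_image }}"
```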
							
								
								
									
.github/workflows/build-docker-images.yml (vendored, 182 changes)
							| @ -45,52 +45,26 @@ jobs: | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the transformers-all-latest-gpu docker build | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   flash-attn-ci-image: | ||||
|     name: "PyTorch with Flash Attn [dev]" | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       # Push CI images still need to be re-built daily | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
|         uses: docker/setup-buildx-action@v3 | ||||
|       - | ||||
|         name: Check out code | ||||
|         uses: actions/checkout@v4 | ||||
|       - | ||||
|         name: Login to DockerHub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||
|           password: ${{ secrets.DOCKERHUB_PASSWORD }} | ||||
|       - | ||||
|         name: Build and push | ||||
|         name: Build and push (for Push CI) in a daily basis | ||||
|         # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. | ||||
|         # The later case is useful for manual image building for debugging purpose. Use another tag in this case! | ||||
|         if: inputs.image_postfix != '-push-ci' | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-all-latest-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|             PYTORCH=2.8.0 | ||||
|             TORCHCODEC=0.7.0 | ||||
|             FLASH_ATTN=yes | ||||
|           push: true | ||||
|           tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}:flash-attn | ||||
|           tags: huggingface/transformers-all-latest-gpu-push-ci | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the transformers-all-latest-gpu docker build | ||||
|           title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
| @ -130,8 +104,51 @@ jobs: | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) | ||||
|   latest-torch-deepspeed-docker-for-push-ci-daily-build: | ||||
|     name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
|         uses: docker/setup-buildx-action@v3 | ||||
|       - | ||||
|         name: Check out code | ||||
|         uses: actions/checkout@v4 | ||||
|       - | ||||
|         name: Login to DockerHub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||
|           password: ${{ secrets.DOCKERHUB_PASSWORD }} | ||||
|       # Push CI images still need to be re-built daily | ||||
|       - | ||||
|         name: Build and push (for Push CI) in a daily basis | ||||
|         # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. | ||||
|         # The later case is useful for manual image building for debugging purpose. Use another tag in this case! | ||||
|         if: inputs.image_postfix != '-push-ci' | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-pytorch-deepspeed-latest-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   doc-builder: | ||||
|     name: "Doc builder" | ||||
|     # Push CI doesn't need this image | ||||
|     if: inputs.image_postfix != '-push-ci' | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
| @ -164,6 +181,44 @@ jobs: | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   latest-pytorch: | ||||
|     name: "Latest PyTorch [dev]" | ||||
|     # Push CI doesn't need this image | ||||
|     if: inputs.image_postfix != '-push-ci' | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|       - | ||||
|         name: Set up Docker Buildx | ||||
|         uses: docker/setup-buildx-action@v3 | ||||
|       - | ||||
|         name: Check out code | ||||
|         uses: actions/checkout@v4 | ||||
|       - | ||||
|         name: Login to DockerHub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||
|           password: ${{ secrets.DOCKERHUB_PASSWORD }} | ||||
|       - | ||||
|         name: Build and push | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-pytorch-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-gpu | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   latest-pytorch-amd: | ||||
|     name: "Latest PyTorch (AMD) [dev]" | ||||
|     runs-on: | ||||
| @ -190,47 +245,29 @@ jobs: | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} | ||||
|       # Push CI images still need to be re-built daily | ||||
|       - | ||||
|         name: Build and push (for Push CI) in a daily basis | ||||
|         # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. | ||||
|         # The later case is useful for manual image building for debugging purpose. Use another tag in this case! | ||||
|         if: inputs.image_postfix != '-push-ci' | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-pytorch-amd-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-amd-gpu-push-ci | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
|         uses: huggingface/hf-workflows/.github/actions/post-slack@main | ||||
|         with: | ||||
|           slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} | ||||
|           title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu build | ||||
|           title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build | ||||
|           status: ${{ job.status }} | ||||
|           slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|  | ||||
|   cache-latest-pytorch-amd: | ||||
|     name: "Cache Latest Pytorch (AMD) Image" | ||||
|     needs: latest-pytorch-amd | ||||
|     runs-on: | ||||
|       group: amd-mi325-1gpu | ||||
|     steps: | ||||
|       - | ||||
|         name: Login to DockerHub | ||||
|         uses: docker/login-action@v3 | ||||
|         with: | ||||
|           username: ${{ secrets.DOCKERHUB_USERNAME }} | ||||
|           password: ${{ secrets.DOCKERHUB_PASSWORD }} | ||||
|          | ||||
|       -  | ||||
|         name: Pull and save docker image to cache | ||||
|         run: | | ||||
|           image="huggingface/transformers-pytorch-amd-gpu" | ||||
|           final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar" | ||||
|           tmp_path="${final_path}.tmp" | ||||
|  | ||||
|           echo "Pulling image: ${image}" | ||||
|           docker pull "${image}" | ||||
|  | ||||
|           echo "Saving to temp file: ${tmp_path}" | ||||
|           docker save "${image}" -o "${tmp_path}" | ||||
|  | ||||
|           echo "Moving to final path: ${final_path}" | ||||
|           mv -f "${tmp_path}" "${final_path}" | ||||
|  | ||||
|           echo "Cache populated successfully at ${final_path}" | ||||
|  | ||||
|   latest-pytorch-deepspeed-amd: | ||||
|     name: "PyTorch + DeepSpeed (AMD) [dev]" | ||||
|     runs-on: | ||||
| @ -257,6 +294,19 @@ jobs: | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} | ||||
|       # Push CI images still need to be re-built daily | ||||
|       - | ||||
|         name: Build and push (for Push CI) in a daily basis | ||||
|         # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. | ||||
|         # The later case is useful for manual image building for debugging purpose. Use another tag in this case! | ||||
|         if: inputs.image_postfix != '-push-ci' | ||||
|         uses: docker/build-push-action@v5 | ||||
|         with: | ||||
|           context: ./docker/transformers-pytorch-deepspeed-amd-gpu | ||||
|           build-args: | | ||||
|             REF=main | ||||
|           push: true | ||||
|           tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci | ||||
|  | ||||
|       - name: Post to Slack | ||||
|         if: always() | ||||
| @ -269,6 +319,8 @@ jobs: | ||||
|  | ||||
|   latest-quantization-torch-docker: | ||||
|     name: "Latest Pytorch + Quantization [dev]" | ||||
|      # Push CI doesn't need this image | ||||
|     if: inputs.image_postfix != '-push-ci' | ||||
|     runs-on: | ||||
|       group: aws-general-8-plus | ||||
|     steps: | ||||
|  | ||||
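Throughout this file, image builds are parameterized by `inputs.image_postfix`: regular builds tag images without a suffix, the Push CI path appends `-push-ci` and is rebuilt daily, and jobs the Push CI does not need are skipped with `if: inputs.image_postfix != '-push-ci'`. The diff does not show the workflow's trigger block, so the following is only a plausible sketch of how such an input could be declared and how it composes into the image tag.

```yaml
# Plausible sketch only: the actual `on:` block of build-docker-images.yml is not part of this diff.
on:
  workflow_call:
    inputs:
      image_postfix:
        required: false
        type: string
        default: ""        # "-push-ci" when invoked by the push caller, "" otherwise
  schedule:
    - cron: "0 1 * * *"    # hypothetical daily rebuild time

jobs:
  example-image:
    # Jobs the Push CI does not need are skipped when the postfix is "-push-ci"
    if: inputs.image_postfix != '-push-ci'
    runs-on: ubuntu-22.04
    steps:
      - name: Show the tag this job would push
        run: echo "huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}"
```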
							
								
								
									
.github/workflows/model_jobs.yml (vendored, 5 changes)
							| @ -28,9 +28,6 @@ on: | ||||
|       report_repo_id: | ||||
|         required: false | ||||
|         type: string | ||||
|       pytest_marker: | ||||
|         required: false | ||||
|         type: string | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
| @ -140,7 +137,7 @@ jobs: | ||||
|       - name: Run all tests on GPU | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt | ||||
|           script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt | ||||
|           ls -la | ||||
|           # Extract the exit code from the output file | ||||
|           EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2) | ||||
|  | ||||
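The test command above is wrapped in `script -q -c "…" test_outputs.txt`, and the step recovers the real exit status from the trailer line that recent util-linux `script` versions append (e.g. `Script done on … [COMMAND_EXIT_CODE="1"]`), since `script` does not propagate the child's exit code unless `--return` is passed. A minimal standalone illustration of that capture pattern, using a throwaway command rather than the pytest invocation:

```yaml
# Illustration of the exit-code capture used above, with a dummy command that exits 3.
- name: Run a command under `script` and recover its exit code
  shell: bash
  run: |
    # Without --return, `script` may exit 0 even if the wrapped command fails,
    # so the status is read back from the COMMAND_EXIT_CODE trailer instead.
    script -q -c "python3 -c 'import sys; sys.exit(3)'" cmd_output.txt || true
    EXIT_CODE=$(tail -1 cmd_output.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
    echo "captured exit code: ${EXIT_CODE:-unknown}"
    exit ${EXIT_CODE:-1}
```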
							
								
								
									
.github/workflows/push-important-models.yml (vendored, 2 changes)
							| @ -149,7 +149,7 @@ jobs: | ||||
|     with: | ||||
|       job: run_models_gpu | ||||
|       slack_report_channel: "#transformers-ci-push" | ||||
|       docker: huggingface/transformers-all-latest-gpu:flash-attn | ||||
|       docker: huggingface/transformers-all-latest-gpu | ||||
|       ci_event: push | ||||
|       report_repo_id: hf-internal-testing/transformers_ci_push | ||||
|       commit_sha: ${{ github.sha }} | ||||
|  | ||||
							
								
								
									
.github/workflows/self-push-amd-mi210-caller.yml (vendored, new file, 25 lines)
							| @ -0,0 +1,25 @@ | ||||
| name: Self-hosted runner (AMD mi210 CI caller) | ||||
|  | ||||
| on: | ||||
|   #workflow_run: | ||||
|   #  workflows: ["Self-hosted runner (push-caller)"] | ||||
|   #  branches: ["main"] | ||||
|   #  types: [completed] | ||||
|   push: | ||||
|     branches: | ||||
|       - run_amd_push_ci_caller* | ||||
|     paths: | ||||
|       - "src/**" | ||||
|       - "tests/**" | ||||
|       - ".github/**" | ||||
|       - "templates/**" | ||||
|       - "utils/**" | ||||
|  | ||||
| jobs: | ||||
|   run_amd_ci: | ||||
|     name: AMD mi210 | ||||
|     if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) | ||||
|     uses: ./.github/workflows/self-push-amd.yml | ||||
|     with: | ||||
|       gpu_flavor: mi210 | ||||
|     secrets: inherit | ||||
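This caller (and the mi250 one in the next file) is deliberately thin: everything flavor-specific is reduced to the `gpu_flavor` value handed to the shared `self-push-amd.yml` workflow, which uses it as a runner label. Supporting another flavor would just mean adding one more caller of the same shape; the example below is hypothetical and not part of this changeset.

```yaml
# Hypothetical additional caller (not in this diff) showing how another flavor would be wired.
name: Self-hosted runner (AMD mi300 CI caller)

on:
  push:
    branches:
      - run_amd_push_ci_caller*

jobs:
  run_amd_ci:
    name: AMD mi300
    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
    uses: ./.github/workflows/self-push-amd.yml
    with:
      gpu_flavor: mi300
    secrets: inherit
```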
							
								
								
									
.github/workflows/self-push-amd-mi250-caller.yml (vendored, new file, 25 lines)
							| @ -0,0 +1,25 @@ | ||||
| name: Self-hosted runner (AMD mi250 CI caller) | ||||
|  | ||||
| on: | ||||
|   #workflow_run: | ||||
|   #  workflows: ["Self-hosted runner (push-caller)"] | ||||
|   #  branches: ["main"] | ||||
|   #  types: [completed] | ||||
|   push: | ||||
|     branches: | ||||
|       - run_amd_push_ci_caller* | ||||
|     paths: | ||||
|       - "src/**" | ||||
|       - "tests/**" | ||||
|       - ".github/**" | ||||
|       - "templates/**" | ||||
|       - "utils/**" | ||||
|  | ||||
| jobs: | ||||
|   run_amd_ci: | ||||
|     name: AMD mi250 | ||||
|     if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) | ||||
|     uses: ./.github/workflows/self-push-amd.yml | ||||
|     with: | ||||
|       gpu_flavor: mi250 | ||||
|     secrets: inherit | ||||
							
								
								
									
.github/workflows/self-push-amd.yml (vendored, new file, 334 lines)
							| @ -0,0 +1,334 @@ | ||||
| name: Self-hosted runner AMD GPU (push) | ||||
|  | ||||
| on: | ||||
|   workflow_call: | ||||
|     inputs: | ||||
|       gpu_flavor: | ||||
|         required: true | ||||
|         type: string | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
|   TRANSFORMERS_IS_CI: yes | ||||
|   OMP_NUM_THREADS: 8 | ||||
|   MKL_NUM_THREADS: 8 | ||||
|   PYTEST_TIMEOUT: 60 | ||||
|   TF_FORCE_GPU_ALLOW_GROWTH: true | ||||
|   HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} | ||||
|  | ||||
| jobs: | ||||
|   check_runner_status: | ||||
|     name: Check Runner Status | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - name: Checkout transformers | ||||
|         uses: actions/checkout@v4 | ||||
|         with: | ||||
|           fetch-depth: 2 | ||||
|  | ||||
|       - name: Check Runner Status | ||||
|         run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|  | ||||
|   check_runners: | ||||
|     name: Check Runners | ||||
|     needs: check_runner_status | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now | ||||
|       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     steps: | ||||
|       - name: ROCM-SMI | ||||
|         run: | | ||||
|           rocm-smi | ||||
|       - name: ROCM-INFO | ||||
|         run: | | ||||
|           rocminfo  | grep "Agent" -A 14 | ||||
|       - name: Show ROCR environment | ||||
|         run: | | ||||
|           echo "ROCR: $ROCR_VISIBLE_DEVICES" | ||||
|  | ||||
|   setup_gpu: | ||||
|     name: Setup | ||||
|     needs: check_runners | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now | ||||
|       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     outputs: | ||||
|       matrix: ${{ steps.set-matrix.outputs.matrix }} | ||||
|       test_map: ${{ steps.set-matrix.outputs.test_map }} | ||||
|     env: | ||||
|       # `CI_BRANCH_PUSH`: The branch name from the push event | ||||
|       # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event | ||||
|       # `CI_SHA_PUSH`: The commit SHA from the push event | ||||
|       # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) | ||||
|         # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Cleanup | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           rm -rf tests/__pycache__ | ||||
|           rm -rf tests/models/__pycache__ | ||||
|           rm -rf reports | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Fetch the tests to run | ||||
|         working-directory: /transformers | ||||
|         # TODO: add `git-python` in the docker images | ||||
|         run: | | ||||
|           pip install --upgrade git-python | ||||
|           python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt | ||||
|  | ||||
|       - name: Report fetched tests | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: test_fetched | ||||
|           path: /transformers/test_preparation.txt | ||||
|  | ||||
|       - id: set-matrix | ||||
|         name: Organize tests into models | ||||
|         working-directory: /transformers | ||||
|         # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. | ||||
|         # The `test_map` is used to get the actual identified test files under each key. | ||||
|         # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) | ||||
|         run: | | ||||
|           if [ -f test_map.json ]; then | ||||
|               keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') | ||||
|               test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') | ||||
|           else | ||||
|               keys=$(python3 -c 'keys = ["dummy"]; print(keys)') | ||||
|               test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') | ||||
|           fi | ||||
|           echo $keys | ||||
|           echo $test_map | ||||
|           echo "matrix=$keys" >> $GITHUB_OUTPUT | ||||
|           echo "test_map=$test_map" >> $GITHUB_OUTPUT | ||||
|  | ||||
|   run_models_gpu: | ||||
|     name: Model tests | ||||
|     needs: setup_gpu | ||||
|     # `dummy` means there is no test to run | ||||
|     if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} | ||||
|         machine_type: [single-gpu, multi-gpu] | ||||
|     runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now | ||||
|       options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: Echo folder ${{ matrix.folders }} | ||||
|         shell: bash | ||||
|         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | ||||
|         # set the artifact folder names (because the character `/` is not allowed). | ||||
|         run: | | ||||
|           echo "${{ matrix.folders }}" | ||||
|           echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}" | ||||
|           matrix_folders=${{ matrix.folders }} | ||||
|           matrix_folders=${matrix_folders/'models/'/'models_'} | ||||
|           echo "$matrix_folders" | ||||
|           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: ROCM-SMI | ||||
|         run: | | ||||
|           rocm-smi | ||||
|       - name: ROCM-INFO | ||||
|         run: | | ||||
|           rocminfo  | grep "Agent" -A 14 | ||||
|       - name: Show ROCR environment | ||||
|         run: | | ||||
|           echo "ROCR: $ROCR_VISIBLE_DEVICES" | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Run all non-slow selected tests on GPU | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test" | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports | ||||
|           path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports | ||||
|  | ||||
|   send_results: | ||||
|     name: Send results to webhook | ||||
|     runs-on: ubuntu-22.04 | ||||
|     if: always() | ||||
|     needs: [ | ||||
|         check_runner_status, | ||||
|         check_runners, | ||||
|         setup_gpu, | ||||
|         run_models_gpu, | ||||
| #        run_tests_torch_cuda_extensions_single_gpu, | ||||
| #        run_tests_torch_cuda_extensions_multi_gpu | ||||
|     ] | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       - name: Preliminary job status | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           echo "Runner availability: ${{ needs.check_runner_status.result }}" | ||||
|           echo "Setup status: ${{ needs.setup_gpu.result }}" | ||||
|           echo "Runner status: ${{ needs.check_runners.result }}" | ||||
|  | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - uses: actions/checkout@v4 | ||||
|         # To avoid failure when multiple commits are merged into `main` in a short period of time. | ||||
|         # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... | ||||
|         # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) | ||||
|         with: | ||||
|           fetch-depth: 20 | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - uses: actions/download-artifact@v4 | ||||
|       - name: Send message to Slack | ||||
|         env: | ||||
|           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | ||||
|           CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | ||||
|           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | ||||
|           CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} | ||||
|           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} | ||||
|           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }} | ||||
|           CI_TITLE_PUSH: ${{ github.event.head_commit.message }} | ||||
|           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} | ||||
|           CI_SHA: ${{ env.CI_SHA }} | ||||
|           RUNNER_STATUS: ${{ needs.check_runner_status.result }} | ||||
|           RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} | ||||
|           SETUP_STATUS: ${{ needs.setup_gpu.result }} | ||||
|  | ||||
|         # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change | ||||
|         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | ||||
|         run: | | ||||
|           pip install huggingface_hub | ||||
|           pip install slack_sdk | ||||
|           pip show slack_sdk | ||||
|           python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}" | ||||
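The "Organize tests into models" step in this workflow turns `test_map.json` into two job outputs via inline `python3 -c` one-liners: `matrix` (the list of keys such as `models/bert`) and `test_map` (the full mapping), with a `dummy` fallback when there is nothing to run. An equivalent sketch of that step using `jq` follows; it assumes `jq` is available in the CI image, which this diff does not establish, and it emits strict JSON, which the downstream `fromJson` calls accept.

```yaml
# Equivalent sketch of the set-matrix step using jq; assumes jq exists in the container.
- id: set-matrix
  name: Organize tests into models (jq variant)
  working-directory: /transformers
  run: |
    if [ -f test_map.json ]; then
        keys=$(jq -c 'keys_unsorted' test_map.json)   # e.g. ["models/bert","pipelines"]
        test_map=$(jq -c '.' test_map.json)           # full mapping, compacted to one line
    else
        # No tests to run: keep the matrix non-empty so the job definition stays valid.
        keys='["dummy"]'
        test_map='{"dummy": []}'
    fi
    echo "$keys"
    echo "$test_map"
    echo "matrix=$keys" >> $GITHUB_OUTPUT
    echo "test_map=$test_map" >> $GITHUB_OUTPUT
```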
							
								
								
									
.github/workflows/self-push-caller.yml (vendored, new file, 54 lines)
							| @ -0,0 +1,54 @@ | ||||
| # Used to trigger self-push CI | ||||
| name: Self-hosted runner (push-caller) | ||||
|  | ||||
| on: | ||||
|   push: | ||||
|     branches: | ||||
|       - main | ||||
|     paths: | ||||
|       - "src/**" | ||||
|       - "tests/**" | ||||
|       - ".github/**" | ||||
|       - "templates/**" | ||||
|       - "utils/**" | ||||
|  | ||||
| jobs: | ||||
|   check-for-setup: | ||||
|       runs-on: ubuntu-22.04 | ||||
|       name: Check if setup was changed | ||||
|       outputs: | ||||
|         changed: ${{ steps.was_changed.outputs.changed }} | ||||
|       steps: | ||||
|         - uses: actions/checkout@v4 | ||||
|           with:  | ||||
|             fetch-depth: "2" | ||||
|          | ||||
|         - name: Get changed files | ||||
|           id: changed-files | ||||
|           uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c | ||||
|          | ||||
|         - name: Was setup changed  | ||||
|           id: was_changed | ||||
|           run: | | ||||
|             for file in ${{ steps.changed-files.outputs.all_changed_files }}; do | ||||
|               if [ `basename "${file}"` = "setup.py" ]; then | ||||
|                 echo "changed=1" >> $GITHUB_OUTPUT | ||||
|               fi | ||||
|             done | ||||
|  | ||||
|   build-docker-containers: | ||||
|     needs: check-for-setup | ||||
|     if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1') | ||||
|     uses: ./.github/workflows/build-docker-images.yml | ||||
|     with: | ||||
|       image_postfix: "-push-ci" | ||||
|     secrets: inherit | ||||
|  | ||||
|   run_push_ci: | ||||
|     name: Trigger Push CI | ||||
|     runs-on: ubuntu-22.04 | ||||
|     if: ${{ always() }} | ||||
|     needs: build-docker-containers | ||||
|     steps: | ||||
|       - name: Trigger push CI via workflow_run | ||||
|         run: echo "Trigger push CI via workflow_run" | ||||
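The `check-for-setup` job above gates the `-push-ci` docker rebuild on whether `setup.py` was touched, using the pinned `tj-actions/changed-files` action over a `fetch-depth: 2` checkout. A rough equivalent with plain git is sketched below; note it only inspects the last commit (`HEAD~1..HEAD`), which is a simplification compared with the action's handling of multi-commit pushes.

```yaml
# Rough git-only equivalent of the "Was setup changed" check; relies on the
# fetch-depth: 2 checkout above so that HEAD~1 is available.
- name: Was setup changed (git variant)
  id: was_changed
  run: |
    if git diff --name-only HEAD~1 HEAD | grep -qx "setup.py"; then
      echo "changed=1" >> $GITHUB_OUTPUT
    fi
```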
							
								
								
									
.github/workflows/self-push.yml (vendored, new file, 652 lines)
							| @ -0,0 +1,652 @@ | ||||
| name: Self-hosted runner (push) | ||||
|  | ||||
| on: | ||||
|   workflow_run: | ||||
|     workflows: ["Self-hosted runner (push-caller)"] | ||||
|     branches: ["main"] | ||||
|     types: [completed] | ||||
|   push: | ||||
|     branches: | ||||
|       - ci_* | ||||
|       - ci-* | ||||
|     paths: | ||||
|       - "src/**" | ||||
|       - "tests/**" | ||||
|       - ".github/**" | ||||
|       - "templates/**" | ||||
|       - "utils/**" | ||||
|   repository_dispatch: | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
|   TRANSFORMERS_IS_CI: yes | ||||
|   OMP_NUM_THREADS: 8 | ||||
|   MKL_NUM_THREADS: 8 | ||||
|   PYTEST_TIMEOUT: 60 | ||||
|   TF_FORCE_GPU_ALLOW_GROWTH: true | ||||
|   CUDA_VISIBLE_DEVICES: 0,1 | ||||
|  | ||||
| jobs: | ||||
|   setup: | ||||
|     name: Setup | ||||
|     strategy: | ||||
|       matrix: | ||||
|         machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu-push-ci | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     outputs: | ||||
|       matrix: ${{ steps.set-matrix.outputs.matrix }} | ||||
|       test_map: ${{ steps.set-matrix.outputs.test_map }} | ||||
|     env: | ||||
|       # `CI_BRANCH_PUSH`: The branch name from the push event | ||||
|       # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event | ||||
|       # `CI_SHA_PUSH`: The commit SHA from the push event | ||||
|       # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) | ||||
|         # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Cleanup | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           rm -rf tests/__pycache__ | ||||
|           rm -rf tests/models/__pycache__ | ||||
|           rm -rf reports | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Fetch the tests to run | ||||
|         working-directory: /transformers | ||||
|         # TODO: add `git-python` in the docker images | ||||
|         run: | | ||||
|           pip install --upgrade git-python | ||||
|           python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt | ||||
|  | ||||
|       - name: Report fetched tests | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: test_fetched | ||||
|           path: /transformers/test_preparation.txt | ||||
|  | ||||
|       - id: set-matrix | ||||
|         name: Organize tests into models | ||||
|         working-directory: /transformers | ||||
|         # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. | ||||
|         # The `test_map` is used to get the actual identified test files under each key. | ||||
|         # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) | ||||
|         run: | | ||||
|           if [ -f test_map.json ]; then | ||||
|               keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') | ||||
|               test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') | ||||
|           else | ||||
|               keys=$(python3 -c 'keys = ["dummy"]; print(keys)') | ||||
|               test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') | ||||
|           fi | ||||
|           echo $keys | ||||
|           echo $test_map | ||||
|           echo "matrix=$keys" >> $GITHUB_OUTPUT | ||||
|           echo "test_map=$test_map" >> $GITHUB_OUTPUT | ||||
|  | ||||
|   run_tests_single_gpu: | ||||
|     name: Model tests | ||||
|     needs: setup | ||||
|     # `dummy` means there is no test to run | ||||
|     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup.outputs.matrix) }} | ||||
|         machine_type: [aws-g5-4xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu-push-ci | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: /transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|           else | ||||
|             machine_type=${{ matrix.machine_type }} | ||||
|           fi | ||||
|  | ||||
|           echo "$machine_type" | ||||
|           echo "machine_type=$machine_type" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: Echo folder ${{ matrix.folders }} | ||||
|         shell: bash | ||||
|         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | ||||
|         # set the artifact folder names (because the character `/` is not allowed). | ||||
|         run: | | ||||
|           echo "${{ matrix.folders }}" | ||||
|           echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" | ||||
|           matrix_folders=${{ matrix.folders }} | ||||
|           matrix_folders=${matrix_folders/'models/'/'models_'} | ||||
|           echo "$matrix_folders" | ||||
|           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
|           nvidia-smi | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Run all non-slow selected tests on GPU | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} | ||||
|  | ||||
|   run_tests_multi_gpu: | ||||
|     name: Model tests | ||||
|     needs: setup | ||||
|     # `dummy` means there is no test to run | ||||
|     if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         folders: ${{ fromJson(needs.setup.outputs.matrix) }} | ||||
|         machine_type: [aws-g5-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu-push-ci | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: /transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|           else | ||||
|             machine_type=${{ matrix.machine_type }} | ||||
|           fi | ||||
|  | ||||
|           echo "$machine_type" | ||||
|           echo "machine_type=$machine_type" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: Echo folder ${{ matrix.folders }} | ||||
|         shell: bash | ||||
|         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to | ||||
|         # set the artifact folder names (because the character `/` is not allowed). | ||||
|         run: | | ||||
|           echo "${{ matrix.folders }}" | ||||
|           echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" | ||||
|           matrix_folders=${{ matrix.folders }} | ||||
|           matrix_folders=${matrix_folders/'models/'/'models_'} | ||||
|           echo "$matrix_folders" | ||||
|           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
|           nvidia-smi | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Run all non-slow selected tests on GPU | ||||
|         env: | ||||
|           MKL_SERVICE_FORCE_INTEL: 1 | ||||
|         working-directory: /transformers | ||||
|         run: | | ||||
|           python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports | ||||
|           path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} | ||||
|  | ||||
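For readers unfamiliar with the `Prepare custom environment variables` pattern repeated in the jobs above: it strips the `refs/heads/` prefix from the `push` ref and falls back to the `workflow_run` values whenever the `push` fields are empty. A condensed, standalone sketch of that logic, runnable outside Actions (all variable values here are placeholders, not taken from a real event):

```bash
#!/usr/bin/env bash
# Stand-ins for the values GitHub injects; on a `workflow_run` event the *_PUSH fields are empty.
CI_BRANCH_PUSH="refs/heads/my-feature-branch"
CI_BRANCH_WORKFLOW_RUN="main"
CI_SHA_PUSH=""
CI_SHA_WORKFLOW_RUN="0123abcd"
GITHUB_ENV="$(mktemp)"   # locally, emulate the Actions environment file

# Drop the `refs/heads/` prefix so only the branch name remains.
CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}

# Prefer the `push` values; fall back to the `workflow_run` values when they are empty.
[[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> "$GITHUB_ENV" || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> "$GITHUB_ENV"
[[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> "$GITHUB_ENV" || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> "$GITHUB_ENV"

cat "$GITHUB_ENV"   # CI_BRANCH=my-feature-branch, CI_SHA=0123abcd
```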
|   run_tests_torch_cuda_extensions_single_gpu: | ||||
|     name: Torch CUDA extension tests | ||||
|     needs: setup | ||||
|     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g5-4xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: /workspace/transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|           else | ||||
|             machine_type=${{ matrix.machine_type }} | ||||
|           fi | ||||
|  | ||||
|           echo "$machine_type" | ||||
|           echo "machine_type=$machine_type" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /workspace/transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /workspace/transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: Remove cached torch extensions | ||||
|         run: rm -rf /github/home/.cache/torch_extensions/ | ||||
|  | ||||
|       # To avoid unknown test failures | ||||
|       - name: Pre build DeepSpeed *again* | ||||
|         working-directory: /workspace | ||||
|         run: | | ||||
|           python3 -m pip uninstall -y deepspeed | ||||
|           DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
|           nvidia-smi | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /workspace/transformers | ||||
|         run: | | ||||
|           python utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /workspace/transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Run all non-slow selected tests on GPU | ||||
|         working-directory: /workspace/transformers | ||||
|         # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. | ||||
|         run: | | ||||
|           python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports | ||||
|           path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports | ||||
|  | ||||
|   run_tests_torch_cuda_extensions_multi_gpu: | ||||
|     name: Torch CUDA extension tests | ||||
|     needs: setup | ||||
|     if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') | ||||
|     strategy: | ||||
|       fail-fast: false | ||||
|       matrix: | ||||
|         machine_type: [aws-g5-12xlarge-cache] | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - name: Set `machine_type` for report and artifact names | ||||
|         working-directory: /workspace/transformers | ||||
|         shell: bash | ||||
|         run: | | ||||
|           echo "${{ matrix.machine_type }}" | ||||
|  | ||||
|           if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then | ||||
|             machine_type=single-gpu | ||||
|           elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then | ||||
|             machine_type=multi-gpu | ||||
|           else | ||||
|             machine_type=${{ matrix.machine_type }} | ||||
|           fi | ||||
|  | ||||
|           echo "$machine_type" | ||||
|           echo "machine_type=$machine_type" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         working-directory: /workspace/transformers | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - name: Reinstall transformers in edit mode (remove the one installed during docker image build) | ||||
|         working-directory: /workspace/transformers | ||||
|         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . | ||||
|  | ||||
|       - name: Remove cached torch extensions | ||||
|         run: rm -rf /github/home/.cache/torch_extensions/ | ||||
|  | ||||
|       # To avoid unknown test failures | ||||
|       - name: Pre build DeepSpeed *again* | ||||
|         working-directory: /workspace | ||||
|         run: | | ||||
|           python3 -m pip uninstall -y deepspeed | ||||
|           DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check | ||||
|  | ||||
|       - name: NVIDIA-SMI | ||||
|         run: | | ||||
|           nvidia-smi | ||||
|  | ||||
|       - name: Environment | ||||
|         working-directory: /workspace/transformers | ||||
|         run: | | ||||
|           python utils/print_env.py | ||||
|  | ||||
|       - name: Show installed libraries and their versions | ||||
|         working-directory: /workspace/transformers | ||||
|         run: pip freeze | ||||
|  | ||||
|       - name: Run all non-slow selected tests on GPU | ||||
|         working-directory: /workspace/transformers | ||||
|         # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. | ||||
|         run: | | ||||
|           python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended | ||||
|  | ||||
|       - name: Failure short reports | ||||
|         if: ${{ failure() }} | ||||
|         continue-on-error: true | ||||
|         run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt | ||||
|  | ||||
|       - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" | ||||
|         if: ${{ always() }} | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports | ||||
|           path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports | ||||
|  | ||||
|   send_results: | ||||
|     name: Send results to webhook | ||||
|     runs-on: ubuntu-22.04 | ||||
|     if: always() | ||||
|     needs: [ | ||||
|         setup, | ||||
|         run_tests_single_gpu, | ||||
|         run_tests_multi_gpu, | ||||
|         run_tests_torch_cuda_extensions_single_gpu, | ||||
|         run_tests_torch_cuda_extensions_multi_gpu | ||||
|     ] | ||||
|     env: | ||||
|       # For the meaning of these environment variables, see the job `Setup` | ||||
|       CI_BRANCH_PUSH: ${{ github.event.ref }} | ||||
|       CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} | ||||
|       CI_SHA_PUSH: ${{ github.event.head_commit.id }} | ||||
|       CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} | ||||
|     steps: | ||||
|       - name: Preliminary job status | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           echo "Setup status: ${{ needs.setup.result }}" | ||||
|  | ||||
|       # Necessary to get the correct branch name and commit SHA for `workflow_run` event | ||||
|       # We also take into account the `push` event (we might want to test some changes in a branch) | ||||
|       - name: Prepare custom environment variables | ||||
|         shell: bash | ||||
|         # For the meaning of these environment variables, see the job `Setup` | ||||
|         run: | | ||||
|           CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} | ||||
|           echo $CI_BRANCH_PUSH | ||||
|           echo $CI_BRANCH_WORKFLOW_RUN | ||||
|           echo $CI_SHA_PUSH | ||||
|           echo $CI_SHA_WORKFLOW_RUN | ||||
|           [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|           [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV | ||||
|  | ||||
|       - name: print environment variables | ||||
|         run: | | ||||
|           echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" | ||||
|           echo "env.CI_SHA = ${{ env.CI_SHA }}" | ||||
|  | ||||
|       - uses: actions/checkout@v4 | ||||
|         # To avoid failure when multiple commits are merged into `main` in a short period of time. | ||||
|         # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... | ||||
|         # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) | ||||
|         with: | ||||
|           fetch-depth: 20 | ||||
|  | ||||
|       - name: Update clone using environment variables | ||||
|         run: | | ||||
|           echo "original branch = $(git branch --show-current)" | ||||
|           git fetch && git checkout ${{ env.CI_BRANCH }} | ||||
|           echo "updated branch = $(git branch --show-current)" | ||||
|           git checkout ${{ env.CI_SHA }} | ||||
|           echo "log = $(git log -n 1)" | ||||
|  | ||||
|       - uses: actions/download-artifact@v4 | ||||
|       - name: Send message to Slack | ||||
|         env: | ||||
|           CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} | ||||
|           CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | ||||
|           CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} | ||||
|           CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} | ||||
|           CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} | ||||
|           ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} | ||||
|           CI_EVENT: push | ||||
|           CI_TITLE_PUSH: ${{ github.event.head_commit.message }} | ||||
|           CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} | ||||
|           CI_SHA: ${{ env.CI_SHA }} | ||||
|           SETUP_STATUS: ${{ needs.setup.result }} | ||||
|  | ||||
|         # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change | ||||
|         # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. | ||||
|         run: | | ||||
|           pip install huggingface_hub | ||||
|           pip install slack_sdk | ||||
|           pip show slack_sdk | ||||
|           python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" | ||||
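The comment before the final step notes that `notification_service.py` has to map test folders such as `models/bert` to the `models_bert` form used in artifact names, since `/` is not allowed there. A minimal bash illustration of that mapping, using the same substitution as the `Echo folder` step earlier in this workflow (the folder names are examples only):

```bash
for folder in "models/bert" "models/vit" "pipelines"; do
  # Replace the `models/` prefix with `models_`, matching the artifact naming scheme.
  normalized=${folder/'models/'/'models_'}
  echo "$folder -> $normalized"
done
# models/bert -> models_bert
# models/vit  -> models_vit
# pipelines   -> pipelines
```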
2  .github/workflows/self-scheduled-caller.yml  vendored
							| @ -63,7 +63,7 @@ jobs: | ||||
|     with: | ||||
|       job: run_pipelines_torch_gpu | ||||
|       slack_report_channel: "#transformers-ci-daily-pipeline-torch" | ||||
|       docker: huggingface/transformers-all-latest-gpu | ||||
|       docker: huggingface/transformers-pytorch-gpu | ||||
|       ci_event: Daily CI | ||||
|       report_repo_id: hf-internal-testing/transformers_daily_ci | ||||
|       commit_sha: ${{ github.sha }} | ||||
|  | ||||
| @ -1,60 +0,0 @@ | ||||
| name: Nvidia CI - Flash Attn | ||||
|  | ||||
| on: | ||||
|   repository_dispatch: | ||||
|   schedule: | ||||
|     - cron: "17 2 * * *" | ||||
|   push: | ||||
|     branches: | ||||
|       - run_nvidia_ci_flash_attn* | ||||
|   workflow_dispatch: | ||||
|     inputs: | ||||
|       prev_workflow_run_id: | ||||
|         description: 'previous workflow run id to compare' | ||||
|         type: string | ||||
|         required: false | ||||
|         default: "" | ||||
|       other_workflow_run_id: | ||||
|         description: 'other workflow run id to compare' | ||||
|         type: string | ||||
|         required: false | ||||
|         default: "" | ||||
|  | ||||
|  | ||||
| # Used for `push` to easily modify the target workflow runs to compare against | ||||
| env: | ||||
|     prev_workflow_run_id: "" | ||||
|     other_workflow_run_id: "" | ||||
|  | ||||
|  | ||||
| jobs: | ||||
|   setup: | ||||
|     name: Setup | ||||
|     runs-on: ubuntu-22.04 | ||||
|     steps: | ||||
|       - name: Setup | ||||
|         run: | | ||||
|           mkdir "setup_values" | ||||
|           echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt" | ||||
|           echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt" | ||||
|  | ||||
|       - name: Upload artifacts | ||||
|         uses: actions/upload-artifact@v4 | ||||
|         with: | ||||
|           name: setup_values | ||||
|           path: setup_values | ||||
|  | ||||
|  | ||||
|   model-ci: | ||||
|     name: Model CI | ||||
|     uses: ./.github/workflows/self-scheduled.yml | ||||
|     with: | ||||
|       job: run_models_gpu | ||||
|       slack_report_channel: "#transformers-ci-flash-attn" | ||||
|       docker: huggingface/transformers-all-latest-gpu:flash-attn | ||||
|       ci_event: Daily CI | ||||
|       runner_type: "a10" | ||||
|       report_repo_id: hf-internal-testing/transformers_flash_attn_ci | ||||
|       commit_sha: ${{ github.sha }} | ||||
|       pytest_marker: "flash_attn_test or flash_attn_3_test" | ||||
|     secrets: inherit | ||||
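The deleted caller above passed `pytest_marker: "flash_attn_test or flash_attn_3_test"` down to the reusable workflow, and the next hunk removes the corresponding `pytest_marker` input. A marker string of that shape is typically consumed as a pytest `-m` expression; how exactly the reusable workflow applied it is not shown here, so the following is only an illustration (the test path is a placeholder):

```bash
# Select only tests carrying either marker; everything else is deselected.
python3 -m pytest -m "flash_attn_test or flash_attn_3_test" tests/models/llama -v
```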
7  .github/workflows/self-scheduled.yml  vendored
							| @ -38,10 +38,6 @@ on: | ||||
|         default: "" | ||||
|         required: false | ||||
|         type: string | ||||
|       pytest_marker: | ||||
|         required: false | ||||
|         type: string | ||||
|  | ||||
|  | ||||
| env: | ||||
|   HF_HOME: /mnt/cache | ||||
| @ -131,7 +127,6 @@ jobs: | ||||
|       commit_sha: ${{ inputs.commit_sha || github.sha }} | ||||
|       runner_type: ${{ inputs.runner_type }} | ||||
|       report_repo_id: ${{ inputs.report_repo_id }} | ||||
|       pytest_marker: ${{ inputs.pytest_marker }} | ||||
|     secrets: inherit | ||||
|  | ||||
|   run_trainer_and_fsdp_gpu: | ||||
| @ -165,7 +160,7 @@ jobs: | ||||
|     runs-on: | ||||
|       group: '${{ matrix.machine_type }}' | ||||
|     container: | ||||
|       image: huggingface/transformers-all-latest-gpu | ||||
|       image: huggingface/transformers-pytorch-gpu | ||||
|       options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ | ||||
|     steps: | ||||
|       - name: Update clone | ||||
|  | ||||
13  .github/workflows/ssh-runner.yml  vendored
							| @ -41,9 +41,9 @@ jobs: | ||||
|           elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "t4" ]]; then | ||||
|             echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV | ||||
|           elif [[ "$NUM_GPUS" == "single" && "$RUNNER_TYPE" == "a10" ]]; then | ||||
|             echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV | ||||
|             echo "RUNNER=aws-g5-4xlarge-cache-ssh-use2" >> $GITHUB_ENV | ||||
|           elif [[ "$NUM_GPUS" == "multi" && "$RUNNER_TYPE" == "a10" ]]; then | ||||
|             echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV | ||||
|             echo "RUNNER=aws-g5-12xlarge-cache-ssh" >> $GITHUB_ENV | ||||
|           else | ||||
|             echo "RUNNER=" >> $GITHUB_ENV | ||||
|           fi | ||||
| @ -106,7 +106,7 @@ jobs: | ||||
|           else | ||||
|             echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV | ||||
|           fi | ||||
|  | ||||
|          | ||||
|       - name: Tailscale # In order to be able to SSH when a test fails | ||||
|         uses: huggingface/tailscale-action@main | ||||
|         with: | ||||
| @ -115,3 +115,10 @@ jobs: | ||||
|           slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} | ||||
|           waitForSSH: true | ||||
|           sshTimeout: 15m | ||||
|           version: '1.90.3' | ||||
|           sha256sum: '96411140a11ccdfff6243b88e3f84692e1f176990050fb9f43a53970c0873f31' | ||||
|  | ||||
|       - name: wait2 | ||||
|         if: success() || failure() | ||||
|         shell: bash | ||||
|         run: sleep 15m | ||||
|  | ||||
| @ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase | ||||
|  | ||||
| - PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff. | ||||
| - When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model. | ||||
| - Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code. | ||||
| - Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code. | ||||
|  | ||||
| ## Copying and inheritance | ||||
|  | ||||
| @ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a | ||||
| the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py` | ||||
| If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`. | ||||
|  | ||||
| In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them. | ||||
| In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them. | ||||
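Taken together, the AGENTS.md guidance in this hunk amounts to a short local loop. A sketch of it, assuming an editable clone of the repository and using `bert` purely as an example model name:

```bash
# Style and test extras (quote ".[...]" instead if your shell expands square brackets).
pip install -e .[quality]
pip install -e .[testing]
pip install torch accelerate   # only if they are not already in the environment

# Apply style and consistency fixes, then run the tests for the model you touched.
make fixup
pytest tests/models/bert/test_modeling_bert.py
```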
| @ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push | ||||
| adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR | ||||
| to add it. | ||||
|  | ||||
| ## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence) | ||||
|  | ||||
| [Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications. | ||||
|  | ||||
| Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code | ||||
|  | ||||
| ## [gpt4all](https://github.com/nomic-ai/gpt4all) | ||||
|  | ||||
| [gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style. | ||||
|  | ||||
| @ -9,15 +9,10 @@ SHELL ["sh", "-lc"] | ||||
| # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant | ||||
| # to be used as arguments for docker build (so far). | ||||
|  | ||||
| ARG PYTORCH='2.9.0' | ||||
| ARG PYTORCH='2.8.0' | ||||
| # Example: `cu102`, `cu113`, etc. | ||||
| ARG CUDA='cu126' | ||||
|  | ||||
| # This needs to be compatible with the above `PYTORCH`. | ||||
| ARG TORCHCODEC='0.8.0' | ||||
|  | ||||
| ARG FLASH_ATTN='false' | ||||
|  | ||||
| RUN apt update | ||||
| RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs | ||||
| RUN git lfs install | ||||
| @ -26,44 +21,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip | ||||
| ARG REF=main | ||||
| RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir -e ./transformers[dev] | ||||
|  | ||||
| # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future. | ||||
| # 2. For `torchcodec`, use `cpu` as we don't have `libnvcuvid.so` on the host runner. See https://github.com/meta-pytorch/torchcodec/issues/912 | ||||
| #    **Important**: We need to specify `torchcodec` version if the torch version is not the latest stable one. | ||||
| # 3. `set -e` means "exit immediately if any command fails". | ||||
| RUN set -e; \ | ||||
|     # Determine torch version | ||||
|     if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \ | ||||
|         VERSION="torch==${PYTORCH}.*"; \ | ||||
|         TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \ | ||||
|     else \ | ||||
|         VERSION="torch"; \ | ||||
|         TORCHCODEC_VERSION="torchcodec"; \ | ||||
|     fi; \ | ||||
|     \ | ||||
|     # Log the version being installed | ||||
|     echo "Installing torch version: $VERSION"; \ | ||||
|     \ | ||||
|     # Install PyTorch packages | ||||
|     if [ "$PYTORCH" != "pre" ]; then \ | ||||
|         python3 -m pip install --no-cache-dir -U \ | ||||
|             $VERSION \ | ||||
|             torchvision \ | ||||
|             torchaudio \ | ||||
|             --extra-index-url https://download.pytorch.org/whl/$CUDA; \ | ||||
|         # We need to specify the version if the torch version is not the latest stable one. | ||||
|         python3 -m pip install --no-cache-dir -U \ | ||||
|             $TORCHCODEC_VERSION --extra-index-url https://download.pytorch.org/whl/cpu; \ | ||||
|     else \ | ||||
|         python3 -m pip install --no-cache-dir -U --pre \ | ||||
|             torch \ | ||||
|             torchvision \ | ||||
|             torchaudio \ | ||||
|             --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA; \ | ||||
|         python3 -m pip install --no-cache-dir -U --pre \ | ||||
|             torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu; \ | ||||
|     fi | ||||
| # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`. | ||||
| #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions). | ||||
| # 3. For `torchcodec<0.8`: this is quickly added as torch 2.9.0 + torchcodec 0.8.0 fails on our CI env. Need to remove later once they work. | ||||
| RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio "torchcodec<0.8" --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA | ||||
|  | ||||
| RUN python3 -m pip install --no-cache-dir -U timm | ||||
|  | ||||
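The compact single-line `RUN` in the hunk above packs the torch-version selection into one command chain. An unrolled shell sketch of the same logic, offered as a reading aid rather than a proposed Dockerfile change (the editable install of `./transformers[dev,onnxruntime]` that precedes it is omitted):

```bash
# PYTORCH and CUDA come from the Dockerfile ARGs, e.g. PYTORCH=2.8.0, CUDA=cu126.
if [ -n "$PYTORCH" ] && [ "$PYTORCH" != "pre" ]; then
    VERSION="torch==${PYTORCH}.*"   # pin to the requested release series
else
    VERSION="torch"                 # no pin: latest stable or nightly
fi
echo "torch=$VERSION"

if [ "$PYTORCH" != "pre" ]; then
    # Stable channel for the given CUDA build; torchcodec is capped below 0.8 (see the comment above).
    python3 -m pip install --no-cache-dir -U "$VERSION" torchvision torchaudio "torchcodec<0.8" \
        --extra-index-url "https://download.pytorch.org/whl/$CUDA"
else
    # Nightly channel when PYTORCH=pre.
    python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec \
        --extra-index-url "https://download.pytorch.org/whl/nightly/$CUDA"
fi
```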
| @ -92,7 +54,7 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes | ||||
| RUN python3 -m pip install --no-cache-dir quanto | ||||
|  | ||||
| # After using A10 as CI runner, let's run FA2 tests | ||||
| RUN [ "$FLASH_ATTN" != "false" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch" | ||||
| RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch" | ||||
|  | ||||
| # TODO (ydshieh): check this again | ||||
| # `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests | ||||
|  | ||||
| @ -38,7 +38,7 @@ pip install transformers[dev] | ||||
| or for an editable install: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[dev]" | ||||
| pip install -e .[dev] | ||||
| ``` | ||||
|  | ||||
| inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do | ||||
| @ -50,7 +50,7 @@ pip install transformers[quality] | ||||
| or for an editable install: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[quality]" | ||||
| pip install -e .[quality] | ||||
| ``` | ||||
|  | ||||
| ## Tests | ||||
|  | ||||
| @ -37,7 +37,7 @@ pip install transformers[dev] | ||||
| o una instalación editable: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[dev]" | ||||
| pip install -e .[dev] | ||||
| ``` | ||||
|  | ||||
| del repositorio de Transformers. | ||||
|  | ||||
| @ -37,7 +37,7 @@ pip install transformers[dev] | ||||
| o un'installazione modificabile: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[dev]" | ||||
| pip install -e .[dev] | ||||
| ``` | ||||
|  | ||||
| all'interno del repo Transformers. | ||||
|  | ||||
| @ -40,7 +40,7 @@ pip install transformers[dev] | ||||
|  | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[dev]" | ||||
| pip install -e .[dev] | ||||
| ``` | ||||
|  | ||||
| トランスフォーマーズのリポジトリ内で作業しています。トランスフォーマーズのオプションの依存関係の数が増えたため、すべてを取得できない可能性があります。開発用インストールが失敗した場合、作業しているディープラーニングフレームワーク(PyTorch、TensorFlow、および/またはFlax)をインストールし、次の手順を実行してください。 | ||||
| @ -53,7 +53,7 @@ pip install transformers[quality] | ||||
| または編集可能なインストールの場合: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[quality]" | ||||
| pip install -e .[quality] | ||||
| ``` | ||||
|  | ||||
| ## Tests | ||||
|  | ||||
| @ -37,7 +37,7 @@ pip install transformers[dev] | ||||
| 또는 Transformers 저장소 내에 편집 가능한 설치가 필요합니다: | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[dev]" | ||||
| pip install -e .[dev] | ||||
| ``` | ||||
|  | ||||
| Transformers의 선택적 종속성 수가 많이 늘어났기 때문에 개발 설치를 실패할 수도 있습니다. 개발 설치가 실패하는 경우, 작업 중인 Deep Learning 프레임워크 (PyTorch, TensorFlow 및/또는 Flax)를 설치하고 다음 명령을 실행하세요. | ||||
| @ -49,7 +49,7 @@ pip install transformers[quality] | ||||
| 편집 가능한 설치의 경우는 다음 명령을 실행하세요. | ||||
|  | ||||
| ```bash | ||||
| pip install -e ".[quality]" | ||||
| pip install -e .[quality] | ||||
| ``` | ||||
|  | ||||
|  | ||||
|  | ||||
| @ -308,19 +308,11 @@ def main(): | ||||
|             api = HfApi() | ||||
|             repo_id = api.create_repo(repo_name, exist_ok=True, token=args.hub_token).repo_id | ||||
|  | ||||
|             os.makedirs(args.output_dir, exist_ok=True) | ||||
|             gitignore_path = os.path.join(args.output_dir, ".gitignore") | ||||
|             content = "" | ||||
|             if os.path.exists(gitignore_path): | ||||
|                 with open(gitignore_path, "r") as f: | ||||
|                     content = f.read() | ||||
|             with open(gitignore_path, "a") as f: | ||||
|                 if content and not content.endswith("\n"): | ||||
|                     f.write("\n") | ||||
|                 if "step_*" not in content: | ||||
|                     f.write("step_*\n") | ||||
|                 if "epoch_*" not in content: | ||||
|                     f.write("epoch_*\n") | ||||
|             with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore: | ||||
|                 if "step_*" not in gitignore: | ||||
|                     gitignore.write("step_*\n") | ||||
|                 if "epoch_*" not in gitignore: | ||||
|                     gitignore.write("epoch_*\n") | ||||
|         elif args.output_dir is not None: | ||||
|             os.makedirs(args.output_dir, exist_ok=True) | ||||
|     accelerator.wait_for_everyone() | ||||
|  | ||||
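One side of the hunk above reads the existing `.gitignore` and appends only the patterns that are missing. A compact shell rendering of that idempotent append, kept as an aside rather than a suggested change to the script (the directory path is a placeholder for `args.output_dir`):

```bash
OUTPUT_DIR="./output"
mkdir -p "$OUTPUT_DIR"
touch "$OUTPUT_DIR/.gitignore"
for pattern in 'step_*' 'epoch_*'; do
    # Append the pattern only if an identical line is not already present.
    grep -qxF "$pattern" "$OUTPUT_DIR/.gitignore" || echo "$pattern" >> "$OUTPUT_DIR/.gitignore"
done
```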
| @ -33,9 +33,9 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | ||||
| | [Quicktour of the library](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)  | A presentation of the various APIs in Transformers |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| | | ||||
| | [Summary of the tasks](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)  | How to run the models of the Transformers library task by task |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| | | ||||
| | [Preprocessing data](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)  | How to use a tokenizer to preprocess your data |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)|| | ||||
| | [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)  | How to use the Trainer to fine-tune a pretrained model |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| | ||||
| | [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)  | The differences between the tokenizers algorithm |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb )| | ||||
| | [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)  | How to use the multilingual models of the library |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| | ||||
| | [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)  | How to use the Trainer to fine-tune a pretrained model |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| | | ||||
| | [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)  | The differences between the tokenizers algorithm |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb )| | ||||
| | [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)  | How to use the multilingual models of the library |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| | ||||
|  | ||||
| ### PyTorch Examples | ||||
|  | ||||
| @ -43,14 +43,14 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | ||||
|  | ||||
| | Notebook     |      Description      |   |   |   | | ||||
| |:----------|:-------------|:-------------|:-------------|------:| | ||||
| | [Train your tokenizer](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)  | How to train and use your very own tokenizer  |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| | ||||
| | [Train your language model](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)   | How to easily start using transformers  |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| | ||||
| | [Train your tokenizer](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)  | How to train and use your very own tokenizer  |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| | ||||
| | [Train your language model](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)   | How to easily start using transformers  |[](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|[](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| | ||||
| | [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| | | ||||
| | [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| | ||||
| | [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| | ||||
| | [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| | ||||
| | [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| | ||||
| | [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)|[](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| | ||||
| | [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| | ||||
| | [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| | | ||||
| | [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| | | ||||
| | [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| | | ||||
| | [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| | | ||||
| | [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| | | ||||
| | [How to train a language model from scratch](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| | | ||||
| | [How to generate text](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| | | ||||
| @ -58,16 +58,16 @@ You can open any page of the documentation as a notebook in Colab (there is a bu | ||||
|  | ||||
| #### Computer Vision[[pytorch-cv]] | ||||
|  | ||||
| | Notebook                                                                                                                                                                   | Description                                                                                                            |                                                                                                                                                                                                            |   |   | | ||||
| |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------|------:| | ||||
| | [How to fine-tune a model on image classification (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                   | Show how to preprocess the data using Torchvision and fine-tune any pretrained Vision model on Image Classification    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                 | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| [](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| | ||||
| | [How to fine-tune a model on image classification (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | Show how to preprocess the data using Albumentations and fine-tune any pretrained Vision model on Image Classification | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)  | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| | | ||||
| | [How to fine-tune a model on image classification (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)                 | Show how to preprocess the data using Kornia and fine-tune any pretrained Vision model on Image Classification         | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)          | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| | | ||||
| | [How to perform zero-shot object detection with OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)          | Show how to perform zero-shot object detection on images with text queries                                             | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| | | ||||
| | [How to fine-tune an image captioning model](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                                      | Show how to fine-tune BLIP for image captioning on a custom dataset                                                    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| | | ||||
| | [How to build an image similarity system with Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                            | Show how to build an image similarity system                                                                           | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                     | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| | | ||||
| | [How to fine-tune a SegFormer model on semantic segmentation](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                     | Show how to preprocess the data and fine-tune a pretrained SegFormer model on Semantic Segmentation                    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| | | ||||
| | [How to fine-tune a VideoMAE model on video classification](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb)          | Show how to preprocess the data and fine-tune a pretrained VideoMAE model on Video Classification                      | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| | | ||||
| | Notebook                                                                                                                                                                   | Description                                                                                                            |                                                                                                                                                                                                            |   | | ||||
| |:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:| | ||||
| | [How to fine-tune a model on image classification (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                   | Show how to preprocess the data using Torchvision and fine-tune any pretrained Vision model on Image Classification    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                 | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| | ||||
| | [How to fine-tune a model on image classification (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | Show how to preprocess the data using Albumentations and fine-tune any pretrained Vision model on Image Classification | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)  | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| | ||||
| | [How to fine-tune a model on image classification (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)                 | Show how to preprocess the data using Kornia and fine-tune any pretrained Vision model on Image Classification         | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)          | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| | ||||
| | [How to perform zero-shot object detection with OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)          | Show how to perform zero-shot object detection on images with text queries                                             | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| | ||||
| | [How to fine-tune an image captioning model](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                                      | Show how to fine-tune BLIP for image captioning on a custom dataset                                                    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| | ||||
| | [How to build an image similarity system with Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                            | Show how to build an image similarity system                                                                           | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                     | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| | ||||
| | [How to fine-tune a SegFormer model on semantic segmentation](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                     | Show how to preprocess the data and fine-tune a pretrained SegFormer model on Semantic Segmentation                    | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| | ||||
| | [How to fine-tune a VideoMAE model on video classification](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb)          | Show how to preprocess the data and fine-tune a pretrained VideoMAE model on Video Classification                      | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)                | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| | ||||
|  | ||||
| #### Audio[[pytorch-audio]] | ||||
|  | ||||
|  | ||||
							
								
								
									
setup.py (2 changes)

setup.py (2 changes)
							| @ -104,7 +104,7 @@ _deps = [ | ||||
|     "deepspeed>=0.9.3", | ||||
|     "diffusers", | ||||
|     "dill<0.3.5", | ||||
|     "evaluate>=0.4.6", | ||||
|     "evaluate>=0.2.0", | ||||
|     "faiss-cpu", | ||||
|     "fastapi", | ||||
|     "filelock", | ||||
|  | ||||
| @ -14,7 +14,7 @@ deps = { | ||||
|     "deepspeed": "deepspeed>=0.9.3", | ||||
|     "diffusers": "diffusers", | ||||
|     "dill": "dill<0.3.5", | ||||
|     "evaluate": "evaluate>=0.4.6", | ||||
|     "evaluate": "evaluate>=0.2.0", | ||||
|     "faiss-cpu": "faiss-cpu", | ||||
|     "fastapi": "fastapi", | ||||
|     "filelock": "filelock", | ||||
|  | ||||
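Context sketch (not part of the diffs above; `require_version` is the existing helper in `transformers.utils.versions`): the pin strings collected in `_deps` / `deps` are what the runtime dependency check feeds into the version checker, so relaxing the `evaluate` pin changes which installed versions pass this call.

from transformers.utils.versions import require_version

# The relaxed pin from the hunks above; raises an error if the installed
# `evaluate` does not satisfy the requirement.
require_version("evaluate>=0.2.0", "To fix: pip install -U evaluate")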
| @ -314,14 +314,13 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict): | ||||
|         args = (state_dict, prefix, local_metadata, True, [], [], error_msgs) | ||||
|         # Parameters of module and children will start with prefix. We can exit early if there are none in this | ||||
|         # state_dict | ||||
|         if is_deepspeed_zero3_enabled(): | ||||
|         if is_deepspeed_zero3_enabled() and len([key for key in state_dict if key.startswith(prefix)]) > 0: | ||||
|             import deepspeed | ||||
|  | ||||
|             # In sharded models, each shard has only part of the full state_dict, so only gather | ||||
|             # parameters that are in the current state_dict. | ||||
|             named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False)) | ||||
|             params_to_gather = [named_parameters[k] for k in named_parameters if k in state_dict] | ||||
|  | ||||
|             params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters] | ||||
|             if len(params_to_gather) > 0: | ||||
|                 # because zero3 puts placeholders in model params, this context | ||||
|                 # manager gathers (unpartitions) the params of the current layer, then loads from | ||||
|  | ||||
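Standalone sketch of the gather-then-load step that the truncated comment above refers to, assuming DeepSpeed is installed; the helper name `load_into_zero3_module` is hypothetical, but `deepspeed.zero.GatheredParameters` is the real context manager used for ZeRO-3 partitioned parameters.

import deepspeed
import torch.nn as nn


def load_into_zero3_module(module: nn.Module, state_dict: dict, prefix: str) -> None:
    # Only parameters that are both owned by this module and present in the current
    # shard's state_dict are gathered, mirroring the filtering in the hunk above.
    # `prefix` is assumed to end with a trailing dot, as in the caller.
    named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
    params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
    if params_to_gather:
        # ZeRO-3 leaves empty placeholders in module params; GatheredParameters
        # temporarily unpartitions them on rank 0 so the load sees full tensors.
        with deepspeed.zero.GatheredParameters(params_to_gather, modifier_rank=0):
            module._load_from_state_dict(state_dict, prefix, {}, True, [], [], [])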
| @ -628,7 +628,7 @@ def maybe_load_adapters( | ||||
|     **adapter_kwargs, | ||||
| ): | ||||
|     if pretrained_model_name_or_path is None or not is_peft_available(): | ||||
|         return None, pretrained_model_name_or_path, adapter_kwargs | ||||
|         return None, pretrained_model_name_or_path | ||||
|  | ||||
|     token = download_kwargs.get("token") | ||||
|  | ||||
| @ -651,15 +651,13 @@ def maybe_load_adapters( | ||||
|  | ||||
|     _adapter_model_path = adapter_kwargs.pop("_adapter_model_path", None) | ||||
|  | ||||
|     token_from_adapter_kwargs = adapter_kwargs.pop("token", None) | ||||
|  | ||||
|     if _adapter_model_path is None: | ||||
|         _adapter_model_path = find_adapter_config_file( | ||||
|             pretrained_model_name_or_path, | ||||
|             cache_dir=download_kwargs.get("cache_dir"), | ||||
|             force_download=bool(download_kwargs.get("force_download", False)), | ||||
|             proxies=download_kwargs.get("proxies"), | ||||
|             token=token or token_from_adapter_kwargs, | ||||
|             token=token, | ||||
|             revision=download_kwargs.get("revision"), | ||||
|             local_files_only=bool(download_kwargs.get("local_files_only", False)), | ||||
|             subfolder=download_kwargs.get("subfolder", ""), | ||||
| @ -672,4 +670,4 @@ def maybe_load_adapters( | ||||
|             _adapter_model_path = pretrained_model_name_or_path | ||||
|             pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"] | ||||
|  | ||||
|     return _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs | ||||
|     return _adapter_model_path, pretrained_model_name_or_path | ||||
|  | ||||
| @ -4353,7 +4353,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH | ||||
|         if adapter_kwargs is None: | ||||
|             adapter_kwargs = {} | ||||
|  | ||||
|         _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs = maybe_load_adapters( | ||||
|         _adapter_model_path, pretrained_model_name_or_path = maybe_load_adapters( | ||||
|             pretrained_model_name_or_path, | ||||
|             download_kwargs_with_commit, | ||||
|             **adapter_kwargs, | ||||
|  | ||||
| @ -538,12 +538,12 @@ class BartEncoder(BartPreTrainedModel): | ||||
|         self.max_source_positions = config.max_position_embeddings | ||||
|         embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 | ||||
|  | ||||
|         self.embed_tokens = BartScaledWordEmbedding( | ||||
|             config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale | ||||
|         ) | ||||
|  | ||||
|         if embed_tokens is not None: | ||||
|             self.embed_tokens = embed_tokens | ||||
|         else: | ||||
|             self.embed_tokens = BartScaledWordEmbedding( | ||||
|                 config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale | ||||
|             ) | ||||
|             self.embed_tokens.weight = embed_tokens.weight | ||||
|  | ||||
|         self.embed_positions = BartLearnedPositionalEmbedding( | ||||
|             config.max_position_embeddings, | ||||
| @ -682,12 +682,12 @@ class BartDecoder(BartPreTrainedModel): | ||||
|         self.max_target_positions = config.max_position_embeddings | ||||
|         embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 | ||||
|  | ||||
|         self.embed_tokens = BartScaledWordEmbedding( | ||||
|             config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale | ||||
|         ) | ||||
|  | ||||
|         if embed_tokens is not None: | ||||
|             self.embed_tokens = embed_tokens | ||||
|         else: | ||||
|             self.embed_tokens = BartScaledWordEmbedding( | ||||
|                 config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale | ||||
|             ) | ||||
|             self.embed_tokens.weight = embed_tokens.weight | ||||
|  | ||||
|         self.embed_positions = BartLearnedPositionalEmbedding( | ||||
|             config.max_position_embeddings, | ||||
|  | ||||
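Illustrative sketch of the two embedding-sharing strategies visible in the Bart hunks above, using a plain `nn.Embedding` in place of `BartScaledWordEmbedding`:

import torch.nn as nn

vocab_size, embed_dim, padding_idx = 50265, 768, 1
shared = nn.Embedding(vocab_size, embed_dim, padding_idx)  # model-level shared embedding

# Strategy 1: adopt the passed-in module directly.
embed_tokens_a = shared

# Strategy 2: keep a module of our own but tie its weight to the shared Parameter.
embed_tokens_b = nn.Embedding(vocab_size, embed_dim, padding_idx)
embed_tokens_b.weight = shared.weight

assert embed_tokens_a.weight is shared.weight and embed_tokens_b.weight is shared.weight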
| @ -22,7 +22,7 @@ import torch | ||||
| from torch import nn | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...masking_utils import create_causal_mask | ||||
| from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask | ||||
| from ...modeling_layers import GradientCheckpointingLayer | ||||
| from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput | ||||
| from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel | ||||
| @ -310,6 +310,7 @@ class CLIPAttention(nn.Module): | ||||
|         self, | ||||
|         hidden_states: torch.Tensor, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         causal_attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: | ||||
|         """Input shape: Batch x Time x Channel""" | ||||
| @ -323,6 +324,15 @@ class CLIPAttention(nn.Module): | ||||
|         queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         # CLIP text model uses both `causal_attention_mask` and `attention_mask` | ||||
|         # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` | ||||
|         if self.config._attn_implementation == "flash_attention_2": | ||||
|             self.is_causal = causal_attention_mask is not None | ||||
|         else: | ||||
|             if attention_mask is not None and causal_attention_mask is not None: | ||||
|                 attention_mask = attention_mask + causal_attention_mask | ||||
|             elif causal_attention_mask is not None: | ||||
|                 attention_mask = causal_attention_mask | ||||
|  | ||||
|         attention_interface: Callable = eager_attention_forward | ||||
|         if self.config._attn_implementation != "eager": | ||||
| @ -334,12 +344,13 @@ class CLIPAttention(nn.Module): | ||||
|             keys, | ||||
|             values, | ||||
|             attention_mask, | ||||
|             is_causal=self.is_causal, | ||||
|             scaling=self.scale, | ||||
|             dropout=0.0 if not self.training else self.dropout, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
|         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() | ||||
|         attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() | ||||
|         attn_output = self.out_proj(attn_output) | ||||
|  | ||||
|         return attn_output, attn_weights | ||||
| @ -373,14 +384,16 @@ class CLIPEncoderLayer(GradientCheckpointingLayer): | ||||
|         self, | ||||
|         hidden_states: torch.Tensor, | ||||
|         attention_mask: torch.Tensor, | ||||
|         causal_attention_mask: torch.Tensor, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> torch.FloatTensor: | ||||
|         residual = hidden_states | ||||
|  | ||||
|         hidden_states = self.layer_norm1(hidden_states) | ||||
|         hidden_states, _ = self.self_attn( | ||||
|         hidden_states, attn_weights = self.self_attn( | ||||
|             hidden_states=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             causal_attention_mask=causal_attention_mask, | ||||
|             **kwargs, | ||||
|         ) | ||||
|         hidden_states = residual + hidden_states | ||||
| @ -484,6 +497,7 @@ class CLIPEncoder(nn.Module): | ||||
|         self, | ||||
|         inputs_embeds, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         causal_attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> BaseModelOutput: | ||||
|         r""" | ||||
| @ -498,6 +512,13 @@ class CLIPEncoder(nn.Module): | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|  | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|             causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): | ||||
|                 Causal mask for the text model. Mask values selected in `[0, 1]`: | ||||
|  | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|  | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|         """ | ||||
|         hidden_states = inputs_embeds | ||||
| @ -505,6 +526,7 @@ class CLIPEncoder(nn.Module): | ||||
|             hidden_states = encoder_layer( | ||||
|                 hidden_states, | ||||
|                 attention_mask, | ||||
|                 causal_attention_mask, | ||||
|                 **kwargs, | ||||
|             ) | ||||
|  | ||||
| @ -541,19 +563,17 @@ class CLIPTextTransformer(nn.Module): | ||||
|  | ||||
|         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) | ||||
|  | ||||
|         attention_mask = create_causal_mask( | ||||
|             config=self.config, | ||||
|             input_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), | ||||
|             past_key_values=None, | ||||
|         causal_attention_mask = _create_4d_causal_attention_mask( | ||||
|             input_shape, hidden_states.dtype, device=hidden_states.device | ||||
|         ) | ||||
|  | ||||
|         kwargs.pop("is_causal", None) | ||||
|         if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": | ||||
|             attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) | ||||
|  | ||||
|         encoder_outputs: BaseModelOutput = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             is_causal=True, | ||||
|             causal_attention_mask=causal_attention_mask, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
| @ -598,6 +618,7 @@ class CLIPTextModel(CLIPPreTrainedModel): | ||||
|     input_modalities = "text" | ||||
|  | ||||
|     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] | ||||
|     _supports_flash_attn = False  # mask creation only accounts for sdpa/eager | ||||
|  | ||||
|     def __init__(self, config: CLIPTextConfig): | ||||
|         super().__init__(config) | ||||
| @ -611,7 +632,8 @@ class CLIPTextModel(CLIPPreTrainedModel): | ||||
|     def set_input_embeddings(self, value): | ||||
|         self.text_model.embeddings.token_embedding = value | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
| @ -704,6 +726,7 @@ class CLIPVisionModel(CLIPPreTrainedModel): | ||||
|         return self.vision_model.embeddings.patch_embedding | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
| @ -743,6 +766,7 @@ class CLIPVisionModel(CLIPPreTrainedModel): | ||||
| class CLIPModel(CLIPPreTrainedModel): | ||||
|     config: CLIPConfig | ||||
|     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"] | ||||
|     _supports_flash_attn = False  # mask creation only accounts for sdpa/eager | ||||
|  | ||||
|     def __init__(self, config: CLIPConfig): | ||||
|         super().__init__(config) | ||||
| @ -942,6 +966,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel): | ||||
|     config: CLIPTextConfig | ||||
|     input_modalities = "text" | ||||
|  | ||||
|     _supports_flash_attn = False | ||||
|     _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"] | ||||
|  | ||||
|     def __init__(self, config: CLIPTextConfig): | ||||
| @ -961,7 +986,8 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel): | ||||
|     def set_input_embeddings(self, value): | ||||
|         self.text_model.embeddings.token_embedding = value | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
| @ -1023,6 +1049,7 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel): | ||||
|         return self.vision_model.embeddings.patch_embedding | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
| @ -1090,7 +1117,8 @@ class CLIPForImageClassification(CLIPPreTrainedModel): | ||||
|         # Initialize weights and apply final processing | ||||
|         self.post_init() | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|  | ||||
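Standalone sketch (illustrative shapes only) of the mask handling in the CLIP hunks above: for the eager/SDPA paths the padding mask and the causal mask are both additive (0 for visible positions, a large negative value for masked ones), so they are summed and broadcast, while the flash-attention path skips the 4-D masks and signals causality through `is_causal` instead.

import torch

batch, seq = 2, 4
neg = torch.finfo(torch.float32).min

causal = torch.triu(torch.full((seq, seq), neg), diagonal=1)[None, None, :, :]  # (1, 1, S, S)
padding = torch.zeros(batch, 1, 1, seq)
padding[0, :, :, -1] = neg          # pretend the last token of sample 0 is padding

combined = padding + causal         # broadcasts to (batch, 1, S, S)
print(combined.shape)               # torch.Size([2, 1, 4, 4])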
| @ -1392,7 +1392,7 @@ class Emu3Model(Emu3PreTrainedModel): | ||||
|         image_features = torch.split(image_features, split_sizes) | ||||
|         return image_features | ||||
|  | ||||
|     @torch.no_grad() | ||||
|     @torch.no_grad | ||||
|     def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int): | ||||
|         """ | ||||
|         Decodes generated image tokens from language model to continuous pixel values | ||||
|  | ||||
| @ -946,7 +946,7 @@ class Emu3Model(Emu3PreTrainedModel): | ||||
|         image_features = torch.split(image_features, split_sizes) | ||||
|         return image_features | ||||
|  | ||||
|     @torch.no_grad() | ||||
|     @torch.no_grad | ||||
|     def decode_image_tokens(self, image_tokens: torch.LongTensor, height: int, width: int): | ||||
|         """ | ||||
|         Decodes generated image tokens from language model to continuous pixel values | ||||
|  | ||||
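Behavioral note on the `@torch.no_grad()` → `@torch.no_grad` change above: in recent PyTorch releases `torch.no_grad` can be applied as a decorator with or without parentheses, so the two spellings behave the same; a minimal check:

import torch


@torch.no_grad
def double_a(x):
    return x * 2


@torch.no_grad()
def double_b(x):
    return x * 2


t = torch.ones(2, requires_grad=True)
print(double_a(t).requires_grad, double_b(t).requires_grad)  # False False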
| @ -473,9 +473,6 @@ def convert_florence2_checkpoint(hf_model_id, pytorch_dump_folder, output_hub_pa | ||||
|  | ||||
|     vision_config = convert_config(hf_config.vision_config.__dict__) | ||||
|     text_config = hf_config.text_config.__dict__ | ||||
|     if text_config.get("model_type") == "florence2_language": | ||||
|         text_config["model_type"] = "bart" | ||||
|  | ||||
|     config = Florence2Config( | ||||
|         text_config=text_config, | ||||
|         vision_config=vision_config, | ||||
|  | ||||
| @ -156,7 +156,7 @@ class Gemma3TextConfig(PreTrainedConfig): | ||||
|         layer_types: Optional[list[str]] = None, | ||||
|         final_logit_softcapping: Optional[float] = None, | ||||
|         attn_logit_softcapping: Optional[float] = None, | ||||
|         rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, | ||||
|         rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, | ||||
|         use_bidirectional_attention: Optional[bool] = False, | ||||
|         **kwargs, | ||||
|     ): | ||||
| @ -186,16 +186,10 @@ class Gemma3TextConfig(PreTrainedConfig): | ||||
|         self.final_logit_softcapping = final_logit_softcapping | ||||
|         self.attn_logit_softcapping = attn_logit_softcapping | ||||
|         self.layer_types = layer_types | ||||
|  | ||||
|         # Try to set `rope_scaling` if available, otherwise use `rope_parameters` | ||||
|         if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: | ||||
|             if rope_parameters is None: | ||||
|                 rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} | ||||
|             elif "full_attention" in rope_parameters: | ||||
|                 rope_parameters["full_attention"].update(rope_scaling) | ||||
|             else: | ||||
|                 rope_parameters.update(rope_scaling) | ||||
|  | ||||
|         rope_scaling = kwargs.pop("rope_scaling", None) | ||||
|         if rope_scaling is not None: | ||||
|             rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} | ||||
|         self.rope_parameters = rope_parameters | ||||
|         self.use_bidirectional_attention = use_bidirectional_attention | ||||
|         if use_bidirectional_attention: | ||||
|  | ||||
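Minimal sketch (hypothetical values) of the backward-compatibility mapping handled in the hunk above: a legacy flat `rope_scaling` dict is folded into the per-attention-type `rope_parameters` layout, with sliding-window layers falling back to default RoPE.

legacy_rope_scaling = {"rope_type": "linear", "factor": 8.0}

rope_parameters = {
    "sliding_attention": {"rope_type": "default"},
    "full_attention": legacy_rope_scaling,
}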
| @ -191,10 +191,7 @@ _VARIANTS = { | ||||
|             num_hidden_layers=34, | ||||
|             num_key_value_heads=4, | ||||
|             sliding_window=1024, | ||||
|             rope_parameters={ | ||||
|                 "full_attention": {"rope_type": "linear", "factor": 8.0}, | ||||
|                 "sliding_attention": {"rope_type": "default"}, | ||||
|             }, | ||||
|             rope_parameters={"rope_type": "linear", "factor": 8.0},  # used for global RoPE only | ||||
|             rope_theta=1_000_000, | ||||
|             rope_local_base_freq=10_000, | ||||
|             attn_logit_softcapping=None, | ||||
| @ -212,10 +209,7 @@ _VARIANTS = { | ||||
|             num_hidden_layers=48, | ||||
|             num_key_value_heads=8, | ||||
|             sliding_window=1024, | ||||
|             rope_parameters={ | ||||
|                 "full_attention": {"rope_type": "linear", "factor": 8.0}, | ||||
|                 "sliding_attention": {"rope_type": "default"}, | ||||
|             }, | ||||
|             rope_parameters={"rope_type": "linear", "factor": 8.0},  # used for global RoPE only | ||||
|             rope_theta=1_000_000, | ||||
|             rope_local_base_freq=10_000, | ||||
|             attn_logit_softcapping=None, | ||||
| @ -233,10 +227,7 @@ _VARIANTS = { | ||||
|             num_key_value_heads=16, | ||||
|             head_dim=128, | ||||
|             sliding_window=1024, | ||||
|             rope_parameters={ | ||||
|                 "full_attention": {"rope_type": "linear", "factor": 8.0}, | ||||
|                 "sliding_attention": {"rope_type": "default"}, | ||||
|             }, | ||||
|             rope_parameters={"rope_type": "linear", "factor": 8.0},  # used for global RoPE only | ||||
|             rope_theta=1_000_000, | ||||
|             rope_local_base_freq=10_000, | ||||
|             attn_logit_softcapping=None, | ||||
|  | ||||
| @ -171,7 +171,7 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): | ||||
|         layer_types: Optional[list[str]] = None, | ||||
|         final_logit_softcapping: Optional[float] = None, | ||||
|         attn_logit_softcapping: Optional[float] = None, | ||||
|         rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, | ||||
|         rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, | ||||
|         use_bidirectional_attention: Optional[bool] = False, | ||||
|         **kwargs, | ||||
|     ): | ||||
| @ -201,16 +201,10 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): | ||||
|         self.final_logit_softcapping = final_logit_softcapping | ||||
|         self.attn_logit_softcapping = attn_logit_softcapping | ||||
|         self.layer_types = layer_types | ||||
|  | ||||
|         # Try to set `rope_scaling` if available, otherwise use `rope_parameters` | ||||
|         if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: | ||||
|             if rope_parameters is None: | ||||
|                 rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} | ||||
|             elif "full_attention" in rope_parameters: | ||||
|                 rope_parameters["full_attention"].update(rope_scaling) | ||||
|             else: | ||||
|                 rope_parameters.update(rope_scaling) | ||||
|  | ||||
|         rope_scaling = kwargs.pop("rope_scaling", None) | ||||
|         if rope_scaling is not None: | ||||
|             rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} | ||||
|         self.rope_parameters = rope_parameters | ||||
|         self.use_bidirectional_attention = use_bidirectional_attention | ||||
|         if use_bidirectional_attention: | ||||
|  | ||||
| @ -1283,7 +1283,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): | ||||
|         decoded_image = decoded_image.permute(0, 2, 3, 1) | ||||
|         return decoded_image | ||||
|  | ||||
|     @torch.no_grad() | ||||
|     @torch.no_grad | ||||
|     def generate( | ||||
|         self, | ||||
|         inputs: Optional[torch.Tensor] = None, | ||||
|  | ||||
| @ -1099,7 +1099,7 @@ class JanusForConditionalGeneration(JanusPreTrainedModel, GenerationMixin): | ||||
|         decoded_image = decoded_image.permute(0, 2, 3, 1) | ||||
|         return decoded_image | ||||
|  | ||||
|     @torch.no_grad() | ||||
|     @torch.no_grad | ||||
|     def generate( | ||||
|         self, | ||||
|         inputs: Optional[torch.Tensor] = None, | ||||
|  | ||||
| @ -209,8 +209,8 @@ class Lfm2VlImageProcessorFast(BaseImageProcessorFast): | ||||
|     do_normalize = True | ||||
|     do_pad = True | ||||
|     return_row_col_info = False | ||||
|     image_mean = IMAGENET_STANDARD_MEAN | ||||
|     image_std = IMAGENET_STANDARD_STD | ||||
|     image_mean = IMAGENET_STANDARD_STD | ||||
|     image_std = IMAGENET_STANDARD_MEAN | ||||
|     valid_kwargs = Lfm2VlImageProcessorKwargs | ||||
|     model_input_names = ["pixel_values", "pixel_attention_mask", "spatial_shapes"] | ||||
|  | ||||
|  | ||||
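Context note on the swapped assignments above, under the assumption that the usual `transformers.image_utils` constants are in play: the "standard" ImageNet mean and std are both 0.5 per channel, so the swap does not change the normalization numerically.

from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD

print(IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD)    # [0.5, 0.5, 0.5] [0.5, 0.5, 0.5]
print(IMAGENET_STANDARD_MEAN == IMAGENET_STANDARD_STD)  # True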
| @ -12,7 +12,7 @@ import torch | ||||
| from torch import nn | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...masking_utils import create_causal_mask | ||||
| from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask | ||||
| from ...modeling_layers import GradientCheckpointingLayer | ||||
| from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput | ||||
| from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel | ||||
| @ -200,6 +200,7 @@ class MetaClip2Attention(nn.Module): | ||||
|         self, | ||||
|         hidden_states: torch.Tensor, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         causal_attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: | ||||
|         """Input shape: Batch x Time x Channel""" | ||||
| @ -213,6 +214,15 @@ class MetaClip2Attention(nn.Module): | ||||
|         queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2) | ||||
|         # METACLIP_2 text model uses both `causal_attention_mask` and `attention_mask` | ||||
|         # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` | ||||
|         if self.config._attn_implementation == "flash_attention_2": | ||||
|             self.is_causal = causal_attention_mask is not None | ||||
|         else: | ||||
|             if attention_mask is not None and causal_attention_mask is not None: | ||||
|                 attention_mask = attention_mask + causal_attention_mask | ||||
|             elif causal_attention_mask is not None: | ||||
|                 attention_mask = causal_attention_mask | ||||
|  | ||||
|         attention_interface: Callable = eager_attention_forward | ||||
|         if self.config._attn_implementation != "eager": | ||||
| @ -224,12 +234,13 @@ class MetaClip2Attention(nn.Module): | ||||
|             keys, | ||||
|             values, | ||||
|             attention_mask, | ||||
|             is_causal=self.is_causal, | ||||
|             scaling=self.scale, | ||||
|             dropout=0.0 if not self.training else self.dropout, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
|         attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous() | ||||
|         attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() | ||||
|         attn_output = self.out_proj(attn_output) | ||||
|  | ||||
|         return attn_output, attn_weights | ||||
| @ -263,14 +274,16 @@ class MetaClip2EncoderLayer(GradientCheckpointingLayer): | ||||
|         self, | ||||
|         hidden_states: torch.Tensor, | ||||
|         attention_mask: torch.Tensor, | ||||
|         causal_attention_mask: torch.Tensor, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> torch.FloatTensor: | ||||
|         residual = hidden_states | ||||
|  | ||||
|         hidden_states = self.layer_norm1(hidden_states) | ||||
|         hidden_states, _ = self.self_attn( | ||||
|         hidden_states, attn_weights = self.self_attn( | ||||
|             hidden_states=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             causal_attention_mask=causal_attention_mask, | ||||
|             **kwargs, | ||||
|         ) | ||||
|         hidden_states = residual + hidden_states | ||||
| @ -374,6 +387,7 @@ class MetaClip2Encoder(nn.Module): | ||||
|         self, | ||||
|         inputs_embeds, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         causal_attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> BaseModelOutput: | ||||
|         r""" | ||||
| @ -388,6 +402,13 @@ class MetaClip2Encoder(nn.Module): | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|  | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|             causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): | ||||
|                 Causal mask for the text model. Mask values selected in `[0, 1]`: | ||||
|  | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|  | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|         """ | ||||
|         hidden_states = inputs_embeds | ||||
| @ -395,6 +416,7 @@ class MetaClip2Encoder(nn.Module): | ||||
|             hidden_states = encoder_layer( | ||||
|                 hidden_states, | ||||
|                 attention_mask, | ||||
|                 causal_attention_mask, | ||||
|                 **kwargs, | ||||
|             ) | ||||
|  | ||||
| @ -415,12 +437,14 @@ class MetaClip2TextTransformer(nn.Module): | ||||
|         # For `pooled_output` computation | ||||
|         self.eos_token_id = config.eos_token_id | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         use_cache: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> BaseModelOutputWithPooling: | ||||
|         input_shape = input_ids.size() | ||||
| @ -428,19 +452,21 @@ class MetaClip2TextTransformer(nn.Module): | ||||
|  | ||||
|         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) | ||||
|  | ||||
|         attention_mask = create_causal_mask( | ||||
|             config=self.config, | ||||
|             input_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), | ||||
|             past_key_values=None, | ||||
|         # CLIP's text model uses causal mask, prepare it here. | ||||
|         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 | ||||
|         causal_attention_mask = _create_4d_causal_attention_mask( | ||||
|             input_shape, hidden_states.dtype, device=hidden_states.device | ||||
|         ) | ||||
|  | ||||
|         kwargs.pop("is_causal", None) | ||||
|         # expand attention_mask | ||||
|         if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": | ||||
|             # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] | ||||
|             attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) | ||||
|  | ||||
|         encoder_outputs: BaseModelOutput = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             is_causal=True, | ||||
|             causal_attention_mask=causal_attention_mask, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
| @ -501,6 +527,7 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel): | ||||
|     input_modalities = "text" | ||||
|  | ||||
|     _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] | ||||
|     _supports_flash_attn = False  # mask creation only accounts for sdpa/eager | ||||
|  | ||||
|     def __init__(self, config: MetaClip2TextConfig): | ||||
|         super().__init__(config) | ||||
| @ -514,13 +541,16 @@ class MetaClip2TextModel(MetaClip2PreTrainedModel): | ||||
|     def set_input_embeddings(self, value): | ||||
|         self.text_model.embeddings.token_embedding = value | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> BaseModelOutputWithPooling: | ||||
|         r""" | ||||
| @ -600,6 +630,7 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): | ||||
|     config: MetaClip2TextConfig | ||||
|     input_modalities = "text" | ||||
|  | ||||
|     _supports_flash_attn = False | ||||
|     _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer"] | ||||
|  | ||||
|     def __init__(self, config: MetaClip2TextConfig): | ||||
| @ -619,13 +650,16 @@ class MetaClip2TextModelWithProjection(MetaClip2PreTrainedModel): | ||||
|     def set_input_embeddings(self, value): | ||||
|         self.text_model.embeddings.token_embedding = value | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> MetaClip2TextModelOutput: | ||||
|         r""" | ||||
| @ -758,6 +792,7 @@ class MetaClip2Model(MetaClip2PreTrainedModel): | ||||
|  | ||||
|     config: MetaClip2Config | ||||
|     _no_split_modules = ["MetaClip2TextEmbeddings", "MetaClip2EncoderLayer", "MetaClip2VisionEmbeddings"] | ||||
|     _supports_flash_attn = False  # mask creation only accounts for sdpa/eager | ||||
|  | ||||
|     def __init__(self, config: MetaClip2Config): | ||||
|         super().__init__(config) | ||||
| @ -1043,7 +1078,7 @@ class MetaClip2VisionModel(MetaClip2PreTrainedModel): | ||||
|         return self.vision_model.embeddings.patch_embedding | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     @can_return_tuple | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
| @ -1152,6 +1187,7 @@ class MetaClip2VisionModelWithProjection(MetaClip2PreTrainedModel): | ||||
|         return self.vision_model.embeddings.patch_embedding | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
| @ -1218,7 +1254,8 @@ class MetaClip2ForImageClassification(MetaClip2PreTrainedModel): | ||||
|         # Initialize weights and apply final processing | ||||
|         self.post_init() | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|  | ||||
| @ -3,8 +3,9 @@ from typing import Optional | ||||
| import torch | ||||
| from torch import nn | ||||
|  | ||||
| from ...masking_utils import create_causal_mask | ||||
| from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask | ||||
| from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling | ||||
| from ...modeling_utils import PreTrainedModel | ||||
| from ...processing_utils import Unpack | ||||
| from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging | ||||
| from ...utils.generic import check_model_inputs | ||||
| @ -12,9 +13,9 @@ from ..clip.configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConf | ||||
| from ..clip.modeling_clip import ( | ||||
|     CLIPMLP, | ||||
|     CLIPAttention, | ||||
|     CLIPEncoderLayer, | ||||
|     CLIPForImageClassification, | ||||
|     CLIPModel, | ||||
|     CLIPPreTrainedModel, | ||||
|     CLIPTextEmbeddings, | ||||
|     CLIPTextModel, | ||||
|     CLIPTextModelWithProjection, | ||||
| @ -213,9 +214,24 @@ class MetaClip2MLP(CLIPMLP): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| class MetaClip2EncoderLayer(CLIPEncoderLayer): | ||||
|     pass | ||||
|  | ||||
|  | ||||
| @auto_docstring | ||||
| class MetaClip2PreTrainedModel(CLIPPreTrainedModel): | ||||
| class MetaClip2PreTrainedModel(PreTrainedModel): | ||||
|     config: MetaClip2Config | ||||
|     base_model_prefix = "metaclip_2" | ||||
|     input_modalities = ["image", "text"] | ||||
|     supports_gradient_checkpointing = True | ||||
|     _supports_sdpa = True | ||||
|     _supports_flash_attn = True | ||||
|     _supports_flex_attn = True | ||||
|     _supports_attention_backend = True | ||||
|     _can_record_outputs = { | ||||
|         "hidden_states": MetaClip2EncoderLayer, | ||||
|         "attentions": MetaClip2Attention, | ||||
|     } | ||||
|  | ||||
|     def _init_weights(self, module): | ||||
|         """Initialize the weights""" | ||||
| @ -275,12 +291,14 @@ class MetaClip2PreTrainedModel(CLIPPreTrainedModel): | ||||
|  | ||||
|  | ||||
| class MetaClip2TextTransformer(CLIPTextTransformer): | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         use_cache: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> BaseModelOutputWithPooling: | ||||
|         input_shape = input_ids.size() | ||||
| @ -288,19 +306,21 @@ class MetaClip2TextTransformer(CLIPTextTransformer): | ||||
|  | ||||
|         hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) | ||||
|  | ||||
|         attention_mask = create_causal_mask( | ||||
|             config=self.config, | ||||
|             input_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device), | ||||
|             past_key_values=None, | ||||
|         # CLIP's text model uses causal mask, prepare it here. | ||||
|         # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 | ||||
|         causal_attention_mask = _create_4d_causal_attention_mask( | ||||
|             input_shape, hidden_states.dtype, device=hidden_states.device | ||||
|         ) | ||||
|  | ||||
|         kwargs.pop("is_causal", None) | ||||
|         # expand attention_mask | ||||
|         if attention_mask is not None and self.config._attn_implementation != "flash_attention_2": | ||||
|             # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len] | ||||
|             attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) | ||||
|  | ||||
|         encoder_outputs: BaseModelOutput = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             attention_mask=attention_mask, | ||||
|             is_causal=True, | ||||
|             causal_attention_mask=causal_attention_mask, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
| @ -352,13 +372,22 @@ class MetaClip2TextModel(CLIPTextModel): | ||||
|     >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states | ||||
|     ```""" | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     def __init__(self, config: MetaClip2TextConfig): | ||||
|         super().__init__(config) | ||||
|         self.text_model = MetaClip2TextTransformer(config) | ||||
|         # Initialize weights and apply final processing | ||||
|         self.post_init() | ||||
|  | ||||
|     @check_model_inputs() | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ): | ||||
|         r""" | ||||
| @ -380,6 +409,8 @@ class MetaClip2TextModel(CLIPTextModel): | ||||
|             input_ids=input_ids, | ||||
|             attention_mask=attention_mask, | ||||
|             position_ids=position_ids, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
| @ -415,13 +446,24 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection): | ||||
|     >>> text_embeds = outputs.text_embeds | ||||
|     ```""" | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def __init__(self, config: MetaClip2TextConfig): | ||||
|         super().__init__(config) | ||||
|  | ||||
|         text_model = MetaClip2TextModel._from_config(config) | ||||
|         self.text_model = text_model.text_model | ||||
|  | ||||
|         self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False) | ||||
|  | ||||
|         # Initialize weights and apply final processing | ||||
|         self.post_init() | ||||
|  | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.Tensor] = None, | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         position_ids: Optional[torch.Tensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ): | ||||
|         r""" | ||||
| @ -442,6 +484,8 @@ class MetaClip2TextModelWithProjection(CLIPTextModelWithProjection): | ||||
|             input_ids=input_ids, | ||||
|             attention_mask=attention_mask, | ||||
|             position_ids=position_ids, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
| @ -506,8 +550,6 @@ class MetaClip2Model(CLIPModel): | ||||
|         # Initialize weights and apply final processing | ||||
|         self.post_init() | ||||
|  | ||||
|     @can_return_tuple | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         input_ids: Optional[torch.LongTensor] = None, | ||||
| @ -652,7 +694,7 @@ class MetaClip2VisionModel(CLIPVisionModel): | ||||
|     ```""" | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     @can_return_tuple | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
| @ -722,8 +764,6 @@ class MetaClip2VisionModelWithProjection(CLIPVisionModelWithProjection): | ||||
|     >>> image_embeds = outputs.image_embeds | ||||
|     ```""" | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|  | ||||
| @ -25,12 +25,12 @@ import torch | ||||
| import torch.nn as nn | ||||
|  | ||||
| from ...activations import ACT2FN | ||||
| from ...modeling_flash_attention_utils import FlashAttentionKwargs | ||||
| from ...modeling_layers import GradientCheckpointingLayer | ||||
| from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling | ||||
| from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel | ||||
| from ...processing_utils import Unpack | ||||
| from ...utils import TransformersKwargs, auto_docstring, torch_int | ||||
| from ...utils.generic import check_model_inputs | ||||
| from .configuration_mlcd import MLCDVisionConfig | ||||
|  | ||||
|  | ||||
| @ -259,7 +259,7 @@ class MLCDAttention(nn.Module): | ||||
|         hidden_states: torch.Tensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         **kwargs: Unpack[FlashAttentionKwargs], | ||||
|     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: | ||||
|         """Input shape: Batch x Time x Channel""" | ||||
|         batch_size, seq_length = hidden_states.shape[:-1] | ||||
| @ -316,7 +316,7 @@ class MLCDEncoderLayer(GradientCheckpointingLayer): | ||||
|         hidden_states: torch.Tensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = False, | ||||
|     ) -> tuple[torch.FloatTensor]: | ||||
|         """ | ||||
|         Args: | ||||
| @ -328,15 +328,18 @@ class MLCDEncoderLayer(GradientCheckpointingLayer): | ||||
|                 Represents absolute positional embeddings for the query and key in the attention mechanism. | ||||
|             attention_mask (`torch.FloatTensor`): | ||||
|                 Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. | ||||
|             output_attentions (`bool`, *optional*, defaults to `False`): | ||||
|                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under | ||||
|                 returned tensors for more detail. | ||||
|         """ | ||||
|         residual = hidden_states | ||||
|  | ||||
|         hidden_states = self.layer_norm1(hidden_states) | ||||
|         hidden_states, _ = self.self_attn( | ||||
|         hidden_states, attn_weights = self.self_attn( | ||||
|             hidden_states=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             attention_mask=attention_mask, | ||||
|             **kwargs, | ||||
|             output_attentions=output_attentions, | ||||
|         ) | ||||
|         hidden_states = residual + hidden_states | ||||
|  | ||||
| @ -345,7 +348,12 @@ class MLCDEncoderLayer(GradientCheckpointingLayer): | ||||
|         hidden_states = self.mlp(hidden_states) | ||||
|         hidden_states = residual + hidden_states | ||||
|  | ||||
|         return hidden_states | ||||
|         outputs = (hidden_states,) | ||||
|  | ||||
|         if output_attentions: | ||||
|             outputs += (attn_weights,) | ||||
|  | ||||
|         return outputs | ||||
|  | ||||
|  | ||||
| class MLCDEncoder(nn.Module): | ||||
| @ -369,7 +377,9 @@ class MLCDEncoder(nn.Module): | ||||
|         inputs_embeds: torch.FloatTensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutput]: | ||||
|         r""" | ||||
|         Args: | ||||
| @ -385,18 +395,114 @@ class MLCDEncoder(nn.Module): | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|             output_attentions (`bool`, *optional*): | ||||
|                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under | ||||
|                 returned tensors for more detail. | ||||
|             output_hidden_states (`bool`, *optional*): | ||||
|                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors | ||||
|                 for more detail. | ||||
|             return_dict (`bool`, *optional*): | ||||
|                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. | ||||
|         """ | ||||
|  | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         encoder_states = () if output_hidden_states else None | ||||
|         all_attentions = () if output_attentions else None | ||||
|  | ||||
|         hidden_states = inputs_embeds | ||||
|         for encoder_layer in self.layers: | ||||
|             hidden_states = encoder_layer( | ||||
|                 hidden_states, | ||||
|                 position_embeddings, | ||||
|                 attention_mask, | ||||
|                 **kwargs, | ||||
|         for idx, encoder_layer in enumerate(self.layers): | ||||
|             if output_hidden_states: | ||||
|                 encoder_states = encoder_states + (hidden_states,) | ||||
|             layer_outputs = encoder_layer( | ||||
|                 hidden_states=hidden_states, | ||||
|                 position_embeddings=position_embeddings, | ||||
|                 attention_mask=attention_mask, | ||||
|                 output_attentions=output_attentions, | ||||
|             ) | ||||
|  | ||||
|             hidden_states = layer_outputs[0] | ||||
|  | ||||
|             if output_attentions: | ||||
|                 all_attentions = all_attentions + (layer_outputs[1],) | ||||
|  | ||||
|         if output_hidden_states: | ||||
|             encoder_states = encoder_states + (hidden_states,) | ||||
|  | ||||
|         if not return_dict: | ||||
|             return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) | ||||
|         return BaseModelOutput( | ||||
|             last_hidden_state=hidden_states, | ||||
|             hidden_states=encoder_states, | ||||
|             attentions=all_attentions, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class MLCDVisionTransformer(nn.Module): | ||||
|     def __init__(self, config: MLCDVisionConfig): | ||||
|         super().__init__() | ||||
|         self.config = config | ||||
|         embed_dim = config.hidden_size | ||||
|  | ||||
|         self.embeddings = MLCDVisionEmbeddings(config) | ||||
|         self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) | ||||
|         self.encoder = MLCDEncoder(config) | ||||
|         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) | ||||
|         self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) | ||||
|         self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) | ||||
|  | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         if pixel_values is None: | ||||
|             raise ValueError("You have to specify pixel_values") | ||||
|  | ||||
|         num_patches_height = pixel_values.shape[-2] // self.config.patch_size | ||||
|         num_patches_width = pixel_values.shape[-1] // self.config.patch_size | ||||
|         rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) | ||||
|         rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) | ||||
|         rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) | ||||
|         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) | ||||
|         position_embeddings = (emb.cos(), emb.sin()) | ||||
|  | ||||
|         hidden_states = self.embeddings(pixel_values) | ||||
|         hidden_states = self.pre_layrnorm(hidden_states) | ||||
|  | ||||
|         encoder_outputs = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             return_dict=return_dict, | ||||
|         ) | ||||
|  | ||||
|         last_hidden_state = encoder_outputs[0] | ||||
|         pooled_output = last_hidden_state[:, 0, :] | ||||
|         pooled_output = self.post_layernorm(pooled_output) | ||||
|  | ||||
|         if not return_dict: | ||||
|             return (last_hidden_state, pooled_output) + encoder_outputs[1:] | ||||
|  | ||||
|         return BaseModelOutputWithPooling( | ||||
|             last_hidden_state=last_hidden_state, | ||||
|             pooler_output=pooled_output, | ||||
|             hidden_states=encoder_outputs.hidden_states, | ||||
|             attentions=encoder_outputs.attentions, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| @ -405,15 +511,8 @@ class MLCDPreTrainedModel(PreTrainedModel): | ||||
|     config: MLCDVisionConfig | ||||
|     base_model_prefix = "mlcd" | ||||
|     supports_gradient_checkpointing = True | ||||
|     accepts_loss_kwargs = False | ||||
|     _supports_flash_attn = True | ||||
|     _supports_sdpa = True | ||||
|     _supports_flex_attn = True | ||||
|     _supports_attention_backend = True | ||||
|     _can_record_outputs = { | ||||
|         "hidden_states": MLCDEncoderLayer, | ||||
|         "attentions": MLCDAttention, | ||||
|     } | ||||
|  | ||||
|     def _init_weights(self, module): | ||||
|         """Initialize the weights""" | ||||
| @ -447,55 +546,6 @@ class MLCDPreTrainedModel(PreTrainedModel): | ||||
|             module.bias.data.zero_() | ||||
|  | ||||
|  | ||||
| class MLCDVisionTransformer(nn.Module): | ||||
|     def __init__(self, config: MLCDVisionConfig): | ||||
|         super().__init__() | ||||
|         self.config = config | ||||
|         embed_dim = config.hidden_size | ||||
|  | ||||
|         self.embeddings = MLCDVisionEmbeddings(config) | ||||
|         self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) | ||||
|         self.encoder = MLCDEncoder(config) | ||||
|         self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) | ||||
|         self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) | ||||
|         self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) | ||||
|  | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         if pixel_values is None: | ||||
|             raise ValueError("You have to specify pixel_values") | ||||
|  | ||||
|         num_patches_height = pixel_values.shape[-2] // self.config.patch_size | ||||
|         num_patches_width = pixel_values.shape[-1] // self.config.patch_size | ||||
|         rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) | ||||
|         rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) | ||||
|         rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) | ||||
|         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) | ||||
|         position_embeddings = (emb.cos(), emb.sin()) | ||||
|  | ||||
|         hidden_states = self.embeddings(pixel_values) | ||||
|         hidden_states = self.pre_layrnorm(hidden_states) | ||||
|  | ||||
|         encoder_outputs = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
|         last_hidden_state = encoder_outputs[0] | ||||
|         pooled_output = last_hidden_state[:, 0, :] | ||||
|         pooled_output = self.post_layernorm(pooled_output) | ||||
|  | ||||
|         return BaseModelOutputWithPooling( | ||||
|             last_hidden_state=last_hidden_state, | ||||
|             pooler_output=pooled_output, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| @auto_docstring( | ||||
|     custom_intro=""" | ||||
|     The vision model from M_L_C_D without any head or projection on top. | ||||
| @ -516,12 +566,13 @@ class MLCDVisionModel(MLCDPreTrainedModel): | ||||
|     def get_input_embeddings(self) -> nn.Module: | ||||
|         return self.vision_model.embeddings.patch_embedding | ||||
|  | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         r""" | ||||
|         Example: | ||||
| @ -545,9 +596,17 @@ class MLCDVisionModel(MLCDPreTrainedModel): | ||||
|         >>> print(f"Number of attention layers: {len(outputs.attentions)}") | ||||
|         >>> print(f"Attention shape: {outputs.attentions[0].shape}") | ||||
|         ```""" | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         return self.vision_model( | ||||
|             pixel_values=pixel_values, | ||||
|             **kwargs, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             return_dict=return_dict, | ||||
|         ) | ||||
|  | ||||
|  | ||||
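Note: both versions of `MLCDVisionTransformer.forward` above assemble the rotary position embeddings the same way: a learnable class-token row is prepended to the per-patch rotary table, then the half-sized table is duplicated so the `(cos, sin)` pair covers the full head dimension. A self-contained sketch with made-up sizes (not taken from this diff):

```python
import torch

num_patches, half_dim = 4, 8                          # hypothetical: head_dim // 2 == 8
class_pos_emb = torch.randn(1, half_dim)              # learnable slot for the class token
rotary_pos_emb = torch.randn(num_patches, half_dim)   # stand-in for MLCDRotaryEmbedding's output

rotary_pos_emb = torch.cat([class_pos_emb, rotary_pos_emb], dim=0)  # prepend the class-token row
emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)           # duplicate to the full head dim
position_embeddings = (emb.cos(), emb.sin())

print(position_embeddings[0].shape)  # torch.Size([5, 16])
```
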
|  | ||||
| @ -19,11 +19,11 @@ import torch | ||||
| import torch.nn as nn | ||||
|  | ||||
| from ...configuration_utils import PreTrainedConfig | ||||
| from ...modeling_flash_attention_utils import FlashAttentionKwargs | ||||
| from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling | ||||
| from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel | ||||
| from ...processing_utils import Unpack | ||||
| from ...utils import TransformersKwargs, auto_docstring, logging | ||||
| from ...utils.generic import check_model_inputs | ||||
| from ...utils import auto_docstring, logging | ||||
| from ..clip.modeling_clip import ( | ||||
|     CLIPMLP, | ||||
|     CLIPAttention, | ||||
| @ -206,7 +206,7 @@ class MLCDAttention(CLIPAttention): | ||||
|         hidden_states: torch.Tensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         **kwargs: Unpack[FlashAttentionKwargs], | ||||
|     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]: | ||||
|         batch_size, seq_length = hidden_states.shape[:-1] | ||||
|  | ||||
| @ -258,7 +258,7 @@ class MLCDEncoderLayer(CLIPEncoderLayer): | ||||
|         hidden_states: torch.Tensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = False, | ||||
|     ) -> tuple[torch.FloatTensor]: | ||||
|         """ | ||||
|         Args: | ||||
| @ -270,15 +270,18 @@ class MLCDEncoderLayer(CLIPEncoderLayer): | ||||
|                 Represents absolute positional embeddings for the query and key in the attention mechanism. | ||||
|             attention_mask (`torch.FloatTensor`): | ||||
|                 Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. | ||||
|             output_attentions (`bool`, *optional*, defaults to `False`): | ||||
|                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under | ||||
|                 returned tensors for more detail. | ||||
|         """ | ||||
|         residual = hidden_states | ||||
|  | ||||
|         hidden_states = self.layer_norm1(hidden_states) | ||||
|         hidden_states, _ = self.self_attn( | ||||
|         hidden_states, attn_weights = self.self_attn( | ||||
|             hidden_states=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             attention_mask=attention_mask, | ||||
|             **kwargs, | ||||
|             output_attentions=output_attentions, | ||||
|         ) | ||||
|         hidden_states = residual + hidden_states | ||||
|  | ||||
| @ -287,7 +290,12 @@ class MLCDEncoderLayer(CLIPEncoderLayer): | ||||
|         hidden_states = self.mlp(hidden_states) | ||||
|         hidden_states = residual + hidden_states | ||||
|  | ||||
|         return hidden_states | ||||
|         outputs = (hidden_states,) | ||||
|  | ||||
|         if output_attentions: | ||||
|             outputs += (attn_weights,) | ||||
|  | ||||
|         return outputs | ||||
|  | ||||
|  | ||||
| class MLCDEncoder(CLIPEncoder): | ||||
| @ -308,7 +316,9 @@ class MLCDEncoder(CLIPEncoder): | ||||
|         inputs_embeds: torch.FloatTensor, | ||||
|         position_embeddings: tuple[torch.Tensor, torch.Tensor], | ||||
|         attention_mask: Optional[torch.Tensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutput]: | ||||
|         r""" | ||||
|         Args: | ||||
| @ -324,18 +334,107 @@ class MLCDEncoder(CLIPEncoder): | ||||
|                 - 1 for tokens that are **not masked**, | ||||
|                 - 0 for tokens that are **masked**. | ||||
|                 [What are attention masks?](../glossary#attention-mask) | ||||
|             output_attentions (`bool`, *optional*): | ||||
|                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under | ||||
|                 returned tensors for more detail. | ||||
|             output_hidden_states (`bool`, *optional*): | ||||
|                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors | ||||
|                 for more detail. | ||||
|             return_dict (`bool`, *optional*): | ||||
|                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. | ||||
|         """ | ||||
|  | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         encoder_states = () if output_hidden_states else None | ||||
|         all_attentions = () if output_attentions else None | ||||
|  | ||||
|         hidden_states = inputs_embeds | ||||
|         for encoder_layer in self.layers: | ||||
|             hidden_states = encoder_layer( | ||||
|                 hidden_states, | ||||
|                 position_embeddings, | ||||
|                 attention_mask, | ||||
|                 **kwargs, | ||||
|         for idx, encoder_layer in enumerate(self.layers): | ||||
|             if output_hidden_states: | ||||
|                 encoder_states = encoder_states + (hidden_states,) | ||||
|             layer_outputs = encoder_layer( | ||||
|                 hidden_states=hidden_states, | ||||
|                 position_embeddings=position_embeddings, | ||||
|                 attention_mask=attention_mask, | ||||
|                 output_attentions=output_attentions, | ||||
|             ) | ||||
|  | ||||
|             hidden_states = layer_outputs[0] | ||||
|  | ||||
|             if output_attentions: | ||||
|                 all_attentions = all_attentions + (layer_outputs[1],) | ||||
|  | ||||
|         if output_hidden_states: | ||||
|             encoder_states = encoder_states + (hidden_states,) | ||||
|  | ||||
|         if not return_dict: | ||||
|             return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) | ||||
|         return BaseModelOutput( | ||||
|             last_hidden_state=hidden_states, | ||||
|             hidden_states=encoder_states, | ||||
|             attentions=all_attentions, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class MLCDVisionTransformer(CLIPVisionTransformer): | ||||
|     def __init__(self, config: MLCDVisionConfig): | ||||
|         super().__init__(config) | ||||
|         self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) | ||||
|         self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) | ||||
|  | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         if pixel_values is None: | ||||
|             raise ValueError("You have to specify pixel_values") | ||||
|  | ||||
|         num_patches_height = pixel_values.shape[-2] // self.config.patch_size | ||||
|         num_patches_width = pixel_values.shape[-1] // self.config.patch_size | ||||
|         rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) | ||||
|         rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) | ||||
|         rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) | ||||
|         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) | ||||
|         position_embeddings = (emb.cos(), emb.sin()) | ||||
|  | ||||
|         hidden_states = self.embeddings(pixel_values) | ||||
|         hidden_states = self.pre_layrnorm(hidden_states) | ||||
|  | ||||
|         encoder_outputs = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             return_dict=return_dict, | ||||
|         ) | ||||
|  | ||||
|         last_hidden_state = encoder_outputs[0] | ||||
|         pooled_output = last_hidden_state[:, 0, :] | ||||
|         pooled_output = self.post_layernorm(pooled_output) | ||||
|  | ||||
|         if not return_dict: | ||||
|             return (last_hidden_state, pooled_output) + encoder_outputs[1:] | ||||
|  | ||||
|         return BaseModelOutputWithPooling( | ||||
|             last_hidden_state=last_hidden_state, | ||||
|             pooler_output=pooled_output, | ||||
|             hidden_states=encoder_outputs.hidden_states, | ||||
|             attentions=encoder_outputs.attentions, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| @ -344,15 +443,8 @@ class MLCDPreTrainedModel(PreTrainedModel): | ||||
|     config: MLCDVisionConfig | ||||
|     base_model_prefix = "mlcd" | ||||
|     supports_gradient_checkpointing = True | ||||
|     accepts_loss_kwargs = False | ||||
|     _supports_flash_attn = True | ||||
|     _supports_sdpa = True | ||||
|     _supports_flex_attn = True | ||||
|     _supports_attention_backend = True | ||||
|     _can_record_outputs = { | ||||
|         "hidden_states": MLCDEncoderLayer, | ||||
|         "attentions": MLCDAttention, | ||||
|     } | ||||
|  | ||||
|     def _init_weights(self, module): | ||||
|         """Initialize the weights""" | ||||
| @ -386,55 +478,14 @@ class MLCDPreTrainedModel(PreTrainedModel): | ||||
|             module.bias.data.zero_() | ||||
|  | ||||
|  | ||||
| class MLCDVisionTransformer(CLIPVisionTransformer): | ||||
|     def __init__(self, config: MLCDVisionConfig): | ||||
|         super().__init__(config) | ||||
|         self.vision_rotary_embedding = MLCDRotaryEmbedding(config.hidden_size // config.num_attention_heads // 2) | ||||
|         self.class_pos_emb = nn.Parameter(torch.randn(1, config.hidden_size // config.num_attention_heads // 2)) | ||||
|  | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         if pixel_values is None: | ||||
|             raise ValueError("You have to specify pixel_values") | ||||
|  | ||||
|         num_patches_height = pixel_values.shape[-2] // self.config.patch_size | ||||
|         num_patches_width = pixel_values.shape[-1] // self.config.patch_size | ||||
|         rotary_pos_emb = self.vision_rotary_embedding(num_patches_height, num_patches_width) | ||||
|         rotary_pos_emb = rotary_pos_emb.to(self.class_pos_emb.device) | ||||
|         rotary_pos_emb = torch.cat([self.class_pos_emb, rotary_pos_emb], dim=0) | ||||
|         emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) | ||||
|         position_embeddings = (emb.cos(), emb.sin()) | ||||
|  | ||||
|         hidden_states = self.embeddings(pixel_values) | ||||
|         hidden_states = self.pre_layrnorm(hidden_states) | ||||
|  | ||||
|         encoder_outputs = self.encoder( | ||||
|             inputs_embeds=hidden_states, | ||||
|             position_embeddings=position_embeddings, | ||||
|             **kwargs, | ||||
|         ) | ||||
|  | ||||
|         last_hidden_state = encoder_outputs[0] | ||||
|         pooled_output = last_hidden_state[:, 0, :] | ||||
|         pooled_output = self.post_layernorm(pooled_output) | ||||
|  | ||||
|         return BaseModelOutputWithPooling( | ||||
|             last_hidden_state=last_hidden_state, | ||||
|             pooler_output=pooled_output, | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class MLCDVisionModel(CLIPVisionModel): | ||||
|     @check_model_inputs(tie_last_hidden_states=False) | ||||
|     @auto_docstring | ||||
|     def forward( | ||||
|         self, | ||||
|         pixel_values: Optional[torch.FloatTensor] = None, | ||||
|         **kwargs: Unpack[TransformersKwargs], | ||||
|         output_attentions: Optional[bool] = None, | ||||
|         output_hidden_states: Optional[bool] = None, | ||||
|         return_dict: Optional[bool] = None, | ||||
|     ) -> Union[tuple, BaseModelOutputWithPooling]: | ||||
|         r""" | ||||
|         Example: | ||||
| @ -458,9 +509,17 @@ class MLCDVisionModel(CLIPVisionModel): | ||||
|         >>> print(f"Number of attention layers: {len(outputs.attentions)}") | ||||
|         >>> print(f"Attention shape: {outputs.attentions[0].shape}") | ||||
|         ```""" | ||||
|         output_hidden_states = ( | ||||
|             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states | ||||
|         ) | ||||
|         return_dict = return_dict if return_dict is not None else self.config.use_return_dict | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|  | ||||
|         return self.vision_model( | ||||
|             pixel_values=pixel_values, | ||||
|             **kwargs, | ||||
|             output_attentions=output_attentions, | ||||
|             output_hidden_states=output_hidden_states, | ||||
|             return_dict=return_dict, | ||||
|         ) | ||||
|  | ||||
|  | ||||
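Note: whichever interface the MLCD diff lands on — explicit `output_attentions`/`output_hidden_states`/`return_dict` flags or `**kwargs` routed through `check_model_inputs` — the caller-side usage looks roughly the same. A hedged sketch; the checkpoint name is an assumption for illustration, not taken from this diff:

```python
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, MLCDVisionModel

model_id = "DeepGlint-AI/mlcd-vit-bigG-patch14-448"   # assumed MLCD vision checkpoint
model = MLCDVisionModel.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

print(len(outputs.hidden_states))   # one entry per recorded encoder state
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)
```
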
|  | ||||
| @ -343,12 +343,12 @@ class PLBartEncoder(PLBartPreTrainedModel): | ||||
|         self.max_source_positions = config.max_position_embeddings | ||||
|         embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 | ||||
|  | ||||
|         self.embed_tokens = PLBartScaledWordEmbedding( | ||||
|             config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale | ||||
|         ) | ||||
|  | ||||
|         if embed_tokens is not None: | ||||
|             self.embed_tokens = embed_tokens | ||||
|         else: | ||||
|             self.embed_tokens = PLBartScaledWordEmbedding( | ||||
|                 config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale | ||||
|             ) | ||||
|             self.embed_tokens.weight = embed_tokens.weight | ||||
|  | ||||
|         self.embed_positions = PLBartLearnedPositionalEmbedding( | ||||
|             config.max_position_embeddings, | ||||
| @ -595,12 +595,12 @@ class PLBartDecoder(PLBartPreTrainedModel): | ||||
|         self.max_target_positions = config.max_position_embeddings | ||||
|         embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 | ||||
|  | ||||
|         self.embed_tokens = PLBartScaledWordEmbedding( | ||||
|             config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale | ||||
|         ) | ||||
|  | ||||
|         if embed_tokens is not None: | ||||
|             self.embed_tokens = embed_tokens | ||||
|         else: | ||||
|             self.embed_tokens = PLBartScaledWordEmbedding( | ||||
|                 config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale | ||||
|             ) | ||||
|             self.embed_tokens.weight = embed_tokens.weight | ||||
|  | ||||
|         self.embed_positions = PLBartLearnedPositionalEmbedding( | ||||
|             config.max_position_embeddings, | ||||
|  | ||||
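Note: the PLBart hunks above switch between two ways of sharing the caller-supplied `embed_tokens`: reusing the module object directly, or building a fresh `PLBartScaledWordEmbedding` and tying its `weight` to the shared parameter. A minimal sketch of the difference with plain `nn.Embedding` (illustrative only):

```python
import torch.nn as nn

shared = nn.Embedding(10, 4, padding_idx=1)

# Option A: reuse the module object itself.
encoder_embed_tokens = shared

# Option B: build a new module and tie its weight Parameter to the shared one;
# both modules now reference the same tensor, so updates stay in sync.
decoder_embed_tokens = nn.Embedding(10, 4, padding_idx=1)
decoder_embed_tokens.weight = shared.weight

assert decoder_embed_tokens.weight.data_ptr() == shared.weight.data_ptr()
```
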
| @ -1453,6 +1453,8 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi | ||||
|         Example: | ||||
|  | ||||
|         ```python | ||||
|         >>> from PIL import Image | ||||
|         >>> import requests | ||||
|         >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | ||||
|  | ||||
|         >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") | ||||
| @ -1462,30 +1464,22 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2_5_VLPreTrainedModel, GenerationMi | ||||
|             { | ||||
|                 "role": "user", | ||||
|                 "content": [ | ||||
|                     { | ||||
|                         "type": "image", | ||||
|                         "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | ||||
|                     }, | ||||
|                     {"type": "text", "text": "Describe the image."}, | ||||
|                     {"type": "image"}, | ||||
|                     {"type": "text", "text": "What is shown in this image?"}, | ||||
|                 ], | ||||
|             } | ||||
|             }, | ||||
|         ] | ||||
|         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" | ||||
|         >>> image = Image.open(requests.get(url, stream=True).raw) | ||||
|  | ||||
|         >>> inputs = processor.apply_chat_template( | ||||
|             messages, | ||||
|             tokenize=True, | ||||
|             add_generation_prompt=True, | ||||
|             return_dict=True, | ||||
|             return_tensors="pt" | ||||
|         ) | ||||
|         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | ||||
|         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) | ||||
|  | ||||
|         >>> # Generate | ||||
|         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) | ||||
|         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] | ||||
|         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         >>> print(output_text) | ||||
|         ``` | ||||
|         """ | ||||
|         >>> generate_ids = model.generate(inputs.input_ids, max_length=30) | ||||
|         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." | ||||
|         ```""" | ||||
|  | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|         output_hidden_states = ( | ||||
|  | ||||
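Note: of the two docstring variants swapped above, the one built around `apply_chat_template(..., return_dict=True)` trims the prompt ids off each generated sequence before decoding. That slicing idiom in isolation:

```python
prompt_ids = [[1, 2, 3], [1, 2]]
generated = [[1, 2, 3, 9, 8], [1, 2, 7, 6, 5]]   # generate() output starts with the prompt ids

trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(prompt_ids, generated)]
print(trimmed)  # [[9, 8], [7, 6, 5]]
```
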
| @ -684,6 +684,8 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration): | ||||
|         Example: | ||||
|  | ||||
|         ```python | ||||
|         >>> from PIL import Image | ||||
|         >>> import requests | ||||
|         >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration | ||||
|  | ||||
|         >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") | ||||
| @ -693,30 +695,22 @@ class Qwen2_5_VLForConditionalGeneration(Qwen2VLForConditionalGeneration): | ||||
|             { | ||||
|                 "role": "user", | ||||
|                 "content": [ | ||||
|                     { | ||||
|                         "type": "image", | ||||
|                         "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | ||||
|                     }, | ||||
|                     {"type": "text", "text": "Describe the image."}, | ||||
|                     {"type": "image"}, | ||||
|                     {"type": "text", "text": "What is shown in this image?"}, | ||||
|                 ], | ||||
|             } | ||||
|             }, | ||||
|         ] | ||||
|         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" | ||||
|         >>> image = Image.open(requests.get(url, stream=True).raw) | ||||
|  | ||||
|         >>> inputs = processor.apply_chat_template( | ||||
|             messages, | ||||
|             tokenize=True, | ||||
|             add_generation_prompt=True, | ||||
|             return_dict=True, | ||||
|             return_tensors="pt" | ||||
|         ) | ||||
|         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | ||||
|         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) | ||||
|  | ||||
|         >>> # Generate | ||||
|         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) | ||||
|         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] | ||||
|         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         >>> print(output_text) | ||||
|         ``` | ||||
|         """ | ||||
|         >>> generate_ids = model.generate(inputs.input_ids, max_length=30) | ||||
|         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." | ||||
|         ```""" | ||||
|  | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|         output_hidden_states = ( | ||||
|  | ||||
| @ -25,7 +25,6 @@ from torch import nn | ||||
| from ...activations import ACT2FN | ||||
| from ...cache_utils import Cache | ||||
| from ...generation import GenerationMixin | ||||
| from ...masking_utils import create_bidirectional_mask | ||||
| from ...modeling_layers import GradientCheckpointingLayer | ||||
| from ...modeling_outputs import BaseModelOutput, ModelOutput | ||||
| from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel | ||||
| @ -775,19 +774,14 @@ class Qwen2AudioForConditionalGeneration(Qwen2AudioPreTrainedModel, GenerationMi | ||||
|                 lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len) | ||||
|                 # Create mask | ||||
|                 padding_mask = seq_range >= lengths_expand | ||||
|                 audio_attention_mask_2d = (~padding_mask).to(dtype=torch.long, device=audio_feat_lengths.device) | ||||
|  | ||||
|                 dummy_embeds = torch.zeros( | ||||
|                     (batch_size, max_seq_len, 1), | ||||
|                     dtype=inputs_embeds.dtype, | ||||
|                     device=inputs_embeds.device, | ||||
|                 audio_attention_mask_ = padding_mask.view(batch_size, 1, 1, max_seq_len).expand( | ||||
|                     batch_size, 1, max_seq_len, max_seq_len | ||||
|                 ) | ||||
|  | ||||
|                 audio_attention_mask = create_bidirectional_mask( | ||||
|                     config=self.audio_tower.config, | ||||
|                     input_embeds=dummy_embeds, | ||||
|                     attention_mask=audio_attention_mask_2d, | ||||
|                 audio_attention_mask = audio_attention_mask_.to( | ||||
|                     dtype=self.audio_tower.conv1.weight.dtype, device=self.audio_tower.conv1.weight.device | ||||
|                 ) | ||||
|                 audio_attention_mask[audio_attention_mask_] = float("-inf") | ||||
|  | ||||
|                 audio_outputs = self.audio_tower(input_features, attention_mask=audio_attention_mask) | ||||
|                 selected_audio_feature = audio_outputs.last_hidden_state | ||||
|  | ||||
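Note: both branches of the Qwen2Audio hunk above derive the audio-tower mask from per-sample feature lengths; the handwritten branch expands a 2D padding mask into an additive 4D mask whose padded key positions become `-inf`. A self-contained sketch of that construction (sizes made up, dtype/device handling omitted):

```python
import torch

batch_size, max_seq_len = 2, 5
audio_feat_lengths = torch.tensor([5, 3])             # valid feature lengths per sample

seq_range = torch.arange(max_seq_len).unsqueeze(0).expand(batch_size, max_seq_len)
lengths_expand = audio_feat_lengths.unsqueeze(1).expand(batch_size, max_seq_len)
padding_mask = seq_range >= lengths_expand            # True on padded positions

mask_4d = padding_mask.view(batch_size, 1, 1, max_seq_len).expand(
    batch_size, 1, max_seq_len, max_seq_len
)
additive_mask = mask_4d.to(torch.float32).masked_fill(mask_4d, float("-inf"))

print(additive_mask[1, 0, 0])  # tensor([0., 0., 0., -inf, -inf])
```
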
| @ -1348,6 +1348,8 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): | ||||
|         Example: | ||||
|  | ||||
|         ```python | ||||
|         >>> from PIL import Image | ||||
|         >>> import requests | ||||
|         >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration | ||||
|  | ||||
|         >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct") | ||||
| @ -1357,30 +1359,22 @@ class Qwen2VLForConditionalGeneration(Qwen2VLPreTrainedModel, GenerationMixin): | ||||
|             { | ||||
|                 "role": "user", | ||||
|                 "content": [ | ||||
|                     { | ||||
|                         "type": "image", | ||||
|                         "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | ||||
|                     }, | ||||
|                     {"type": "text", "text": "Describe the image."}, | ||||
|                     {"type": "image"}, | ||||
|                     {"type": "text", "text": "What is shown in this image?"}, | ||||
|                 ], | ||||
|             } | ||||
|             }, | ||||
|         ] | ||||
|         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" | ||||
|         >>> image = Image.open(requests.get(url, stream=True).raw) | ||||
|  | ||||
|         >>> inputs = processor.apply_chat_template( | ||||
|             messages, | ||||
|             tokenize=True, | ||||
|             add_generation_prompt=True, | ||||
|             return_dict=True, | ||||
|             return_tensors="pt" | ||||
|         ) | ||||
|         >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) | ||||
|         >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos]) | ||||
|  | ||||
|         >>> # Generate | ||||
|         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) | ||||
|         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] | ||||
|         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         >>> print(output_text) | ||||
|         ``` | ||||
|         """ | ||||
|         >>> generate_ids = model.generate(inputs.input_ids, max_length=30) | ||||
|         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..." | ||||
|         ```""" | ||||
|  | ||||
|         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions | ||||
|         output_hidden_states = ( | ||||
|  | ||||
| @ -1369,42 +1369,8 @@ class Qwen3VLForConditionalGeneration(Qwen3VLPreTrainedModel, GenerationMixin): | ||||
|             The temporal, height and width of feature shape of each video in LLM. | ||||
|  | ||||
|         Example: | ||||
|  | ||||
|         ```python | ||||
|         >>> from transformers import AutoProcessor, Qwen3VLForConditionalGeneration | ||||
|  | ||||
|         >>> model = Qwen3VLForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") | ||||
|         >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") | ||||
|  | ||||
|         >>> messages = [ | ||||
|             { | ||||
|                 "role": "user", | ||||
|                 "content": [ | ||||
|                     { | ||||
|                         "type": "image", | ||||
|                         "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | ||||
|                     }, | ||||
|                     {"type": "text", "text": "Describe the image."}, | ||||
|                 ], | ||||
|             } | ||||
|         ] | ||||
|  | ||||
|         >>> inputs = processor.apply_chat_template( | ||||
|             messages, | ||||
|             tokenize=True, | ||||
|             add_generation_prompt=True, | ||||
|             return_dict=True, | ||||
|             return_tensors="pt" | ||||
|         ) | ||||
|  | ||||
|         >>> # Generate | ||||
|         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) | ||||
|         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] | ||||
|         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         >>> print(output_text) | ||||
|         ``` | ||||
|             TODO: Add example | ||||
|         """ | ||||
|  | ||||
|         outputs = self.model( | ||||
|             input_ids=input_ids, | ||||
|             pixel_values=pixel_values, | ||||
|  | ||||
| @ -1134,42 +1134,8 @@ class Qwen3VLForConditionalGeneration(Qwen2_5_VLForConditionalGeneration): | ||||
|             The temporal, height and width of feature shape of each video in LLM. | ||||
|  | ||||
|         Example: | ||||
|  | ||||
|         ```python | ||||
|         >>> from transformers import AutoProcessor, Qwen3VLForConditionalGeneration | ||||
|  | ||||
|         >>> model = Qwen3VLForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") | ||||
|         >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL-8B-Instruct") | ||||
|  | ||||
|         >>> messages = [ | ||||
|             { | ||||
|                 "role": "user", | ||||
|                 "content": [ | ||||
|                     { | ||||
|                         "type": "image", | ||||
|                         "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", | ||||
|                     }, | ||||
|                     {"type": "text", "text": "Describe the image."}, | ||||
|                 ], | ||||
|             } | ||||
|         ] | ||||
|  | ||||
|         >>> inputs = processor.apply_chat_template( | ||||
|             messages, | ||||
|             tokenize=True, | ||||
|             add_generation_prompt=True, | ||||
|             return_dict=True, | ||||
|             return_tensors="pt" | ||||
|         ) | ||||
|  | ||||
|         >>> # Generate | ||||
|         >>> generated_ids = model.generate(**inputs, max_new_tokens=1024) | ||||
|         >>> generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)] | ||||
|         >>> output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] | ||||
|         >>> print(output_text) | ||||
|         ``` | ||||
|             TODO: Add example | ||||
|         """ | ||||
|  | ||||
|         outputs = self.model( | ||||
|             input_ids=input_ids, | ||||
|             pixel_values=pixel_values, | ||||
| @ -1350,6 +1316,7 @@ class Qwen3VLProcessor(Qwen2VLProcessor): | ||||
|                 video_metadata = videos_inputs.pop("video_metadata") | ||||
|             else: | ||||
|                 video_metadata = videos_inputs["video_metadata"] | ||||
|             video_grid_thw = videos_inputs["video_grid_thw"] | ||||
|         else: | ||||
|             videos_inputs = {} | ||||
|             video_grid_thw = None | ||||
|  | ||||
| @ -157,6 +157,7 @@ class Qwen3VLProcessor(ProcessorMixin): | ||||
|                 video_metadata = videos_inputs.pop("video_metadata") | ||||
|             else: | ||||
|                 video_metadata = videos_inputs["video_metadata"] | ||||
|             video_grid_thw = videos_inputs["video_grid_thw"] | ||||
|         else: | ||||
|             videos_inputs = {} | ||||
|             video_grid_thw = None | ||||
|  | ||||
| @ -206,7 +206,7 @@ class VoxtralProcessor(ProcessorMixin): | ||||
|         tokenizer_kwargs = {**processed_kwargs["template_kwargs"], **text_kwargs} | ||||
|         tokenizer_kwargs["return_tensors"] = None  # let's not return tensors here | ||||
|         tokenize = tokenizer_kwargs.pop("tokenize", False) | ||||
|         return_dict = tokenizer_kwargs.pop("return_dict", True) | ||||
|         return_dict = tokenizer_kwargs.pop("return_dict", False) | ||||
|  | ||||
|         encoded_instruct_inputs = self.tokenizer.apply_chat_template( | ||||
|             conversations, | ||||
|  | ||||
| @ -1603,7 +1603,7 @@ class ProcessorMixin(PushToHubMixin): | ||||
|             conversations = [conversation] | ||||
|  | ||||
|         tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False) | ||||
|         return_dict = processed_kwargs["template_kwargs"].pop("return_dict", True) | ||||
|         return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False) | ||||
|         mm_load_kwargs = processed_kwargs["mm_load_kwargs"] | ||||
|  | ||||
|         if tokenize: | ||||
|  | ||||
| @ -383,10 +383,6 @@ class Mxfp4HfQuantizer(HfQuantizer): | ||||
|  | ||||
|         state_dict = model.state_dict() | ||||
|  | ||||
|         # Get num_local_experts from model config | ||||
|         num_local_experts = getattr(model.config, "num_local_experts", 32) | ||||
|         hidden_size = getattr(model.config, "hidden_size", 2880) | ||||
|  | ||||
|         for name, module in model.named_modules(): | ||||
|             if ( | ||||
|                 isinstance(module, Mxfp4GptOssExperts) | ||||
| @ -396,7 +392,7 @@ class Mxfp4HfQuantizer(HfQuantizer): | ||||
|                 state_dict[f"{name}.gate_up_proj_blocks"] = ( | ||||
|                     module.gate_up_proj.storage.layout.unswizzle_data(module.gate_up_proj.storage.data) | ||||
|                     .transpose(-1, -2) | ||||
|                     .reshape(num_local_experts, -1, 90, 16) | ||||
|                     .reshape(32, -1, 90, 16) | ||||
|                 ) | ||||
|                 state_dict[f"{name}.gate_up_proj_scales"] = ( | ||||
|                     module.gate_up_proj_precision_config.weight_scale.storage.layout.unswizzle_data( | ||||
| @ -406,7 +402,7 @@ class Mxfp4HfQuantizer(HfQuantizer): | ||||
|                 state_dict[f"{name}.down_proj_blocks"] = ( | ||||
|                     module.down_proj.storage.layout.unswizzle_data(module.down_proj.storage.data) | ||||
|                     .transpose(-1, -2) | ||||
|                     .reshape(num_local_experts, hidden_size, 90, -1) | ||||
|                     .reshape(32, 2880, 90, -1) | ||||
|                 ) | ||||
|                 state_dict[f"{name}.down_proj_scales"] = ( | ||||
|                     module.down_proj_precision_config.weight_scale.storage.layout.unswizzle_data( | ||||
|  | ||||
| @ -1378,7 +1378,7 @@ class MistralCommonTokenizer(PushToHubMixin): | ||||
|         truncation: bool = False, | ||||
|         max_length: Optional[int] = None, | ||||
|         return_tensors: Optional[Union[str, TensorType]] = None, | ||||
|         return_dict: bool = True, | ||||
|         return_dict: bool = False, | ||||
|         **kwargs, | ||||
|     ) -> Union[str, list[int], list[str], list[list[int]], BatchEncoding]: | ||||
|         """ | ||||
|  | ||||
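Note: this hunk, like the Voxtral and ProcessorMixin ones above and the tokenizer-level hunks that follow, flips the `return_dict` default of `apply_chat_template`. What the flag changes for a caller, sketched with an arbitrary chat-templated tokenizer (checkpoint name is not taken from this diff):

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # assumed chat-templated checkpoint
chat = [{"role": "user", "content": "Hello!"}]

ids = tok.apply_chat_template(chat, tokenize=True, return_dict=False)
enc = tok.apply_chat_template(chat, tokenize=True, return_dict=True)

print(type(ids))           # a plain list of token ids
print(sorted(enc.keys()))  # ['attention_mask', 'input_ids']
```
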
| @ -18,8 +18,6 @@ fronting encoding methods) Special token mixing (host the special tokens logic) | ||||
| of output with special method for the Fast tokenizers) | ||||
| """ | ||||
|  | ||||
| from __future__ import annotations | ||||
|  | ||||
| import copy | ||||
| import json | ||||
| import os | ||||
| @ -785,7 +783,7 @@ class BatchEncoding(UserDict): | ||||
|  | ||||
|         return self | ||||
|  | ||||
|     def to(self, device: Union[str, torch.device], *, non_blocking: bool = False) -> BatchEncoding: | ||||
|     def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding": | ||||
|         """ | ||||
|         Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only). | ||||
|  | ||||
| @ -1588,7 +1586,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|         truncation: bool = False, | ||||
|         max_length: Optional[int] = None, | ||||
|         return_tensors: Optional[Union[str, TensorType]] = None, | ||||
|         return_dict: bool = True, | ||||
|         return_dict: bool = False, | ||||
|         return_assistant_tokens_mask: bool = False, | ||||
|         tokenizer_kwargs: Optional[dict[str, Any]] = None, | ||||
|         **kwargs, | ||||
| @ -1661,11 +1659,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|             set, will return a dict of tokenizer outputs instead. | ||||
|         """ | ||||
|  | ||||
|         if not tokenize: | ||||
|             return_dict = False  # dicts are only returned by the tokenizer anyway | ||||
|         if return_dict and not tokenize: | ||||
|             raise ValueError( | ||||
|                 "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict " | ||||
|                 "of tokenizer outputs to return." | ||||
|             ) | ||||
|  | ||||
|         if return_assistant_tokens_mask and not (return_dict and tokenize): | ||||
|             raise ValueError("`return_assistant_tokens_mask=True` requires `return_dict=True` and `tokenize=True`") | ||||
|         if return_assistant_tokens_mask and not return_dict: | ||||
|             raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`") | ||||
|  | ||||
|         if tokenizer_kwargs is None: | ||||
|             tokenizer_kwargs = {} | ||||
| @ -1780,17 +1781,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|             ) | ||||
|  | ||||
|         if conversation_history is None or len(conversation_history) == 0: | ||||
|             return self.apply_chat_template( | ||||
|                 [message], add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs | ||||
|             ) | ||||
|             return self.apply_chat_template([message], add_generation_prompt=False, tokenize=True, **kwargs) | ||||
|  | ||||
|         conversation = conversation_history + [message] | ||||
|         tokens = self.apply_chat_template( | ||||
|             conversation, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs | ||||
|         ) | ||||
|         tokens = self.apply_chat_template(conversation, add_generation_prompt=False, tokenize=True, **kwargs) | ||||
|  | ||||
|         prefix_tokens = self.apply_chat_template( | ||||
|             conversation_history, add_generation_prompt=False, tokenize=True, return_dict=False, **kwargs | ||||
|             conversation_history, add_generation_prompt=False, tokenize=True, **kwargs | ||||
|         ) | ||||
|         # It's possible that the prefix tokens are not a prefix of the full list of tokens. | ||||
|         # For example, if the prefix is `<s>User: Hi` and the full conversation is `<s>User: Hi</s><s>Assistant: Hello`. | ||||
| @ -1861,11 +1858,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|  | ||||
|         return chat_template | ||||
|  | ||||
|     def parse_response( | ||||
|         self, | ||||
|         response: str | list[str | int | list[int]] | np.ndarray | torch.Tensor, | ||||
|         schema: list | dict | None = None, | ||||
|     ): | ||||
|     def parse_response(self, response: str, schema: Optional[Union[list, dict]] = None): | ||||
|         """ | ||||
|         Converts an output string created by generating text from a model into a parsed message dictionary. | ||||
|         This method is intended for use with chat models, and will read the tokenizer's `response_schema` attribute to | ||||
| @ -1876,29 +1869,16 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|  | ||||
|         Args: | ||||
|             response (`str`): | ||||
|                 The output string generated by the model. This can be either a decoded string or list of strings, | ||||
|                 or token IDs as a list/array. | ||||
|                 The output string generated by the model. This should be the decoded string, not raw tokens. | ||||
|             schema (`Union[list, dict]`, *optional*): | ||||
|                 A response schema that indicates the expected output format and how parsing should be performed. | ||||
|                 If not provided, the tokenizer's `response_schema` attribute will be used. | ||||
|         """ | ||||
|         batched = ( | ||||
|             (isinstance(response, list) and not isinstance(response[0], int)) | ||||
|             or getattr(response, "ndim", 0) > 1  # For torch/numpy tensors | ||||
|         ) | ||||
|  | ||||
|         if schema is None: | ||||
|             if getattr(self, "response_schema", None) is None: | ||||
|                 raise AttributeError("This tokenizer does not have a `response_schema` for parsing chat responses!") | ||||
|             schema = self.response_schema | ||||
|         if batched: | ||||
|             if not (isinstance(response, list) and isinstance(response[0], str)): | ||||
|                 response = self.batch_decode(response) | ||||
|             return [recursive_parse(single_response, schema) for single_response in response] | ||||
|         else: | ||||
|             if not isinstance(response, str): | ||||
|                 response = self.decode(response) | ||||
|             return recursive_parse(response, schema) | ||||
|         return recursive_parse(response, schema) | ||||
|  | ||||
|     @classmethod | ||||
|     def from_pretrained( | ||||
| @ -3883,7 +3863,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|  | ||||
|     def batch_decode( | ||||
|         self, | ||||
|         sequences: Union[list[int], list[list[int]], np.ndarray, torch.Tensor], | ||||
|         sequences: Union[list[int], list[list[int]], np.ndarray, "torch.Tensor"], | ||||
|         skip_special_tokens: bool = False, | ||||
|         clean_up_tokenization_spaces: Optional[bool] = None, | ||||
|         **kwargs, | ||||
| @ -3917,7 +3897,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): | ||||
|  | ||||
|     def decode( | ||||
|         self, | ||||
|         token_ids: Union[int, list[int], np.ndarray, torch.Tensor], | ||||
|         token_ids: Union[int, list[int], np.ndarray, "torch.Tensor"], | ||||
|         skip_special_tokens: bool = False, | ||||
|         clean_up_tokenization_spaces: Optional[bool] = None, | ||||
|         **kwargs, | ||||
|  | ||||
| @ -173,12 +173,12 @@ def recursive_parse( | ||||
|             return parsed_schema | ||||
|         elif isinstance(node_content, dict): | ||||
|             for key, child_node in node_schema.get("properties", {}).items(): | ||||
|                 if "const" in child_node: | ||||
|                     parsed_schema[key] = child_node["const"] | ||||
|                 elif key in node_content: | ||||
|                 if key in node_content: | ||||
|                     parsed_schema[key] = recursive_parse(node_content[key], child_node) | ||||
|                 elif "default" in child_node: | ||||
|                     parsed_schema[key] = child_node["default"] | ||||
|                 else: | ||||
|                     pass | ||||
|             if "additionalProperties" in node_schema: | ||||
|                 for key, value in node_content.items(): | ||||
|                     if key not in node_schema.get("properties", {}): | ||||
|  | ||||
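Note: the `recursive_parse` hunk above changes the order in which a property is filled: a schema-pinned `"const"` wins, then a value present in the model output, then the schema `"default"`. A toy, standalone function that only mirrors that precedence (it is not the library's `recursive_parse` and does not recurse):

```python
def fill_properties(node_content: dict, node_schema: dict) -> dict:
    parsed = {}
    for key, child in node_schema.get("properties", {}).items():
        if "const" in child:              # value pinned by the schema
            parsed[key] = child["const"]
        elif key in node_content:         # value present in the model output
            parsed[key] = node_content[key]
        elif "default" in child:          # fall back to the schema default
            parsed[key] = child["default"]
    return parsed

schema = {"properties": {"role": {"const": "assistant"}, "content": {}, "tool_calls": {"default": []}}}
print(fill_properties({"content": "Hi!"}, schema))
# {'role': 'assistant', 'content': 'Hi!', 'tool_calls': []}
```
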
| @ -47,7 +47,8 @@ PACKAGE_DISTRIBUTION_MAPPING = importlib.metadata.packages_distributions() | ||||
| def _is_package_available(pkg_name: str, return_version: bool = False) -> tuple[bool, str] | bool: | ||||
|     """Check if `pkg_name` exist, and optionally try to get its version""" | ||||
|     spec = importlib.util.find_spec(pkg_name) | ||||
|     package_exists = spec is not None | ||||
|     # the spec might be not None but not importable | ||||
|     package_exists = spec is not None and spec.loader is not None | ||||
|     package_version = "N/A" | ||||
|     if package_exists and return_version: | ||||
|         try: | ||||
|  | ||||
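Note: the `_is_package_available` hunk above adds a check that the spec found actually has a loader. On some setups a leftover directory on `sys.path` (for instance from a partially removed install) can resolve to a namespace-package spec with no loader even though the package is not really usable; the extra check filters that case out. A minimal sketch of the guard (the module name is hypothetical):

```python
import importlib.util

spec = importlib.util.find_spec("some_leftover_dir")           # hypothetical name
package_exists = spec is not None and spec.loader is not None  # False for loader-less specs
print(package_exists)
```
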
| @ -520,6 +520,7 @@ class AriaForConditionalGenerationIntegrationTest(unittest.TestCase): | ||||
|             quantization_config=BitsAndBytesConfig(load_in_4bit=True, llm_int8_skip_modules=["multihead_attn"]), | ||||
|         ) | ||||
|         processor = AutoProcessor.from_pretrained(model_id) | ||||
|         assert model.device.type == "cuda", "This test is only supported on CUDA"  # TODO: remove this | ||||
|         # Prepare inputs with no images | ||||
|         inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) | ||||
|  | ||||
|  | ||||
| @ -267,7 +267,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         EXPECTED_LOGITS = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): [1.6699, 0.6260, 3.2266, 8.5547, 2.209], | ||||
|                 ("xpu", 3): [0.4109, 0.1532, 0.8018, 2.1328, 0.5483], | ||||
|                 # 4-bit | ||||
|                 ("cuda", 7): [0.1097, 0.3481, 3.8340, 9.7969, 2.0488], | ||||
|                 ("cuda", 8): [1.6396, 0.6094, 3.1992, 8.5234, 2.1875], | ||||
| @ -308,7 +308,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit sky,\nNature's quiet song.", | ||||
|                 ("xpu", 3): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.", | ||||
|                 # 4-bit | ||||
|                 ("cuda", 7): "Sure, here's a haiku for you:\n\nMorning dew sparkles,\nPetals unfold in sunlight,\n", | ||||
|                 ("cuda", 8): "Whispers on the breeze,\nLeaves dance under moonlit skies,\nNature's quiet song.", | ||||
| @ -474,7 +474,7 @@ class AyaVisionIntegrationTest(unittest.TestCase): | ||||
|         # Batching seems to alter the output slightly, but it is also the case in the original implementation. This seems to be expected: https://github.com/huggingface/transformers/issues/23017#issuecomment-1649630232 | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.", | ||||
|                 ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.", | ||||
|                 ("cuda", 7): 'Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene', | ||||
|                 ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.', | ||||
|             } | ||||
|  | ||||
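Note: the AyaVision test hunks above only update values inside `Expectations` tables, which key expected results by accelerator (device type plus a version/capability number, as the keys suggest). A hedged sketch of how such a table is typically consumed in these tests, assuming `Expectations.get_expectation()` resolves the entry for the current machine:

```python
from transformers.testing_utils import Expectations

expected_outputs = Expectations(
    {
        ("xpu", 3): "Whispers on the breeze, ...",
        ("cuda", 7): "Sure, here's a haiku for you: ...",
        ("cuda", 8): "Whispers on the breeze, ...",
    }
)
expected_output = expected_outputs.get_expectation()  # entry matching the running accelerator
```
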
| @ -16,7 +16,6 @@ | ||||
| import copy | ||||
| import tempfile | ||||
| import unittest | ||||
| import unittest.mock | ||||
| from functools import cached_property | ||||
|  | ||||
| import timeout_decorator  # noqa | ||||
| @ -478,23 +477,6 @@ class BartModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin | ||||
|             with torch.no_grad(): | ||||
|                 model(**inputs)[0] | ||||
|  | ||||
|     def test_input_embeddings_support_forward_hook(self): | ||||
|         # Make sure that registering hooks on the input embeddings are indeed called | ||||
|         # in forward. This is necessary for gradient checkpointing in PEFT, see also #41821. | ||||
|         config, inputs_dict = self.model_tester.prepare_config_and_inputs() | ||||
|         for model_class in self.all_model_classes: | ||||
|             model = model_class(config) | ||||
|             model.to(torch_device) | ||||
|             model.eval() | ||||
|  | ||||
|             hook = unittest.mock.MagicMock(return_value=None) | ||||
|             model.get_input_embeddings().register_forward_hook(hook) | ||||
|  | ||||
|             inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) | ||||
|             model(**inputs) | ||||
|  | ||||
|             self.assertGreater(hook.call_count, 0) | ||||
|  | ||||
|     @require_torch_fp16 | ||||
|     def test_generate_fp16(self): | ||||
|         config, input_dict = self.model_tester.prepare_config_and_inputs() | ||||
|  | ||||
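The test removed above verified that forward hooks registered on the input embedding layer fire during `forward`, which PEFT's gradient checkpointing depends on. A minimal standalone illustration of that hook mechanism (plain PyTorch with an illustrative toy model, not the Bart test itself) could look like this:

# Sketch: check that a forward hook on an embedding module is called.
import unittest.mock

import torch
import torch.nn as nn

embed = nn.Embedding(num_embeddings=100, embedding_dim=8)
model = nn.Sequential(embed, nn.Linear(8, 4))

hook = unittest.mock.MagicMock(return_value=None)
embed.register_forward_hook(hook)  # invoked as hook(module, inputs, output)

with torch.no_grad():
    model(torch.randint(0, 100, (2, 5)))

assert hook.call_count > 0  # the hook fired during forward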
| @ -18,6 +18,7 @@ import unittest | ||||
| from functools import cached_property | ||||
|  | ||||
| from transformers import BlenderbotTokenizer, BlenderbotTokenizerFast | ||||
| from transformers.testing_utils import require_jinja | ||||
|  | ||||
|  | ||||
| class Blenderbot3BTokenizerTests(unittest.TestCase): | ||||
| @ -50,3 +51,24 @@ class Blenderbot3BTokenizerTests(unittest.TestCase): | ||||
|     def test_3B_tokenization_same_as_parlai_rust_tokenizer(self): | ||||
|         assert self.rust_tokenizer_3b.add_prefix_space | ||||
|         assert self.rust_tokenizer_3b([" Sam", "Sam"]).input_ids == [[5502, 2], [5502, 2]] | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tok = self.tokenizer_3b | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         tokenized_chats = [tok.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         expected_tokens = [ | ||||
|             [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 2], | ||||
|             [553, 366, 265, 4792, 3879, 73, 311, 21, 228, 228, 6950, 8, 228, 3490, 287, 2273, 304, 21, 2], | ||||
|             [3490, 287, 2273, 304, 21, 228, 228, 6950, 8, 2], | ||||
|         ] | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
| @ -18,7 +18,7 @@ import unittest | ||||
| from datasets import load_dataset | ||||
|  | ||||
| from transformers import BloomTokenizerFast | ||||
| from transformers.testing_utils import require_tokenizers | ||||
| from transformers.testing_utils import require_jinja, require_tokenizers | ||||
|  | ||||
| from ...test_tokenization_common import TokenizerTesterMixin | ||||
|  | ||||
| @ -137,6 +137,28 @@ class BloomTokenizationTest(TokenizerTesterMixin, unittest.TestCase): | ||||
|         predicted_text = [tokenizer.decode(x, clean_up_tokenization_spaces=False) for x in output_tokens] | ||||
|         self.assertListEqual(predicted_text, input_text) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = self.get_rust_tokenizer() | ||||
|         tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         expected_tokens = [ | ||||
|             [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2], | ||||
|             [5448, 1306, 267, 66799, 44799, 37143, 17, 2, 59414, 4, 2, 229126, 427, 11890, 1152, 17, 2], | ||||
|             [229126, 427, 11890, 1152, 17, 2, 59414, 4, 2], | ||||
|         ] | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
|     def test_add_prefix_space_fast(self): | ||||
|         tokenizer_w_prefix = self.get_rust_tokenizer(add_prefix_space=True) | ||||
|         tokenizer_wo_prefix = self.get_rust_tokenizer(add_prefix_space=False) | ||||
|  | ||||
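Several of the new `test_tokenization_for_chat` tests assign a minimal Jinja chat template before calling `apply_chat_template`. To see the string such a template produces before tokenization, it can be rendered directly with `jinja2` — a standalone sketch only; the `</s>` eos token value here is illustrative, not necessarily the tokenizer's actual token:

# Sketch: render the minimal chat template used in the tests above with jinja2,
# showing the text that apply_chat_template would then tokenize.
from jinja2 import Template

chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}"
messages = [
    {"role": "system", "content": "You are a helpful chatbot."},
    {"role": "user", "content": "Hello!"},
]

rendered = Template(chat_template).render(messages=messages, eos_token="</s>")
print(rendered)  # You are a helpful chatbot.</s>Hello!</s>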
| @ -146,6 +146,32 @@ class CohereTokenizationTest(TokenizerTesterMixin, unittest.TestCase): | ||||
|         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_vocab_files_map), 1) | ||||
|         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_vocab_files_map.values())[0]), 1) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = self.get_rust_tokenizer() | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|         ] | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         # fmt: off | ||||
|         expected_tokens = [ | ||||
|             [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, 59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, 45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8], | ||||
|             [5, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 59, 65, | ||||
|             59, 60, 45, 53, 71, 60, 55, 51, 45, 54, 99, 38, 65, 243, 394, 204, 336, 84, 88, 887, 374, 216, 74, 286, 22, 8, | ||||
|             36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, 58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 61, 59, | ||||
|             45, 58, 71, 60, 55, 51, 45, 54, 99, 38, 48, 420, 87, 9, 8, 36, 99, 59, 60, 41, 58, 60, 71, 55, 46, 71, 60, 61, | ||||
|             58, 54, 71, 60, 55, 51, 45, 54, 99, 38, 36, 99, 43, 48, 41, 60, 42, 55, 60, 71, 60, 55, 51, 45, 54, 99, 38, | ||||
|             54, 567, 235, 693, 276, 411, 243, 22, 8] | ||||
|         ] | ||||
|         # fmt: on | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_tool_use(self): | ||||
|         tokenizer = self.get_rust_tokenizer() | ||||
|  | ||||
| @ -27,6 +27,7 @@ from transformers.convert_slow_tokenizer import convert_slow_tokenizer | ||||
| from transformers.testing_utils import ( | ||||
|     get_tests_dir, | ||||
|     nested_simplify, | ||||
|     require_jinja, | ||||
|     require_read_token, | ||||
|     require_sentencepiece, | ||||
|     require_tokenizers, | ||||
| @ -427,6 +428,25 @@ class GemmaIntegrationTest(unittest.TestCase): | ||||
|         # a dummy prefix space is not added by the sp_model as it was de-activated | ||||
|         self.assertEqual(tokens, tokenizer.sp_model.encode("▁▁", out_type=str)) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = GemmaTokenizer.from_pretrained("hf-internal-testing/dummy-gemma") | ||||
|  | ||||
|         test_chats = [ | ||||
|             [{"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         # Matt: The third test case tests the default system message, but if this is ever changed in the | ||||
|         #       class/repo code then that test will fail, and the case will need to be updated. | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         expected_tokens = [[235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108, 235322, 235371, 571, 235298, 2997, 73786, 105776, 108, 7731, 577, 4664, 692, 35606, 235371, 571, 235298, 615, 73786, 108], [235322, 235371, 571, 235298, 2997, 73786, 1645, 108, 4521, 149907, 235371, 571, 235298, 615, 73786, 108]]  # fmt: skip | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
|     def test_save_fast_load_slow(self): | ||||
|         # Ensure that we can save a fast tokenizer and load it as a slow tokenizer | ||||
|         slow_tokenizer = self.tokenizer | ||||
|  | ||||
| @ -499,7 +499,7 @@ class Gemma3IntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         EXPECTED_TEXTS = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'], | ||||
|                 ("xpu", 3): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach with turquoise water in the background. It looks like a lovely,'], | ||||
|                 ("cuda", (8, 0)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear turquoise water and a blue sky in the background. It looks like'], | ||||
|                 ("cuda", (8, 6)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with clear blue water and a blue sky in the background. It looks like'], | ||||
|                 ("rocm", (9, 4)): ['user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a'], | ||||
| @ -610,7 +610,7 @@ class Gemma3IntegrationTest(unittest.TestCase): | ||||
|         EXPECTED_NUM_IMAGES = 3  # one for the origin image and two crops of images | ||||
|         EXPECTED_TEXTS = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the"], | ||||
|                 ("xpu", 3): ['user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.'], | ||||
|                 ("cuda", 7): [], | ||||
|                 ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a clear blue sky with some white clouds above."], | ||||
|                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background"], | ||||
|  | ||||
| @ -24,9 +24,7 @@ from transformers import ( | ||||
|     is_torch_available, | ||||
| ) | ||||
| from transformers.testing_utils import ( | ||||
|     Expectations, | ||||
|     cleanup, | ||||
|     require_deterministic_for_xpu, | ||||
|     require_flash_attn, | ||||
|     require_torch, | ||||
|     require_torch_gpu, | ||||
| @ -415,7 +413,6 @@ class Glm4vIntegrationTest(unittest.TestCase): | ||||
|         ) | ||||
|  | ||||
|     @slow | ||||
|     @require_deterministic_for_xpu | ||||
|     def test_small_model_integration_test_expand(self): | ||||
|         model = Glm4vForConditionalGeneration.from_pretrained( | ||||
|             "THUDM/GLM-4.1V-9B-Thinking", dtype="auto", device_map="auto" | ||||
| @ -429,23 +426,14 @@ class Glm4vIntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         output = model.generate(**inputs, max_new_tokens=30, do_sample=False, num_beams=2, num_return_sequences=2) | ||||
|  | ||||
|         # fmt: off | ||||
|         EXPECTED_DECODED_TEXTS = Expectations( | ||||
|             { | ||||
|  | ||||
|                 (None, None): ["\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically", | ||||
|                                "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically" | ||||
|                               ], | ||||
|                 ("xpu", None): ["\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat. Specifically, it looks", | ||||
|                                 "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture is not a dog; it's a cat, specifically a Pallas" | ||||
|                                ], | ||||
|             } | ||||
|         EXPECTED_DECODED_TEXT = [ | ||||
|             "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat. Specifically", | ||||
|             "\nWhat kind of dog is this?\n<think>Got it, let's look at the image. The animal in the picture doesn't look like a dog; it's actually a cat, specifically" | ||||
|         ]  # fmt: skip | ||||
|         self.assertEqual( | ||||
|             self.processor.batch_decode(output, skip_special_tokens=True), | ||||
|             EXPECTED_DECODED_TEXT, | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation() | ||||
|  | ||||
|         decoded_text = self.processor.batch_decode(output, skip_special_tokens=True) | ||||
|         self.assertEqual(decoded_text, EXPECTED_DECODED_TEXT) | ||||
|  | ||||
|     @slow | ||||
|     def test_small_model_integration_test_batch_wo_image(self): | ||||
|  | ||||
| @ -19,7 +19,7 @@ import unittest | ||||
|  | ||||
| from transformers import AutoTokenizer, GPT2Tokenizer, GPT2TokenizerFast | ||||
| from transformers.models.gpt2.tokenization_gpt2 import VOCAB_FILES_NAMES | ||||
| from transformers.testing_utils import require_tiktoken, require_tokenizers | ||||
| from transformers.testing_utils import require_jinja, require_tiktoken, require_tokenizers | ||||
|  | ||||
| from ...test_tokenization_common import TokenizerTesterMixin | ||||
|  | ||||
| @ -281,6 +281,28 @@ class GPT2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): | ||||
|                 filtered_sequence = [x for x in filtered_sequence if x is not None] | ||||
|                 self.assertEqual(encoded_sequence, filtered_sequence) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname) | ||||
|         tokenizer.chat_template = "{% for message in messages %}{{ message.content }}{{ eos_token }}{% endfor %}" | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         # fmt: off | ||||
|         expected_tokens = [[20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20], | ||||
|                           [20, 1, 20, 10, 20, 4, 3, 10, 20, 10, 20, 3, 0, 20, 20, 20, 0, 10, 20, 20, 20, 6, 20, 1, 6, 20, 20, 20, 3, 0, 0, 1, 20, 20, 20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20], | ||||
|                           [20, 7, 20, 3, 10, 6, 1, 10, 20, 3, 3, 6, 10, 20, 1, 20, 20, 20, 20, 3, 0, 0, 1, 20, 20]] | ||||
|         # fmt: on | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
|     @require_tiktoken | ||||
|     def test_tokenization_tiktoken(self): | ||||
|         from tiktoken import encoding_name_for_model | ||||
|  | ||||
| @ -15,7 +15,7 @@ | ||||
| import unittest | ||||
|  | ||||
| from transformers import GPTSw3Tokenizer | ||||
| from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers, slow | ||||
| from transformers.testing_utils import get_tests_dir, require_jinja, require_sentencepiece, require_tokenizers, slow | ||||
|  | ||||
| from ...test_tokenization_common import TokenizerTesterMixin | ||||
|  | ||||
| @ -127,3 +127,36 @@ class GPTSw3TokenizationTest(TokenizerTesterMixin, unittest.TestCase): | ||||
|             model_name="AI-Sweden-Models/gpt-sw3-126m", | ||||
|             sequences=sequences, | ||||
|         ) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = GPTSw3Tokenizer(SAMPLE_VOCAB) | ||||
|         tokenizer.chat_template = ( | ||||
|             "{{ eos_token }}{{ bos_token }}" | ||||
|             "{% for message in messages %}" | ||||
|             "{% if message['role'] == 'user' %}{{ 'User: ' + message['content']}}" | ||||
|             "{% else %}{{ 'Bot: ' + message['content']}}{% endif %}" | ||||
|             "{{ message['text'] }}{{ bos_token }}" | ||||
|             "{% endfor %}" | ||||
|             "Bot:" | ||||
|         ) | ||||
|         # This is in English, but it's just here to make sure the chat control tokens are being added properly | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "assistant", "content": "Nice to meet you."}, {"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         # fmt: off | ||||
|         expected_tokens = [ | ||||
|             [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419], | ||||
|             [2000, 1, 575, 541, 419, 530, 339, 265, 878, 708, 727, 275, 347, 541, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 575, 541, 419], | ||||
|             [2000, 1, 575, 541, 419, 984, 429, 281, 264, 1261, 291, 260, 1, 968, 263, 314, 419, 366, 354, 294, 360, 1, 575, 541, 419] | ||||
|             ] | ||||
|         # fmt: on | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
| @ -682,7 +682,7 @@ class GroundingDinoModelIntegrationTests(unittest.TestCase): | ||||
|  | ||||
|         expectations = Expectations( | ||||
|             { | ||||
|                 (None, None): [0.4526, 0.4082], | ||||
|                 (None, None): [[0.4526, 0.4082]], | ||||
|                 ("cuda", 8): [0.4524, 0.4074], | ||||
|             } | ||||
|         ) | ||||
|  | ||||
| @ -227,7 +227,6 @@ class InternVLQwen2IntegrationTest(unittest.TestCase): | ||||
|     def tearDown(self): | ||||
|         cleanup(torch_device, gc_collect=True) | ||||
|  | ||||
|     @require_deterministic_for_xpu | ||||
|     def test_qwen2_small_model_integration_generate(self): | ||||
|         processor = AutoProcessor.from_pretrained(self.small_model_checkpoint) | ||||
|         model = InternVLForConditionalGeneration.from_pretrained( | ||||
| @ -245,16 +244,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase): | ||||
|             decoded_output = processor.decode( | ||||
|                 generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True | ||||
|             ) | ||||
|  | ||||
|         # fmt: off | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 (None, None): "The image shows two cats lying on a pink surface, which appears to be a bed or couch.", | ||||
|                 ("xpu", 3): "The image shows two cats lying on a pink blanket. The cat on the left is a tabby", | ||||
|             } | ||||
|         ) | ||||
|         # fmt: on | ||||
|         expected_output = expected_outputs.get_expectation() | ||||
|         expected_output = "The image shows two cats lying on a pink surface, which appears to be a bed or couch." | ||||
|  | ||||
|         self.assertEqual(decoded_output, expected_output) | ||||
|  | ||||
| @ -278,9 +268,9 @@ class InternVLQwen2IntegrationTest(unittest.TestCase): | ||||
|         actual_logits = output.logits[0, -1, :5].cpu() | ||||
|         expected_logits_all = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): torch.tensor([11.9922, 14.7188, 14.3125, 10.6719, 6.9297], dtype=torch.float16), | ||||
|                 ("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562, 6.9219], dtype=torch.float16), | ||||
|                 ("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484, 6.9141], dtype=torch.float16), | ||||
|                 ("xpu", 3): torch.tensor([11.7500, 14.7500, 14.1250, 10.5625, 6.7812], dtype=torch.float16), | ||||
|                 ("cuda", 7): torch.tensor([11.9531, 14.7031, 14.2734, 10.6562,  6.9219], dtype=torch.float16), | ||||
|                 ("cuda", 8): torch.tensor([11.9609, 14.7188, 14.2734, 10.6484,  6.9141], dtype=torch.float16), | ||||
|             } | ||||
|         )  # fmt: skip | ||||
|         expected_logits = expected_logits_all.get_expectation() | ||||
| @ -308,7 +298,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): "Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.", | ||||
|                 ("xpu", 3): "Whispers of dawn,\nSilent whispers of the night,\nNew day's light.", | ||||
|                 ("cuda", 7): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.', | ||||
|                 ("cuda", 8): 'Whispers of dawn,\nSilent whispers of night,\nPeace in the stillness.', | ||||
|             } | ||||
| @ -580,7 +570,7 @@ class InternVLQwen2IntegrationTest(unittest.TestCase): | ||||
|         decoded_output = processor.decode(output[1], skip_special_tokens=True) | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot", | ||||
|                 ("xpu", 3): "user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nThe man is performing a forehand shot.", | ||||
|                 ("cuda", 7): 'user\nFrame1: \nFrame2: \nFrame3: \nFrame4: \nFrame5: \nFrame6: \nFrame7: \nFrame8: \nWhat type of shot is the man performing?\nassistant\nA forehand shot', | ||||
|             } | ||||
|         )  # fmt: skip | ||||
|  | ||||
| @ -18,9 +18,7 @@ import unittest | ||||
|  | ||||
| from transformers import AutoTokenizer, is_torch_available, set_seed | ||||
| from transformers.testing_utils import ( | ||||
|     Expectations, | ||||
|     cleanup, | ||||
|     require_deterministic_for_xpu, | ||||
|     require_read_token, | ||||
|     require_torch, | ||||
|     require_torch_accelerator, | ||||
| @ -172,30 +170,36 @@ class Lfm2MoeIntegrationTest(unittest.TestCase): | ||||
|         input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) | ||||
|         with torch.no_grad(): | ||||
|             out = model(input_ids).logits.float().cpu() | ||||
|         # fmt: off | ||||
|         # Expected mean on dim = -1 | ||||
|         EXPECTED_MEANS = Expectations( | ||||
|             { | ||||
|                 ("cuda", None): torch.tensor([[-1.3855, -0.5123, -1.3143, -1.2144, -1.0791, -1.2117, -1.4704, -0.7648, -0.6175, -1.2402, -1.1459, -1.0083, -1.0247, -0.8830, -1.5643, -1.7266, -1.6254,]]), | ||||
|                 ("xpu", None): torch.tensor([[-1.3863, -0.4653, -1.3246, -1.3199, -1.0940, -1.2254, -1.4716, -0.8852, -0.5920, -1.2182, -1.1782, -1.0268, -1.0114, -0.8816, -1.5774, -1.7408, -1.6147,]]), | ||||
|             } | ||||
|         EXPECTED_MEAN = torch.tensor( | ||||
|             [ | ||||
|                 [ | ||||
|                     -1.3855, | ||||
|                     -0.5123, | ||||
|                     -1.3143, | ||||
|                     -1.2144, | ||||
|                     -1.0791, | ||||
|                     -1.2117, | ||||
|                     -1.4704, | ||||
|                     -0.7648, | ||||
|                     -0.6175, | ||||
|                     -1.2402, | ||||
|                     -1.1459, | ||||
|                     -1.0083, | ||||
|                     -1.0247, | ||||
|                     -0.8830, | ||||
|                     -1.5643, | ||||
|                     -1.7266, | ||||
|                     -1.6254, | ||||
|                 ] | ||||
|             ] | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_MEAN = EXPECTED_MEANS.get_expectation() | ||||
|         out_mean = out.mean(-1) | ||||
|         torch.testing.assert_close(out_mean, EXPECTED_MEAN, rtol=1e-2, atol=1e-2) | ||||
|         # fmt: off | ||||
|         torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, rtol=1e-2, atol=1e-2) | ||||
|         # Expected portion of the logits | ||||
|         EXPECTED_SLICES = Expectations( | ||||
|             { | ||||
|                 ("cuda", None): torch.tensor([-1.2656, 2.4844, 5.5000, -1.3359, -1.3203, -1.3438, 1.9375, 5.8438, -0.6523, -1.2891]), | ||||
|                 ("xpu", None): torch.tensor([-1.2656, 2.4531, 5.4375, -1.3438, -1.3203, -1.3516, 1.9297, 5.7812, -0.6719, -1.3203]), | ||||
|             } | ||||
|         EXPECTED_SLICE = torch.tensor( | ||||
|             [-1.2656, 2.4844, 5.5000, -1.3359, -1.3203, -1.3438, 1.9375, 5.8438, -0.6523, -1.2891] | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_SLICE = EXPECTED_SLICES.get_expectation() | ||||
|         out_slice = out[0, 0, :10] | ||||
|         torch.testing.assert_close(out_slice, EXPECTED_SLICE, rtol=1e-4, atol=1e-4) | ||||
|         torch.testing.assert_close(out[0, 0, :10], EXPECTED_SLICE, rtol=1e-4, atol=1e-4) | ||||
|  | ||||
|     @slow | ||||
|     def test_model_1a8b_generation(self): | ||||
| @ -213,25 +217,13 @@ class Lfm2MoeIntegrationTest(unittest.TestCase): | ||||
|         self.assertEqual(EXPECTED_TEXT_COMPLETION, text) | ||||
|  | ||||
|     @slow | ||||
|     @require_deterministic_for_xpu | ||||
|     def test_model_1a8b_batched_chat_generation(self): | ||||
|         prompts = ["Who are you?", "Complete the text: Lorem ipsum dolor ", "The Meji Restoration in Japan ended"] | ||||
|         # fmt: off | ||||
|         EXPECTED_TEXT_COMPLETIONS = Expectations( | ||||
|             { | ||||
|                 ("cuda", None): ["Who are you?, a language model designed to assist with information and tasks?  \nI am", | ||||
|                                  "Complete the text: Lorem ipsum dolor ipsum dolor ipsum dolor ipsum dolor ipsum dolor", | ||||
|                                  "The Meji Restoration in Japan ended or the Meiji Restoration (1868–1912) marked a pivotal", | ||||
|                                 ], | ||||
|                 ("xpu", None): ['Who are you? (AI) designed to assist?  \nI am an AI assistant developed to', | ||||
|                                 'Complete the text: Lorem ipsum dolor ipsum dolor ipsum dolor ipsum dolor ipsum dolor', | ||||
|                                 'The Meji Restoration in Japan ended**  \n* **Key Event:** The overthrow of the Tokugawa' | ||||
|                                ], | ||||
|             } | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_TEXT_COMPLETION = EXPECTED_TEXT_COMPLETIONS.get_expectation() | ||||
|  | ||||
|         EXPECTED_TEXT_COMPLETIONS = [ | ||||
|             "Who are you?, a language model designed to assist with information and tasks?  \nI am", | ||||
|             "Complete the text: Lorem ipsum dolor ipsum dolor ipsum dolor ipsum dolor ipsum dolor", | ||||
|             "The Meji Restoration in Japan ended or the Meiji Restoration (1868–1912) marked a pivotal", | ||||
|         ] | ||||
|         set_seed(1789) | ||||
|         tokenizer = AutoTokenizer.from_pretrained("LiquidAI/LFM2-8B-A1B", use_fast=False) | ||||
|         model = self.get_model() | ||||
| @ -241,4 +233,4 @@ class Lfm2MoeIntegrationTest(unittest.TestCase): | ||||
|         with torch.no_grad(): | ||||
|             generated_ids = model.generate(**batched_input_ids, max_new_tokens=15, do_sample=False) | ||||
|         text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) | ||||
|         self.assertEqual(EXPECTED_TEXT_COMPLETION, text) | ||||
|         self.assertEqual(EXPECTED_TEXT_COMPLETIONS, text) | ||||
|  | ||||
| @ -32,6 +32,7 @@ from transformers.convert_slow_tokenizer import convert_slow_tokenizer | ||||
| from transformers.testing_utils import ( | ||||
|     get_tests_dir, | ||||
|     nested_simplify, | ||||
|     require_jinja, | ||||
|     require_read_token, | ||||
|     require_sentencepiece, | ||||
|     require_tiktoken, | ||||
| @ -701,6 +702,32 @@ class LlamaIntegrationTest(unittest.TestCase): | ||||
|         with self.assertRaises(ValueError): | ||||
|             tokenizer = LlamaTokenizerFast(SAMPLE_VOCAB, eos_token=None, add_bos_token=True, add_eos_token=True) | ||||
|  | ||||
|     @require_jinja | ||||
|     def test_tokenization_for_chat(self): | ||||
|         tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b", legacy=False) | ||||
|  | ||||
|         test_chats = [ | ||||
|             [{"role": "system", "content": "You are a helpful chatbot."}, {"role": "user", "content": "Hello!"}], | ||||
|             [ | ||||
|                 {"role": "system", "content": "You are a helpful chatbot."}, | ||||
|                 {"role": "user", "content": "Hello!"}, | ||||
|                 {"role": "assistant", "content": "Nice to meet you."}, | ||||
|             ], | ||||
|             [{"role": "user", "content": "Hello!"}], | ||||
|         ] | ||||
|         # Matt: The third test case tests the default system message, but if this is ever changed in the | ||||
|         #       class/repo code then that test will fail, and the case will need to be updated. | ||||
|         tokenized_chats = [tokenizer.apply_chat_template(test_chat) for test_chat in test_chats] | ||||
|         # fmt: off | ||||
|         expected_tokens = [ | ||||
|             [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962], | ||||
|             [1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 13563, 7451, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 10994, 29991, 518, 29914, 25580, 29962, 20103, 304, 5870, 366, 29889, 29871, 2], | ||||
|             [1, 29961, 25580, 29962, 15043, 29991, 518, 29914, 25580, 29962] | ||||
|         ] | ||||
|         # fmt: on | ||||
|         for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens): | ||||
|             self.assertListEqual(tokenized_chat, expected_tokens) | ||||
|  | ||||
|  | ||||
| @require_sentencepiece | ||||
| @require_tokenizers | ||||
|  | ||||
| @ -152,11 +152,10 @@ class LlavaVisionText2TextModelTester: | ||||
|     def prepare_config_and_inputs_for_common(self): | ||||
|         config_and_inputs = self.prepare_config_and_inputs() | ||||
|         config, pixel_values = config_and_inputs | ||||
|  | ||||
|         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 2 | ||||
|         attention_mask = torch.ones(input_ids.shape, dtype=torch.long).to(torch_device) | ||||
|         input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 | ||||
|         input_ids[input_ids == config.image_token_index] = self.pad_token_id | ||||
|         input_ids[:, : self.num_image_tokens] = config.image_token_index | ||||
|         attention_mask = input_ids.ne(1).to(torch_device) | ||||
|  | ||||
|         inputs_dict = { | ||||
|             "pixel_values": pixel_values, | ||||
|  | ||||
| @ -275,7 +275,6 @@ class Mistral3IntegrationTest(unittest.TestCase): | ||||
|         self.assertEqual(decoded_output, expected_output) | ||||
|  | ||||
|     @require_read_token | ||||
|     @require_deterministic_for_xpu | ||||
|     def test_mistral3_integration_generate(self): | ||||
|         processor = AutoProcessor.from_pretrained(self.model_checkpoint) | ||||
|         processor.chat_template = processor.chat_template.replace('strftime_now("%Y-%m-%d")', '"2025-06-20"') | ||||
| @ -300,7 +299,7 @@ class Mistral3IntegrationTest(unittest.TestCase): | ||||
|  | ||||
|         expected_outputs = Expectations( | ||||
|             { | ||||
|                 ("xpu", 3): "The image features two tabby cats lying on a pink surface, which appears to be a cushion or", | ||||
|                 ("xpu", 3): "The image features two cats resting on a pink blanket. The cat on the left is a kitten", | ||||
|                 ("cuda", 8): 'The image features two cats lying on a pink surface, which appears to be a couch or a bed', | ||||
|                 ("rocm", (9, 4)): "The image features two cats lying on a pink surface, which appears to be a couch or a bed", | ||||
|                 ("rocm", (9, 5)): "The image features two tabby cats lying on a pink surface, which appears to be a cushion or" | ||||
|  | ||||
| @ -146,7 +146,7 @@ class MLCDVisionModelIntegrationTest(unittest.TestCase): | ||||
|     @slow | ||||
|     def test_inference(self): | ||||
|         model_name = "DeepGlint-AI/mlcd-vit-bigG-patch14-448" | ||||
|         model = MLCDVisionModel.from_pretrained(model_name, attn_implementation="eager").to(torch_device) | ||||
|         model = MLCDVisionModel.from_pretrained(model_name).to(torch_device) | ||||
|         processor = AutoProcessor.from_pretrained(model_name) | ||||
|  | ||||
|         # process single image | ||||
|  | ||||
| @ -547,7 +547,7 @@ class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase): | ||||
|         decoded_output = processor.decode(output[0], skip_special_tokens=True) | ||||
|         expected_outputs = Expectations( | ||||
|                 { | ||||
|                     ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever", | ||||
|                     ("xpu", 3): "If I had to write a haiku about my life, I would write:\nLife is a messy tapestry\n Threads of joy and sorrow\nWeft of memories", | ||||
|                     ("cuda", 7): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever", | ||||
|                     ("cuda", 8): "If I had to write a haiku about my life, I would write:\nLife is a messy stream\nRipples of joy and pain\nFlowing, ever", | ||||
|                 } | ||||
|  | ||||
| @ -193,15 +193,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase): | ||||
|     @require_bitsandbytes | ||||
|     @require_read_token | ||||
|     def test_model_2b_8bit(self): | ||||
|         # fmt: off | ||||
|         EXPECTED_TEXTS = Expectations( | ||||
|             { | ||||
|                 ("xpu", None): ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am stuck', "Hi today I'm going to show you how to make a simple and easy to make a 3D"], | ||||
|                 (None, None): ['Hello I am doing a project on the topic of "The impact of social media on the society" and I am looking', "Hi today I'm going to show you how to make a simple and easy to make a 3D"], | ||||
|             } | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_TEXT = EXPECTED_TEXTS.get_expectation() | ||||
|         EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of social media on the society" and I am looking', "Hi today I'm going to show you how to make a simple and easy to make a 3D"]  # fmt: skip | ||||
|  | ||||
|         model = AutoModelForCausalLM.from_pretrained( | ||||
|             "gg-hf/recurrent-gemma-2b-hf", | ||||
| @ -216,7 +208,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase): | ||||
|         output = model.generate(**inputs, max_new_tokens=20, do_sample=False) | ||||
|         output_text = tokenizer.batch_decode(output, skip_special_tokens=True) | ||||
|  | ||||
|         self.assertEqual(output_text, EXPECTED_TEXT) | ||||
|         self.assertEqual(output_text, EXPECTED_TEXTS) | ||||
|  | ||||
|     @require_read_token | ||||
|     def test_long_context(self): | ||||
|  | ||||
| @ -23,7 +23,6 @@ from transformers import ( | ||||
|     is_torch_available, | ||||
| ) | ||||
| from transformers.testing_utils import ( | ||||
|     Expectations, | ||||
|     cleanup, | ||||
|     require_torch, | ||||
|     slow, | ||||
| @ -298,15 +297,9 @@ class VoxtralForConditionalGenerationIntegrationTest(unittest.TestCase): | ||||
|         outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500) | ||||
|         decoded_outputs = self.processor.batch_decode(outputs, skip_special_tokens=True) | ||||
|  | ||||
|         # fmt: off | ||||
|         EXPECTED_OUTPUTS = Expectations( | ||||
|             { | ||||
|                 (None, None): ["What can you tell me about this audio?This audio is a farewell address by President Barack Obama, delivered in Chicago. In the speech, he reflects on his eight years in office, highlighting the resilience, hope, and unity of the American people. He acknowledges the diverse perspectives and conversations he had with the public, which kept him honest and inspired. The president also emphasizes the importance of self-government and civic engagement, encouraging Americans to participate in their democracy actively. He expresses optimism about the country's future and looks forward to continuing his work as a citizen. The audio concludes with a heartfelt thank you and a blessing for the United States."], | ||||
|                 ("xpu", None): ["What can you tell me about this audio?This audio is a farewell address by President Barack Obama, delivered in Chicago. In the speech, he reflects on his eight years in office, highlighting the resilience, hope, and unity of the American people. He emphasizes the importance of self-government and active citizenship, encouraging listeners to engage in their communities and participate in democracy. The president expresses his optimism about the country's future and his commitment to continuing to serve as a citizen. He concludes the speech with a heartfelt thank you and a blessing for the United States."], | ||||
|             } | ||||
|         ) | ||||
|         # fmt: on | ||||
|         EXPECTED_OUTPUT = EXPECTED_OUTPUTS.get_expectation() | ||||
|         EXPECTED_OUTPUT = [ | ||||
|             "What can you tell me about this audio?This audio is a farewell address by President Barack Obama, delivered in Chicago. In the speech, he reflects on his eight years in office, highlighting the resilience, hope, and unity of the American people. He acknowledges the diverse perspectives and conversations he had with the public, which kept him honest and inspired. The president also emphasizes the importance of self-government and civic engagement, encouraging Americans to participate in their democracy actively. He expresses optimism about the country's future and looks forward to continuing his work as a citizen. The audio concludes with a heartfelt thank you and a blessing for the United States." | ||||
|         ] | ||||
|         self.assertEqual(decoded_outputs, EXPECTED_OUTPUT) | ||||
|  | ||||
|     @slow | ||||
|  | ||||
| @ -35,7 +35,6 @@ from transformers.pipelines import AutomaticSpeechRecognitionPipeline, pipeline | ||||
| from transformers.pipelines.audio_utils import chunk_bytes_iter, ffmpeg_microphone_live | ||||
| from transformers.pipelines.automatic_speech_recognition import chunk_iter | ||||
| from transformers.testing_utils import ( | ||||
|     Expectations, | ||||
|     compare_pipeline_output_to_hub_spec, | ||||
|     is_pipeline_test, | ||||
|     is_torch_available, | ||||
| @ -1444,14 +1443,8 @@ class AutomaticSpeechRecognitionPipelineTests(unittest.TestCase): | ||||
|     @slow | ||||
|     def test_whisper_longform(self): | ||||
|         # fmt: off | ||||
|         EXPECTED_RESULTS = Expectations( | ||||
|             { | ||||
|                 (None, None): " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on Saturday, Rusty Cargo, container down by the Wharf, and challenge toothless drifters to the godless bughouse lets of tournament that is my segment. MUSIC Meanwhile!", | ||||
|                 ("xpu", None): " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting of classics, Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a Fisher shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I... APPLAUSE Sometimes I... Startle away, upside down on the monkey bars of a condemned playground on a superfund site. Get all heaped up on goofballs, rummaged that would discard a tag bag of defective toys, yank out a fist bowl of disembodied doll limbs, toss them on a stain kid's place mat from a defunct denys, set up a table inside a rusty cargo container down by the Wharf and challenge toothless drifters to the godless bug house blitz of tournament that is my segment.", | ||||
|             } | ||||
|         ) | ||||
|         EXPECTED_RESULT = " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on Saturday, Rusty Cargo, container down by the Wharf, and challenge toothless drifters to the godless bughouse lets of tournament that is my segment. MUSIC Meanwhile!" | ||||
|         # fmt: on | ||||
|         EXPECTED_RESULT = EXPECTED_RESULTS.get_expectation() | ||||
|  | ||||
|         processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") | ||||
|         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") | ||||
|  | ||||
| @ -12,68 +12,26 @@ | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
|  | ||||
| # Run all tests: RUN_SLOW=1 pytest -v tests/tensor_parallel/test_tensor_parallel.py | ||||
| # Run specific config: RUN_SLOW=1 pytest -v tests/tensor_parallel/test_tensor_parallel.py -k "2Proc" | ||||
| # Run multiple configs: RUN_SLOW=1 pytest -v tests/tensor_parallel/test_tensor_parallel.py -k "2Proc or 4Proc" | ||||
| # Run specific test: RUN_SLOW=1 pytest -v tests/tensor_parallel/test_tensor_parallel.py::TestTensorParallel2Proc::test_model_forward | ||||
| # Run the test: CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/tensor_parallel/test_tensor_parallel.py | ||||
|  | ||||
| import os | ||||
| import tempfile | ||||
| import warnings | ||||
| import textwrap | ||||
|  | ||||
| from safetensors import safe_open | ||||
|  | ||||
| from transformers import AutoModelForCausalLM, AutoTokenizer, is_torch_available | ||||
| from transformers import is_torch_available | ||||
| from transformers.integrations.tensor_parallel import get_packed_weights, repack_weights | ||||
| from transformers.testing_utils import ( | ||||
|     TestCasePlus, | ||||
|     backend_device_count, | ||||
|     get_torch_dist_unique_port, | ||||
|     require_huggingface_hub_greater_or_equal, | ||||
|     require_torch_multi_accelerator, | ||||
|     torch_device, | ||||
|     torchrun, | ||||
| ) | ||||
|  | ||||
|  | ||||
| if is_torch_available(): | ||||
|     import torch | ||||
|     import torch.multiprocessing as mp | ||||
|  | ||||
|  | ||||
| def global_wrapper(rank, func, tp, port, func_args, func_kwargs): | ||||
|     def setup_dist_env(rank, world_size, port): | ||||
|         os.environ["WORLD_SIZE"] = str(world_size) | ||||
|         os.environ["RANK"] = str(rank) | ||||
|         os.environ["LOCAL_RANK"] = str(rank) | ||||
|         os.environ["MASTER_ADDR"] = "localhost" | ||||
|         os.environ["MASTER_PORT"] = str(port) | ||||
|  | ||||
|     world_size = tp | ||||
|     setup_dist_env(rank, world_size, port) | ||||
|  | ||||
|     if torch.cuda.is_available(): | ||||
|         torch.cuda.set_device(rank) | ||||
|         torch.distributed.init_process_group(backend="nccl", rank=rank, world_size=world_size) | ||||
|     else: | ||||
|         torch.distributed.init_process_group(backend="gloo", rank=rank, world_size=world_size) | ||||
|  | ||||
|     func(rank, *func_args, **func_kwargs) | ||||
|  | ||||
|     torch.distributed.barrier() | ||||
|     torch.distributed.destroy_process_group() | ||||
|  | ||||
|  | ||||
| def init_distributed(tp: int): | ||||
|     def _init_distributed(func): | ||||
|         def wrapper(*args, **kwargs): | ||||
|             world_size = tp | ||||
|             port = get_torch_dist_unique_port() | ||||
|             spawn_args = (func, tp, port, args, kwargs) | ||||
|             mp.spawn(global_wrapper, args=spawn_args, nprocs=world_size) | ||||
|  | ||||
|         return wrapper | ||||
|  | ||||
|     return _init_distributed | ||||
|  | ||||
|  | ||||
| class TestTensorParallelUtils(TestCasePlus): | ||||
| @ -105,9 +63,191 @@ class TestTensorParallelUtils(TestCasePlus): | ||||
|         assert torch.allclose(unpacked_weights, original_packed_weights) | ||||
|  | ||||
|  | ||||
| class TestTensorParallel(TestCasePlus): | ||||
|     nproc_per_node = 2 | ||||
|  | ||||
|     def test_model_forward(self): | ||||
|         script_to_run = textwrap.dedent( | ||||
|             """ | ||||
|             import torch | ||||
|             import os | ||||
|             from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
|             model_id = "JackFram/llama-68m" | ||||
|  | ||||
|             model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", tp_plan="auto") | ||||
|             torch.distributed.barrier() | ||||
|  | ||||
|             has_dtensor = 0 | ||||
|             for name, parameter in model.named_parameters(): | ||||
|                 if isinstance(parameter.data, torch.distributed.tensor.DTensor): | ||||
|                     has_dtensor = 1 | ||||
|                     break | ||||
|  | ||||
|             assert has_dtensor == 1, "TP model must have DTensor" | ||||
|  | ||||
|             tokenizer = AutoTokenizer.from_pretrained(model_id, legacy=False) | ||||
|             prompt = "Can I help" | ||||
|  | ||||
|             inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) | ||||
|             outputs = model(inputs) | ||||
|  | ||||
|             next_token_logits = outputs[0][:, -1, :] | ||||
|             next_token = torch.argmax(next_token_logits, dim=-1) | ||||
|             response = tokenizer.decode(next_token) | ||||
|             assert response == "with" | ||||
|  | ||||
|             torch.distributed.barrier() | ||||
|             torch.distributed.destroy_process_group() | ||||
|             """ | ||||
|         ) | ||||
|         torchrun(script_to_run, self.nproc_per_node, env=self.get_env()) | ||||
|  | ||||
|     def test_model_backward_pass(self): | ||||
|         script_to_run = textwrap.dedent( | ||||
|             """ | ||||
|             import torch | ||||
|             import os | ||||
|             from transformers import AutoModelForCausalLM | ||||
|             from torch import nn | ||||
|  | ||||
|             model_id = "JackFram/llama-68m" | ||||
|  | ||||
|             model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float32, tp_plan="auto") | ||||
|             torch.distributed.barrier() | ||||
|  | ||||
|             # Dummy forward and backward pass | ||||
|             # Note that loss.backward() will fail if there is a bug in the TP implementation | ||||
|             inputs = torch.randint(0, model.config.vocab_size, (2, 10), device=model.device) | ||||
|             labels = torch.randint(0, model.config.vocab_size, (2, 10), device=model.device) | ||||
|             loss = model(inputs, labels=labels).loss | ||||
|             loss.backward() | ||||
|  | ||||
|             torch.distributed.barrier() | ||||
|             torch.distributed.destroy_process_group() | ||||
|             """ | ||||
|         ) | ||||
|         torchrun(script_to_run, self.nproc_per_node, env=self.get_env()) | ||||
|  | ||||
|     def test_model_generate(self): | ||||
|         script_to_run = textwrap.dedent( | ||||
|             """ | ||||
|             import torch | ||||
|             import os | ||||
|             from transformers import AutoModelForCausalLM, AutoTokenizer | ||||
|  | ||||
|             model_id = "JackFram/llama-68m" | ||||
|  | ||||
|             model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", tp_plan="auto") | ||||
|             torch.distributed.barrier() | ||||
|  | ||||
|             model.forward = torch.compile(model.forward) | ||||
|  | ||||
|             has_dtensor = 0 | ||||
|             for name, parameter in model.named_parameters(): | ||||
|                 if isinstance(parameter.data, torch.distributed.tensor.DTensor): | ||||
|                     has_dtensor = 1 | ||||
|                     break | ||||
|  | ||||
|             assert has_dtensor == 1, "TP model must have DTensor" | ||||
|  | ||||
|             tokenizer = AutoTokenizer.from_pretrained(model_id) | ||||
|             prompt = "Can I help" | ||||
|  | ||||
|             inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) | ||||
|             outputs = model.generate(inputs, max_new_tokens=10, cache_implementation="static") | ||||
|  | ||||
|             output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) | ||||
|             assert output_text[0].startswith(prompt), f"Expected output to start with '{prompt}', got '{output_text[0]}'" | ||||
|  | ||||
|             torch.distributed.barrier() | ||||
|             torch.distributed.destroy_process_group() | ||||
|             """ | ||||
|         ) | ||||
|         torchrun(script_to_run, self.nproc_per_node, env=self.get_env()) | ||||
|  | ||||
|     @require_huggingface_hub_greater_or_equal("0.31.4") | ||||
|     def test_model_save(self): | ||||
|         from safetensors import safe_open | ||||
|  | ||||
|         with tempfile.TemporaryDirectory() as tmp_dir: | ||||
|             for is_torchrun in [True, False]: | ||||
|                 script_to_run = textwrap.dedent( | ||||
|                     f""" | ||||
|                     import torch | ||||
|                     import os | ||||
|                     from transformers import AutoModelForCausalLM | ||||
|  | ||||
|                     model_id = "JackFram/llama-68m" | ||||
|                     kwargs = dict() | ||||
|  | ||||
|                     if os.environ.get("RANK", None) is not None: | ||||
|                         kwargs["tp_plan"] = "auto" | ||||
|                         result_dir = "{tmp_dir}/tp" | ||||
|                     else: | ||||
|                         result_dir = "{tmp_dir}/nontp" | ||||
|  | ||||
|                     model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) | ||||
|                     model.save_pretrained(result_dir) | ||||
|                     """ | ||||
|                 ) | ||||
|                 torchrun(script_to_run, self.nproc_per_node, is_torchrun=is_torchrun, env=self.get_env()) | ||||
|  | ||||
|             non_tp_model_path = os.path.join(tmp_dir, "nontp") | ||||
|             tp_model_path = os.path.join(tmp_dir, "tp") | ||||
|  | ||||
|             for filename in os.listdir(non_tp_model_path): | ||||
|                 if not filename.endswith(".safetensors"): | ||||
|                     continue | ||||
|  | ||||
|                 non_tp_model = safe_open(os.path.join(non_tp_model_path, filename), device="cpu", framework="pt") | ||||
|                 tp_model = safe_open(os.path.join(tp_model_path, filename), device="cpu", framework="pt") | ||||
|                 for non_tp_key in non_tp_model.keys(): | ||||
|                     non_tp_tensor = non_tp_model.get_tensor(non_tp_key) | ||||
|                     tp_tensor = tp_model.get_tensor(non_tp_key) | ||||
|                     assert torch.allclose(non_tp_tensor, tp_tensor), f"Tensor with key: {non_tp_key} does not match" | ||||
|                     del non_tp_tensor, tp_tensor | ||||
|  | ||||
|     def test_custom_tp_plan(self): | ||||
|         script_to_run = textwrap.dedent( | ||||
|             r""" | ||||
|             import re | ||||
|             import torch | ||||
|             from torch.distributed.tensor import DTensor | ||||
|             from transformers import AutoModelForCausalLM | ||||
|  | ||||
|             model_id = "JackFram/llama-68m" | ||||
|             # only shard attentions, but not mlps | ||||
|             tp_plan = { | ||||
|                 "model.layers.*.self_attn.q_proj": "colwise", | ||||
|                 "model.layers.*.self_attn.k_proj": "colwise", | ||||
|                 "model.layers.*.self_attn.v_proj": "colwise", | ||||
|                 "model.layers.*.self_attn.o_proj": "rowwise", | ||||
|             } | ||||
|  | ||||
|             # Use custom tp_plan directly in from_pretrained | ||||
|             model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, tp_plan=tp_plan) | ||||
|  | ||||
|             # Check we can generate with the tp_plan | ||||
|             inputs = torch.randint(100, 200, (1, 10), device=model.device) | ||||
|             out = model.generate(inputs, max_new_tokens=10, do_sample=False) | ||||
|  | ||||
|             # Check only the attentions are sharded | ||||
|             for name, param in model.named_parameters(): | ||||
|                 if re.search(r"\.self_attn\.(q|k|v|o)_proj\.", name): | ||||
|                     assert isinstance(param, DTensor) | ||||
|                 else: | ||||
|                     assert not isinstance(param, DTensor) | ||||
|             """ | ||||
|         ) | ||||
|         torchrun(script_to_run, self.nproc_per_node, env=self.get_env()) | ||||
|  | ||||
|  | ||||
| class TestTensorParallelProperties(TestCasePlus): | ||||
|     def test_tp_plan_property_setter_getter(self): | ||||
|         """Test that tp_plan property can be set and retrieved correctly.""" | ||||
|         from transformers import AutoModelForCausalLM | ||||
|  | ||||
|         model_id = "JackFram/llama-68m" | ||||
|         model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") | ||||
|  | ||||
| @ -135,6 +275,8 @@ class TestTensorParallelProperties(TestCasePlus): | ||||
|  | ||||
|     def test_tp_plan_validation_invalid_style(self): | ||||
|         """Test that invalid parallel styles are rejected.""" | ||||
|         from transformers import AutoModelForCausalLM | ||||
|  | ||||
|         model_id = "JackFram/llama-68m" | ||||
|         model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") | ||||
|  | ||||
| @ -147,6 +289,9 @@ class TestTensorParallelProperties(TestCasePlus): | ||||
|  | ||||
|     def test_tp_plan_validation_nonexistent_layer_warning(self): | ||||
|         """Test that warnings are issued for non-existent layer patterns.""" | ||||
|         import warnings | ||||
|  | ||||
|         from transformers import AutoModelForCausalLM | ||||
|  | ||||
|         model_id = "JackFram/llama-68m" | ||||
|         model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") | ||||
| @ -163,6 +308,10 @@ class TestTensorParallelProperties(TestCasePlus): | ||||
|  | ||||
|     def test_tp_plan_valid_layer_patterns(self): | ||||
|         """Test that valid layer patterns are accepted without warnings.""" | ||||
|         import warnings | ||||
|  | ||||
|         from transformers import AutoModelForCausalLM | ||||
|  | ||||
|         model_id = "JackFram/llama-68m" | ||||
|         model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") | ||||
|  | ||||
| @ -198,6 +347,8 @@ class TestTensorParallelProperties(TestCasePlus): | ||||
|  | ||||
|     def test_tp_plan_none_handling(self): | ||||
|         """Test that None values are handled correctly.""" | ||||
|         from transformers import AutoModelForCausalLM | ||||
|  | ||||
|         model_id = "JackFram/llama-68m" | ||||
|         model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto") | ||||
|  | ||||
| @ -210,172 +361,6 @@ class TestTensorParallelProperties(TestCasePlus): | ||||
|         self.assertEqual(model.tp_plan, {"model.layers.*.self_attn.q_proj": "colwise"}) | ||||
|  | ||||
|  | ||||
| # ====== TEST FUNCTIONS ====== | ||||
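| # The _test_*_impl functions below run on every rank; the test methods launch them through init_distributed(tp=...) | ||||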
| def _test_model_forward_impl(rank): | ||||
|     """Implementation of test_model_forward for distributed execution.""" | ||||
|     model_id = "JackFram/llama-68m" | ||||
|  | ||||
|     # Sanity check that the distributed environment variables are set (raises KeyError otherwise) | ||||
|     int(os.environ["RANK"]) | ||||
|     int(os.environ["WORLD_SIZE"]) | ||||
|     model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", tp_plan="auto") | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|     has_dtensor = 0 | ||||
|     for name, parameter in model.named_parameters(): | ||||
|         if isinstance(parameter.data, torch.distributed.tensor.DTensor): | ||||
|             has_dtensor = 1 | ||||
|             break | ||||
|  | ||||
|     assert has_dtensor == 1, "TP model must have DTensor" | ||||
|  | ||||
|     tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False) | ||||
|     prompt = "Can I help" | ||||
|  | ||||
|     inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) | ||||
|     outputs = model(inputs) | ||||
|  | ||||
|     next_token_logits = outputs[0][:, -1, :] | ||||
|     next_token = torch.argmax(next_token_logits, dim=-1) | ||||
|     response = tokenizer.decode(next_token) | ||||
|     assert response == "with" | ||||
|     print("response:", response) | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|  | ||||
| def _test_model_backward_pass_impl(rank): | ||||
|     """Implementation of test_model_backward_pass for distributed execution.""" | ||||
|     model_id = "JackFram/llama-68m" | ||||
|  | ||||
|     model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float32, tp_plan="auto") | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|     # Dummy forward and backward pass | ||||
|     # Note that loss.backward() will fail if there is a bug in the TP implementation | ||||
|     inputs = torch.randint(0, model.config.vocab_size, (2, 10), device=model.device) | ||||
|     labels = torch.randint(0, model.config.vocab_size, (2, 10), device=model.device) | ||||
|     loss = model(inputs, labels=labels).loss | ||||
|     loss.backward() | ||||
|  | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|  | ||||
| def _test_model_generate_impl(rank): | ||||
|     """Implementation of test_model_generate for distributed execution.""" | ||||
|     model_id = "JackFram/llama-68m" | ||||
|  | ||||
|     # Sanity check that the distributed environment variables are set (raises KeyError otherwise) | ||||
|     int(os.environ["RANK"]) | ||||
|     int(os.environ["WORLD_SIZE"]) | ||||
|  | ||||
|     model = AutoModelForCausalLM.from_pretrained(model_id, dtype="auto", tp_plan="auto") | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|     model.forward = torch.compile(model.forward) | ||||
|  | ||||
|     has_dtensor = 0 | ||||
|     for name, parameter in model.named_parameters(): | ||||
|         if isinstance(parameter.data, torch.distributed.tensor.DTensor): | ||||
|             has_dtensor = 1 | ||||
|             break | ||||
|  | ||||
|     assert has_dtensor == 1, "TP model must have DTensor" | ||||
|  | ||||
|     tokenizer = AutoTokenizer.from_pretrained(model_id) | ||||
|     prompt = "Can I help" | ||||
|  | ||||
|     inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) | ||||
|     outputs = model.generate(inputs, max_new_tokens=10, cache_implementation="static") | ||||
|  | ||||
|     output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) | ||||
|     assert output_text[0].startswith(prompt), f"Expected output to start with '{prompt}', got '{output_text[0]}'" | ||||
|  | ||||
|     torch.distributed.barrier() | ||||
|  | ||||
|  | ||||
| def _test_model_save_impl(rank, tmp_dir, is_torchrun): | ||||
|     """Implementation of test_model_save for distributed execution.""" | ||||
|     model_id = "JackFram/llama-68m" | ||||
|     kwargs = {} | ||||
|  | ||||
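|     # Under torchrun RANK is set, so load with tensor parallelism; otherwise load and save the plain model | ||||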
|     if os.environ.get("RANK", None) is not None: | ||||
|         kwargs["tp_plan"] = "auto" | ||||
|         result_dir = f"{tmp_dir}/tp" | ||||
|     else: | ||||
|         result_dir = f"{tmp_dir}/nontp" | ||||
|  | ||||
|     model = AutoModelForCausalLM.from_pretrained(model_id, **kwargs) | ||||
|     model.save_pretrained(result_dir) | ||||
|  | ||||
|  | ||||
| class TestTensorParallelBase(TestCasePlus): | ||||
|     """Base class for tensor parallel tests. Subclasses must set nproc_per_node.""" | ||||
|  | ||||
|     nproc_per_node = None | ||||
|  | ||||
|     @require_torch_multi_accelerator | ||||
|     def test_model_forward(self): | ||||
|         if self.nproc_per_node is None: | ||||
|             self.skipTest("nproc_per_node not set") | ||||
|         if backend_device_count(torch_device) < self.nproc_per_node: | ||||
|             self.skipTest(f"Need at least {self.nproc_per_node} devices, have {backend_device_count(torch_device)}") | ||||
|  | ||||
|         init_distributed(tp=self.nproc_per_node)(_test_model_forward_impl)() | ||||
|  | ||||
|     @require_torch_multi_accelerator | ||||
|     def test_model_backward_pass(self): | ||||
|         if self.nproc_per_node is None: | ||||
|             self.skipTest("nproc_per_node not set") | ||||
|         if backend_device_count(torch_device) < self.nproc_per_node: | ||||
|             self.skipTest(f"Need at least {self.nproc_per_node} devices, have {backend_device_count(torch_device)}") | ||||
|  | ||||
|         init_distributed(tp=self.nproc_per_node)(_test_model_backward_pass_impl)() | ||||
|  | ||||
|     @require_torch_multi_accelerator | ||||
|     def test_model_generate(self): | ||||
|         if self.nproc_per_node is None: | ||||
|             self.skipTest("nproc_per_node not set") | ||||
|         if backend_device_count(torch_device) < self.nproc_per_node: | ||||
|             self.skipTest(f"Need at least {self.nproc_per_node} devices, have {backend_device_count(torch_device)}") | ||||
|  | ||||
|         init_distributed(tp=self.nproc_per_node)(_test_model_generate_impl)() | ||||
|  | ||||
|     @require_huggingface_hub_greater_or_equal("0.31.4") | ||||
|     @require_torch_multi_accelerator | ||||
|     def test_model_save(self): | ||||
|         if self.nproc_per_node is None: | ||||
|             self.skipTest("nproc_per_node not set") | ||||
|         if backend_device_count(torch_device) < self.nproc_per_node: | ||||
|             self.skipTest(f"Need at least {self.nproc_per_node} devices, have {backend_device_count(torch_device)}") | ||||
|  | ||||
|         with tempfile.TemporaryDirectory() as tmp_dir: | ||||
|             # First run with TP (distributed) | ||||
|             init_distributed(tp=self.nproc_per_node)(_test_model_save_impl)(tmp_dir, True) | ||||
|  | ||||
|             # Then run without TP (non-distributed) | ||||
|             _test_model_save_impl(0, tmp_dir, False) | ||||
|  | ||||
|             non_tp_model_path = os.path.join(tmp_dir, "nontp") | ||||
|             tp_model_path = os.path.join(tmp_dir, "tp") | ||||
|  | ||||
|             for filename in os.listdir(non_tp_model_path): | ||||
|                 if not filename.endswith(".safetensors"): | ||||
|                     continue | ||||
|  | ||||
|                 non_tp_model = safe_open(os.path.join(non_tp_model_path, filename), device="cpu", framework="pt") | ||||
|                 tp_model = safe_open(os.path.join(tp_model_path, filename), device="cpu", framework="pt") | ||||
|                 for non_tp_key in non_tp_model.keys(): | ||||
|                     non_tp_tensor = non_tp_model.get_tensor(non_tp_key) | ||||
|                     tp_tensor = tp_model.get_tensor(non_tp_key) | ||||
|                     assert torch.allclose(non_tp_tensor, tp_tensor), f"Tensor with key: {non_tp_key} does not match" | ||||
|                     del non_tp_tensor, tp_tensor | ||||
|  | ||||
|  | ||||
| class TestTensorParallel2Proc(TestTensorParallelBase): | ||||
|     """Test tensor parallel with 2 processes.""" | ||||
|  | ||||
|     nproc_per_node = 2 | ||||
|  | ||||
|  | ||||
| class TestTensorParallel4Proc(TestTensorParallelBase): | ||||
|     """Test tensor parallel with 4 processes.""" | ||||
|  | ||||
|     nproc_per_node = 4 | ||||
| @require_torch_multi_accelerator | ||||
| class TestTensorParallelAccelerator(TestTensorParallel): | ||||
|     nproc_per_node = backend_device_count(torch_device) | ||||
|  | ||||
| @ -37,9 +37,8 @@ from transformers.testing_utils import ( | ||||
| from transformers.utils import is_torch_available, is_vision_available | ||||
|  | ||||
|  | ||||
| parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) | ||||
| sys.path.append(os.path.join(parent_dir, "utils")) | ||||
| from fetch_hub_objects_for_ci import url_to_local_path  # noqa: E402 | ||||
| sys.path.append(".") | ||||
| from utils.fetch_hub_objects_for_ci import url_to_local_path | ||||
|  | ||||
|  | ||||
| global_rng = random.Random() | ||||
|  | ||||
| @ -799,9 +799,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|  | ||||
|         # Test 2: | ||||
|         # with tokenize | ||||
|         self.assertEqual( | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True).input_ids, expected_tokenized.tokens | ||||
|         ) | ||||
|         self.assertEqual(self.tokenizer.apply_chat_template(conversation, tokenize=True), expected_tokenized.tokens) | ||||
|  | ||||
|         with self.assertRaises( | ||||
|             ValueError, msg="Kwargs [unk_args] are not supported by `MistralCommonTokenizer.apply_chat_template`." | ||||
| @ -826,7 +824,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             expected_tokenized.text, | ||||
|         ) | ||||
|         self.assertEqual( | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, continue_final_message=True).input_ids, | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, continue_final_message=True), | ||||
|             expected_tokenized.tokens, | ||||
|         ) | ||||
|  | ||||
| @ -848,7 +846,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 conversation, tokenize=True, add_generation_prompt=add_generation_prompt | ||||
|             ) | ||||
|             self.assertEqual(token_outputs.input_ids, expected_tokenized.tokens) | ||||
|             self.assertEqual(token_outputs, expected_tokenized.tokens) | ||||
|  | ||||
|         # Test 2: | ||||
|         # with continue_final_message | ||||
| @ -960,16 +958,18 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|                 }, | ||||
|             ] | ||||
|  | ||||
|             output = self.tokenizer.apply_chat_template(conversation).input_ids | ||||
|             output = self.tokenizer.apply_chat_template(conversation, tokenize=True) | ||||
|             self.assertEqual(output, expected_tokenized.tokens) | ||||
|  | ||||
|         output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True) | ||||
|         output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_dict=True) | ||||
|         self.assertEqual(output_dict["input_ids"], expected_tokenized.tokens) | ||||
|         self.assertEqual(len(output_dict["pixel_values"]), len(expected_tokenized.images)) | ||||
|         for o, e in zip(output_dict["pixel_values"], expected_tokenized.images): | ||||
|             self.assertTrue(np.allclose(o, e)) | ||||
|  | ||||
|         output_dict = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="pt") | ||||
|         output_dict = self.tokenizer.apply_chat_template( | ||||
|             conversation, tokenize=True, return_dict=True, return_tensors="pt" | ||||
|         ) | ||||
|         self.assertEqual(output_dict["input_ids"].tolist()[0], expected_tokenized.tokens) | ||||
|         expected_images_pt_tensor = torch.from_numpy(np.stack(expected_tokenized.images)) | ||||
|         self.assertTrue(torch.allclose(output_dict["pixel_values"], expected_images_pt_tensor)) | ||||
| @ -1013,7 +1013,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|                 }, | ||||
|             ] | ||||
|  | ||||
|             output = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True).input_ids | ||||
|             output = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True) | ||||
|             self.assertEqual(output, expected_tokenized.tokens) | ||||
|  | ||||
|         output_dict = self.tokenizer_audio.apply_chat_template(conversation, tokenize=True, return_dict=True) | ||||
| @ -1041,14 +1041,14 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         # Test 1: | ||||
|         # with truncation | ||||
|         self.assertEqual( | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=True, max_length=20).input_ids, | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=True, max_length=20), | ||||
|             expected_tokenized.tokens[:20], | ||||
|         ) | ||||
|  | ||||
|         # Test 2: | ||||
|         # without truncation | ||||
|         self.assertEqual( | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=False, max_length=20).input_ids, | ||||
|             self.tokenizer.apply_chat_template(conversation, tokenize=True, truncation=False, max_length=20), | ||||
|             expected_tokenized.tokens, | ||||
|         ) | ||||
|  | ||||
| @ -1130,7 +1130,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         ] | ||||
|  | ||||
|         text_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=False) | ||||
|         token_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True).input_ids | ||||
|         token_outputs = self.tokenizer.apply_chat_template(conversations, tools=tools, tokenize=True) | ||||
|  | ||||
|         self.assertEqual(len(text_outputs), len(token_outputs)) | ||||
|         self.assertEqual(len(text_outputs), len(expected_tokenized)) | ||||
| @ -1202,7 +1202,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             ChatCompletionRequest.from_openai(ref_conversation) | ||||
|         ) | ||||
|  | ||||
|         output = self.tokenizer.apply_chat_template(conversations, tokenize=True).input_ids | ||||
|         output = self.tokenizer.apply_chat_template(conversations, tokenize=True) | ||||
|         self.assertEqual(output, [expected_tokenized.tokens] * 3) | ||||
|  | ||||
|         output = self.tokenizer.apply_chat_template(conversations, tokenize=True, return_dict=True) | ||||
| @ -1248,9 +1248,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             for conversation in conversations | ||||
|         ] | ||||
|  | ||||
|         token_outputs = self.tokenizer.apply_chat_template( | ||||
|             conversations, tokenize=True, continue_final_message=True | ||||
|         ).input_ids | ||||
|         token_outputs = self.tokenizer.apply_chat_template(conversations, tokenize=True, continue_final_message=True) | ||||
|  | ||||
|         for output, expected in zip(token_outputs, expected_tokenized): | ||||
|             self.assertEqual(output, expected.tokens) | ||||
| @ -1299,7 +1297,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             ] | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 conversations, tokenize=True, add_generation_prompt=add_generation_prompt | ||||
|             ).input_ids | ||||
|             ) | ||||
|             for output, expected in zip(token_outputs, expected_tokenized): | ||||
|                 self.assertEqual(output, expected.tokens) | ||||
|  | ||||
| @ -1333,7 +1331,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         # with truncation | ||||
|         token_outputs = self.tokenizer.apply_chat_template( | ||||
|             self.fixture_conversations, tokenize=True, truncation=True, max_length=20 | ||||
|         ).input_ids | ||||
|         ) | ||||
|  | ||||
|         for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
|             self.assertEqual(output, expected.tokens[:20]) | ||||
| @ -1342,7 +1340,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         # without truncation | ||||
|         token_outputs = self.tokenizer.apply_chat_template( | ||||
|             self.fixture_conversations, tokenize=True, truncation=False, max_length=20 | ||||
|         ).input_ids | ||||
|         ) | ||||
|         self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) | ||||
|         for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
|             self.assertEqual(output, expected.tokens) | ||||
| @ -1360,9 +1358,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]: | ||||
|             if padding == PaddingStrategy.MAX_LENGTH: | ||||
|                 # No padding if no max length is provided | ||||
|                 token_outputs = self.tokenizer.apply_chat_template( | ||||
|                     self.fixture_conversations, padding=padding, return_dict=False | ||||
|                 ) | ||||
|                 token_outputs = self.tokenizer.apply_chat_template(self.fixture_conversations, padding=padding) | ||||
|                 self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) | ||||
|                 for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
|                     self.assertEqual(output, expected.tokens) | ||||
| @ -1370,7 +1366,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|             max_length = 20 if padding == PaddingStrategy.MAX_LENGTH else None | ||||
|  | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 self.fixture_conversations, tokenize=True, padding=padding, max_length=max_length, return_dict=False | ||||
|                 self.fixture_conversations, tokenize=True, padding=padding, max_length=max_length | ||||
|             ) | ||||
|  | ||||
|             if padding != PaddingStrategy.MAX_LENGTH: | ||||
| @ -1394,7 +1390,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|  | ||||
|         for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]: | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 self.fixture_conversations, tokenize=True, padding=padding, return_dict=False | ||||
|                 self.fixture_conversations, tokenize=True, padding=padding | ||||
|             ) | ||||
|             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) | ||||
|             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
| @ -1406,12 +1402,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         max_length = 20 | ||||
|         for padding in [True, "max_length", PaddingStrategy.LONGEST, PaddingStrategy.MAX_LENGTH]: | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 self.fixture_conversations, | ||||
|                 tokenize=True, | ||||
|                 truncation=True, | ||||
|                 padding=padding, | ||||
|                 max_length=max_length, | ||||
|                 return_dict=False, | ||||
|                 self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length | ||||
|             ) | ||||
|             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) | ||||
|             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
| @ -1420,12 +1411,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|                 ) | ||||
|         for padding in [False, "do_not_pad", PaddingStrategy.DO_NOT_PAD]: | ||||
|             token_outputs = self.tokenizer.apply_chat_template( | ||||
|                 self.fixture_conversations, | ||||
|                 tokenize=True, | ||||
|                 truncation=True, | ||||
|                 padding=padding, | ||||
|                 max_length=max_length, | ||||
|                 return_dict=False, | ||||
|                 self.fixture_conversations, tokenize=True, truncation=True, padding=padding, max_length=max_length | ||||
|             ) | ||||
|             self.assertEqual(len(token_outputs), len(self.tokenized_fixture_conversations)) | ||||
|             for output, expected in zip(token_outputs, self.tokenized_fixture_conversations): | ||||
| @ -1435,7 +1421,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         # Test 1: | ||||
|         # with tokenize | ||||
|         token_outputs = self.tokenizer.apply_chat_template( | ||||
|             self.fixture_conversations, tokenize=True, return_tensors="pt", padding=True, return_dict=False | ||||
|             self.fixture_conversations, tokenize=True, return_tensors="pt", padding=True | ||||
|         ) | ||||
|         self.assertIsInstance(token_outputs, torch.Tensor) | ||||
|         self.assertEqual( | ||||
| @ -1446,7 +1432,7 @@ class TestMistralCommonTokenizer(unittest.TestCase): | ||||
|         # Test 2: | ||||
|         # without tokenize, should ignore return_tensors | ||||
|         token_outputs = self.tokenizer.apply_chat_template( | ||||
|             self.fixture_conversations, tokenize=False, return_tensors="pt", padding=True, return_dict=False | ||||
|             self.fixture_conversations, tokenize=False, return_tensors="pt", padding=True | ||||
|         ) | ||||
|         self.assertEqual(token_outputs, [t.text for t in self.tokenized_fixture_conversations]) | ||||
|  | ||||
|  | ||||
| @ -323,7 +323,7 @@ class TokenizerUtilsTest(unittest.TestCase): | ||||
|         ] | ||||
|  | ||||
|         # First, test the default case, where we encode the whole conversation at once | ||||
|         whole_conversation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True, return_dict=False) | ||||
|         whole_conversation_tokens = tokenizer.apply_chat_template(conversation, tokenize=True) | ||||
|  | ||||
|         # Now, test the message-by-message encoding | ||||
|         tokens = [] | ||||
|  | ||||
| @ -200,40 +200,6 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|         tokenizer_parsed_chat = tokenizer.parse_response(model_out) | ||||
|         self.assertEqual(tokenizer_parsed_chat, parsed_chat) | ||||
|  | ||||
|     def test_batched_inputs(self): | ||||
|         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") | ||||
|         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n    {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>' | ||||
|         tokenizer.response_schema = cohere_schema | ||||
|         parsed_chat = tokenizer.parse_response(model_out) | ||||
|         self.assertEqual(tokenizer.parse_response([model_out]), [parsed_chat]) | ||||
|         self.assertEqual(tokenizer.parse_response([model_out] * 2), [parsed_chat] * 2) | ||||
|  | ||||
|     def test_token_id_inputs(self): | ||||
|         tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode | ||||
|         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n    {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>' | ||||
|         tokenizer.response_schema = cohere_schema | ||||
|         parsed_chat = tokenizer.parse_response(model_out) | ||||
|         tokenized_out = tokenizer(model_out).input_ids | ||||
|         self.assertEqual(tokenizer.parse_response(tokenized_out), parsed_chat) | ||||
|         self.assertEqual(tokenizer.parse_response([tokenized_out]), [parsed_chat]) | ||||
|         self.assertEqual(tokenizer.parse_response([tokenized_out] * 2), [parsed_chat] * 2) | ||||
|  | ||||
|     def test_numpy_inputs(self): | ||||
|         tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode | ||||
|         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n    {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>' | ||||
|         tokenizer.response_schema = cohere_schema | ||||
|         parsed_chat = tokenizer.parse_response(model_out) | ||||
|         tokenized_out = tokenizer(model_out, return_tensors="np").input_ids | ||||
|         self.assertEqual(tokenizer.parse_response(tokenized_out), [parsed_chat]) | ||||
|  | ||||
|     def test_tensor_inputs(self): | ||||
|         tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")  # Need an actual tokenizer to encode | ||||
|         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n    {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>' | ||||
|         tokenizer.response_schema = cohere_schema | ||||
|         parsed_chat = tokenizer.parse_response(model_out) | ||||
|         tokenized_out = tokenizer(model_out, return_tensors="pt").input_ids | ||||
|         self.assertEqual(tokenizer.parse_response(tokenized_out), [parsed_chat]) | ||||
|  | ||||
|     def test_cohere_template(self): | ||||
|         model_out = '<|START_THINKING|>I should call a tool.<|END_THINKING|><|START_ACTION|>[\n    {"tool_call_id": "0", "tool_name": "simple_tool", "parameters": {"temperature_format": "Celsius"}}\n]<|END_ACTION|><|END_OF_TURN_TOKEN|>' | ||||
|         parsed_chat = recursive_parse(model_out, cohere_schema) | ||||
| @ -315,7 +281,6 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|         self.assertEqual( | ||||
|             parsed_chat, | ||||
|             { | ||||
|                 "role": "assistant", | ||||
|                 "thinking": 'Okay, the user said, "Hello! How are you?" I need to respond appropriately. Since this is the first message, I should greet them back and ask how I can assist. I should keep it friendly and open-ended. Let me make sure the response is welcoming and encourages them to share what they need help with. I\'ll avoid any technical jargon and keep it simple. Let me check for any typos and ensure the tone is positive.', | ||||
|                 "tool_calls": [ | ||||
|                     { | ||||
| @ -337,10 +302,9 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|         self.assertEqual( | ||||
|             parsed_chat, | ||||
|             { | ||||
|                 "role": "assistant", | ||||
|                 "tool_calls": [ | ||||
|                     {"type": "function", "function": {"name": "get_weather", "arguments": {"city": "Paris"}}} | ||||
|                 ], | ||||
|                 ] | ||||
|             }, | ||||
|         ) | ||||
|  | ||||
| @ -350,7 +314,6 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|         self.assertEqual( | ||||
|             parsed_chat, | ||||
|             { | ||||
|                 "role": "assistant", | ||||
|                 "content": "Some content about gravity goes here but I'm cutting it off to make this shorter!", | ||||
|                 "thinking": 'Okay, the user asked, "Hey! Can you tell me about gravity?" Let me start by breaking down what they might be looking for. They probably want a basic understanding of gravity, maybe for a school project or just personal curiosity. I should explain what gravity is, how it works, and maybe some examples.', | ||||
|             }, | ||||
| @ -362,7 +325,6 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|         self.assertEqual( | ||||
|             parsed_chat, | ||||
|             { | ||||
|                 "role": "assistant", | ||||
|                 "tool_calls": [ | ||||
|                     { | ||||
|                         "type": "function", | ||||
| @ -374,6 +336,6 @@ class ChatSchemaParserTest(unittest.TestCase): | ||||
|                             }, | ||||
|                         }, | ||||
|                     } | ||||
|                 ], | ||||
|                 ] | ||||
|             }, | ||||
|         ) | ||||
| @ -1407,10 +1407,7 @@ if __name__ == "__main__": | ||||
|     if not os.path.isdir(os.path.join(os.getcwd(), f"ci_results_{job_name}")): | ||||
|         os.makedirs(os.path.join(os.getcwd(), f"ci_results_{job_name}")) | ||||
|  | ||||
|     nvidia_daily_ci_workflow = ( | ||||
|         "huggingface/transformers/.github/workflows/self-scheduled-caller.yml", | ||||
|         "huggingface/transformers/.github/workflows/self-scheduled-flash-attn-caller.yml", | ||||
|     ) | ||||
|     nvidia_daily_ci_workflow = "huggingface/transformers/.github/workflows/self-scheduled-caller.yml" | ||||
|     amd_daily_ci_workflows = ( | ||||
|         "huggingface/transformers/.github/workflows/self-scheduled-amd-mi325-caller.yml", | ||||
|         "huggingface/transformers/.github/workflows/self-scheduled-amd-mi355-caller.yml", | ||||
|  | ||||