push working qwen vl viz

fixup
biig update for ragged inputs why not
2025-11-07 05:54:35 +08:00 · 2025-10-21 18:04:08 +02:00 · 2025-10-21 16:33:01 +02:00 · 2025-10-21 16:17:45 +02:00 · 2025-10-21 14:19:35 +02:00 · 2025-08-11 19:20:28 +02:00
272 changed files with 8780 additions and 2925 deletions
--- a/.github/scripts/codeowners_for_review_action
+++ b/.github/scripts/codeowners_for_review_action
@ -22,6 +22,7 @@ tests/generation/ @gante
 /src/transformers/models/auto/ @ArthurZucker
 /src/transformers/utils/ @ArthurZucker @Rocketknight1
 /src/transformers/loss/ @ArthurZucker
+/src/transformers/onnx/ @michaelbenayoun

 # Specific files come after the sections/globs, so they take priority
 /.circleci/config.yml @ArthurZucker @ydshieh
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -28,7 +28,7 @@ jobs:
      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
      (github.event_name == 'push' && github.ref == 'refs/heads/main')
    container:
-      image: huggingface/transformers-all-latest-gpu
+      image: huggingface/transformers-pytorch-gpu
      options: --gpus all --privileged --ipc host
    steps:
      - name: Get repo
--- a/.github/workflows/benchmark_v2_a10_caller.yml
+++ b/.github/workflows/benchmark_v2_a10_caller.yml
@ -9,7 +9,7 @@ jobs:
    uses: ./.github/workflows/benchmark_v2.yml
    with:
      runner: aws-g5-4xlarge-cache-use1-public-80
-      container_image: huggingface/transformers-all-latest-gpu
+      container_image: huggingface/transformers-pytorch-gpu
      container_options: --gpus all --privileged --ipc host --shm-size "16gb"
      commit_sha: ${{ github.sha }}
      run_id: ${{ github.run_id }}
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -45,52 +45,26 @@ jobs:
            REF=main
          push: true
          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
-
-      - name: Post to Slack
-        if: always()
-        uses: huggingface/hf-workflows/.github/actions/post-slack@main
-        with:
-          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-all-latest-gpu docker build
-          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
-
-  flash-attn-ci-image:
-    name: "PyTorch with Flash Attn [dev]"
-    runs-on:
-      group: aws-general-8-plus
-    steps:
+      # Push CI images still need to be re-built daily
      -
-        name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      -
-        name: Check out code
-        uses: actions/checkout@v4
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-      -
-        name: Build and push
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
        uses: docker/build-push-action@v5
        with:
          context: ./docker/transformers-all-latest-gpu
          build-args: |
            REF=main
-            PYTORCH=2.8.0
-            TORCHCODEC=0.7.0
-            FLASH_ATTN=yes
          push: true
-          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}:flash-attn
+          tags: huggingface/transformers-all-latest-gpu-push-ci

      - name: Post to Slack
        if: always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the transformers-all-latest-gpu docker build
+          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

@ -130,8 +104,51 @@ jobs:
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

+  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
+  latest-torch-deepspeed-docker-for-push-ci-daily-build:
+    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
  doc-builder:
    name: "Doc builder"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
    runs-on:
      group: aws-general-8-plus
    steps:
@ -164,6 +181,44 @@ jobs:
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

+  latest-pytorch:
+    name: "Latest PyTorch [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-gpu
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
  latest-pytorch-amd:
    name: "Latest PyTorch (AMD) [dev]"
    runs-on:
@ -190,47 +245,29 @@ jobs:
            REF=main
          push: true
          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-amd-gpu-push-ci

      - name: Post to Slack
        if: always()
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
-          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu build
+          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

-  cache-latest-pytorch-amd:
-    name: "Cache Latest Pytorch (AMD) Image"
-    needs: latest-pytorch-amd
-    runs-on:
-      group: amd-mi325-1gpu
-    steps:
-      -
-        name: Login to DockerHub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_PASSWORD }}
-        
-      - 
-        name: Pull and save docker image to cache
-        run: |
-          image="huggingface/transformers-pytorch-amd-gpu"
-          final_path="/mnt/image-cache/transformers-pytorch-amd-gpu.tar"
-          tmp_path="${final_path}.tmp"
-
-          echo "Pulling image: ${image}"
-          docker pull "${image}"
-
-          echo "Saving to temp file: ${tmp_path}"
-          docker save "${image}" -o "${tmp_path}"
-
-          echo "Moving to final path: ${final_path}"
-          mv -f "${tmp_path}" "${final_path}"
-
-          echo "Cache populated successfully at ${final_path}"
-
  latest-pytorch-deepspeed-amd:
    name: "PyTorch + DeepSpeed (AMD) [dev]"
    runs-on:
@ -257,6 +294,19 @@ jobs:
            REF=main
          push: true
          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci

      - name: Post to Slack
        if: always()
@ -269,6 +319,8 @@ jobs:

  latest-quantization-torch-docker:
    name: "Latest Pytorch + Quantization [dev]"
+     # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
    runs-on:
      group: aws-general-8-plus
    steps:
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -28,9 +28,6 @@ on:
      report_repo_id:
        required: false
        type: string
-      pytest_marker:
-        required: false
-        type: string

 env:
  HF_HOME: /mnt/cache
@ -140,7 +137,7 @@ jobs:
      - name: Run all tests on GPU
        working-directory: /transformers
        run: |
-          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v -m '${{ inputs.pytest_marker }}' --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
+          script -q -c "PATCH_TESTING_METHODS_TO_COLLECT_OUTPUTS=yes _PATCHED_TESTING_METHODS_OUTPUT_DIR=/transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports tests/${{ matrix.folders }}" test_outputs.txt
          ls -la
          # Extract the exit code from the output file
          EXIT_CODE=$(tail -1 test_outputs.txt | grep -o 'COMMAND_EXIT_CODE="[0-9]*"' | cut -d'"' -f2)
--- a/.github/workflows/push-important-models.yml
+++ b/.github/workflows/push-important-models.yml
@ -149,7 +149,7 @@ jobs:
    with:
      job: run_models_gpu
      slack_report_channel: "#transformers-ci-push"
-      docker: huggingface/transformers-all-latest-gpu:flash-attn
+      docker: huggingface/transformers-all-latest-gpu
      ci_event: push
      report_repo_id: hf-internal-testing/transformers_ci_push
      commit_sha: ${{ github.sha }}
--- a/.github/workflows/self-push-amd-mi210-caller.yml
+++ b/.github/workflows/self-push-amd-mi210-caller.yml
@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi210 CI caller)
+
+on:
+  #workflow_run:
+  #  workflows: ["Self-hosted runner (push-caller)"]
+  #  branches: ["main"]
+  #  types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi210
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi210
+    secrets: inherit
--- a/.github/workflows/self-push-amd-mi250-caller.yml
+++ b/.github/workflows/self-push-amd-mi250-caller.yml
@ -0,0 +1,25 @@
+name: Self-hosted runner (AMD mi250 CI caller)
+
+on:
+  #workflow_run:
+  #  workflows: ["Self-hosted runner (push-caller)"]
+  #  branches: ["main"]
+  #  types: [completed]
+  push:
+    branches:
+      - run_amd_push_ci_caller*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  run_amd_ci:
+    name: AMD mi250
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    uses: ./.github/workflows/self-push-amd.yml
+    with:
+      gpu_flavor: mi250
+    secrets: inherit
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@ -0,0 +1,334 @@
+name: Self-hosted runner AMD GPU (push)
+
+on:
+  workflow_call:
+    inputs:
+      gpu_flavor:
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 60
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+
+jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  setup_gpu:
+    name: Setup
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      test_map: ${{ steps.set-matrix.outputs.test_map }}
+    env:
+      # `CI_BRANCH_PUSH`: The branch name from the push event
+      # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
+      # `CI_SHA_PUSH`: The commit SHA from the push event
+      # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
+        # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Update clone using environment variables
+        working-directory: /transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Cleanup
+        working-directory: /transformers
+        run: |
+          rm -rf tests/__pycache__
+          rm -rf tests/models/__pycache__
+          rm -rf reports
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Fetch the tests to run
+        working-directory: /transformers
+        # TODO: add `git-python` in the docker images
+        run: |
+          pip install --upgrade git-python
+          python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v4
+        with:
+          name: test_fetched
+          path: /transformers/test_preparation.txt
+
+      - id: set-matrix
+        name: Organize tests into models
+        working-directory: /transformers
+        # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
+        # The `test_map` is used to get the actual identified test files under each key.
+        # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
+        run: |
+          if [ -f test_map.json ]; then
+              keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
+              test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
+          else
+              keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
+              test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
+          fi
+          echo $keys
+          echo $test_map
+          echo "matrix=$keys" >> $GITHUB_OUTPUT
+          echo "test_map=$test_map" >> $GITHUB_OUTPUT
+
+  run_models_gpu:
+    name: Model tests
+    needs: setup_gpu
+    # `dummy` means there is no test to run
+    if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Update clone using environment variables
+        working-directory: /transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all non-slow selected tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-22.04
+    if: always()
+    needs: [
+        check_runner_status,
+        check_runners,
+        setup_gpu,
+        run_models_gpu,
+#        run_tests_torch_cuda_extensions_single_gpu,
+#        run_tests_torch_cuda_extensions_multi_gpu
+    ]
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Setup status: ${{ needs.setup_gpu.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
+
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - uses: actions/checkout@v4
+        # To avoid failure when multiple commits are merged into `main` in a short period of time.
+        # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
+        # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
+        with:
+          fetch-depth: 20
+
+      - name: Update clone using environment variables
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - uses: actions/download-artifact@v4
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
+          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
+          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
+          CI_SHA: ${{ env.CI_SHA }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
+          SETUP_STATUS: ${{ needs.setup_gpu.result }}
+
+        # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change
+        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
+        run: |
+          pip install huggingface_hub
+          pip install slack_sdk
+          pip show slack_sdk
+          python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}"
--- a/.github/workflows/self-push-caller.yml
+++ b/.github/workflows/self-push-caller.yml
@ -0,0 +1,54 @@
+# Used to trigger self-push CI
+name: Self-hosted runner (push-caller)
+
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+
+jobs:
+  check-for-setup:
+      runs-on: ubuntu-22.04
+      name: Check if setup was changed
+      outputs:
+        changed: ${{ steps.was_changed.outputs.changed }}
+      steps:
+        - uses: actions/checkout@v4
+          with: 
+            fetch-depth: "2"
+        
+        - name: Get changed files
+          id: changed-files
+          uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c
+        
+        - name: Was setup changed 
+          id: was_changed
+          run: |
+            for file in ${{ steps.changed-files.outputs.all_changed_files }}; do
+              if [ `basename "${file}"` = "setup.py" ]; then
+                echo "changed=1" >> $GITHUB_OUTPUT
+              fi
+            done
+
+  build-docker-containers:
+    needs: check-for-setup
+    if: (github.event_name == 'push') && (needs.check-for-setup.outputs.changed == '1')
+    uses: ./.github/workflows/build-docker-images.yml
+    with:
+      image_postfix: "-push-ci"
+    secrets: inherit
+
+  run_push_ci:
+    name: Trigger Push CI
+    runs-on: ubuntu-22.04
+    if: ${{ always() }}
+    needs: build-docker-containers
+    steps:
+      - name: Trigger push CI via workflow_run
+        run: echo "Trigger push CI via workflow_run"
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -0,0 +1,652 @@
+name: Self-hosted runner (push)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (push-caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - ci_*
+      - ci-*
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+      - "utils/**"
+  repository_dispatch:
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 60
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+  setup:
+    name: Setup
+    strategy:
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu-push-ci
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      test_map: ${{ steps.set-matrix.outputs.test_map }}
+    env:
+      # `CI_BRANCH_PUSH`: The branch name from the push event
+      # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event
+      # `CI_SHA_PUSH`: The commit SHA from the push event
+      # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty)
+        # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty)
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Update clone using environment variables
+        working-directory: /transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Cleanup
+        working-directory: /transformers
+        run: |
+          rm -rf tests/__pycache__
+          rm -rf tests/models/__pycache__
+          rm -rf reports
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Fetch the tests to run
+        working-directory: /transformers
+        # TODO: add `git-python` in the docker images
+        run: |
+          pip install --upgrade git-python
+          python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v4
+        with:
+          name: test_fetched
+          path: /transformers/test_preparation.txt
+
+      - id: set-matrix
+        name: Organize tests into models
+        working-directory: /transformers
+        # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc.
+        # The `test_map` is used to get the actual identified test files under each key.
+        # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail)
+        run: |
+          if [ -f test_map.json ]; then
+              keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)')
+              test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)')
+          else
+              keys=$(python3 -c 'keys = ["dummy"]; print(keys)')
+              test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)')
+          fi
+          echo $keys
+          echo $test_map
+          echo "matrix=$keys" >> $GITHUB_OUTPUT
+          echo "test_map=$test_map" >> $GITHUB_OUTPUT
+
+  run_tests_single_gpu:
+    name: Model tests
+    needs: setup
+    # `dummy` means there is no test to run
+    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [aws-g5-4xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu-push-ci
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Update clone using environment variables
+        working-directory: /transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all non-slow selected tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:
+    name: Model tests
+    needs: setup
+    # `dummy` means there is no test to run
+    if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true
+    strategy:
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-all-latest-gpu-push-ci
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Update clone using environment variables
+        working-directory: /transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all non-slow selected tests on GPU
+        env:
+          MKL_SERVICE_FORCE_INTEL: 1
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_torch_cuda_extensions_single_gpu:
+    name: Torch CUDA extension tests
+    needs: setup
+    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-4xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /workspace/transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Update clone using environment variables
+        working-directory: /workspace/transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /workspace/transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Remove cached torch extensions
+        run: rm -rf /github/home/.cache/torch_extensions/
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /workspace/transformers
+        run: |
+          python utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /workspace/transformers
+        run: pip freeze
+
+      - name: Run all non-slow selected tests on GPU
+        working-directory: /workspace/transformers
+        # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
+        run: |
+          python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+
+  run_tests_torch_cuda_extensions_multi_gpu:
+    name: Torch CUDA extension tests
+    needs: setup
+    if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended')
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [aws-g5-12xlarge-cache]
+    runs-on:
+      group: '${{ matrix.machine_type }}'
+    container:
+      image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - name: Set `machine_type` for report and artifact names
+        working-directory: /workspace/transformers
+        shell: bash
+        run: |
+          echo "${{ matrix.machine_type }}"
+
+          if [ "${{ matrix.machine_type }}" = "aws-g5-4xlarge-cache" ]; then
+            machine_type=single-gpu
+          elif [ "${{ matrix.machine_type }}" = "aws-g5-12xlarge-cache" ]; then
+            machine_type=multi-gpu
+          else
+            machine_type=${{ matrix.machine_type }}
+          fi
+
+          echo "$machine_type"
+          echo "machine_type=$machine_type" >> $GITHUB_ENV
+
+      - name: Update clone using environment variables
+        working-directory: /workspace/transformers
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /workspace/transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Remove cached torch extensions
+        run: rm -rf /github/home/.cache/torch_extensions/
+
+      # To avoid unknown test failures
+      - name: Pre build DeepSpeed *again*
+        working-directory: /workspace
+        run: |
+          python3 -m pip uninstall -y deepspeed
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Environment
+        working-directory: /workspace/transformers
+        run: |
+          python utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /workspace/transformers
+        run: pip freeze
+
+      - name: Run all non-slow selected tests on GPU
+        working-directory: /workspace/transformers
+        # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests.
+        run: |
+          python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt
+
+      - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+          path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-22.04
+    if: always()
+    needs: [
+        setup,
+        run_tests_single_gpu,
+        run_tests_multi_gpu,
+        run_tests_torch_cuda_extensions_single_gpu,
+        run_tests_torch_cuda_extensions_multi_gpu
+    ]
+    env:
+      # For the meaning of these environment variables, see the job `Setup`
+      CI_BRANCH_PUSH: ${{ github.event.ref }}
+      CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }}
+      CI_SHA_PUSH: ${{ github.event.head_commit.id }}
+      CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }}
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          echo "Setup status: ${{ needs.setup.result }}"
+
+      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
+      # We also take into account the `push` event (we might want to test some changes in a branch)
+      - name: Prepare custom environment variables
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''}
+          echo $CI_BRANCH_PUSH
+          echo $CI_BRANCH_WORKFLOW_RUN
+          echo $CI_SHA_PUSH
+          echo $CI_SHA_WORKFLOW_RUN
+          [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV
+          [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV
+
+      - name: print environment variables
+        run: |
+          echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}"
+          echo "env.CI_SHA = ${{ env.CI_SHA }}"
+
+      - uses: actions/checkout@v4
+        # To avoid failure when multiple commits are merged into `main` in a short period of time.
+        # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ...
+        # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit)
+        with:
+          fetch-depth: 20
+
+      - name: Update clone using environment variables
+        run: |
+          echo "original branch = $(git branch --show-current)"
+          git fetch && git checkout ${{ env.CI_BRANCH }}
+          echo "updated branch = $(git branch --show-current)"
+          git checkout ${{ env.CI_SHA }}
+          echo "log = $(git log -n 1)"
+
+      - uses: actions/download-artifact@v4
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_EVENT: push
+          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
+          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
+          CI_SHA: ${{ env.CI_SHA }}
+          SETUP_STATUS: ${{ needs.setup.result }}
+
+        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
+        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
+        run: |
+          pip install huggingface_hub
+          pip install slack_sdk
+          pip show slack_sdk
+          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
--- a/.github/workflows/self-scheduled-caller.yml
+++ b/.github/workflows/self-scheduled-caller.yml
@ -63,7 +63,7 @@ jobs:
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
-      docker: huggingface/transformers-all-latest-gpu
+      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
      report_repo_id: hf-internal-testing/transformers_daily_ci
      commit_sha: ${{ github.sha }}
--- a/.github/workflows/self-scheduled-flash-attn-caller.yml
+++ b/.github/workflows/self-scheduled-flash-attn-caller.yml
@ -1,60 +0,0 @@
-name: Nvidia CI - Flash Attn
-
-on:
-  repository_dispatch:
-  schedule:
-    - cron: "17 2 * * *"
-  push:
-    branches:
-      - run_nvidia_ci_flash_attn*
-  workflow_dispatch:
-    inputs:
-      prev_workflow_run_id:
-        description: 'previous workflow run id to compare'
-        type: string
-        required: false
-        default: ""
-      other_workflow_run_id:
-        description: 'other workflow run id to compare'
-        type: string
-        required: false
-        default: ""
-
-
-# Used for `push` to easily modify the target workflow runs to compare against
-env:
-    prev_workflow_run_id: ""
-    other_workflow_run_id: ""
-
-
-jobs:
-  setup:
-    name: Setup
-    runs-on: ubuntu-22.04
-    steps:
-      - name: Setup
-        run: |
-          mkdir "setup_values"
-          echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
-          echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
-
-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: setup_values
-          path: setup_values
-
-
-  model-ci:
-    name: Model CI
-    uses: ./.github/workflows/self-scheduled.yml
-    with:
-      job: run_models_gpu
-      slack_report_channel: "#transformers-ci-flash-attn"
-      docker: huggingface/transformers-all-latest-gpu:flash-attn
-      ci_event: Daily CI
-      runner_type: "a10"
-      report_repo_id: hf-internal-testing/transformers_flash_attn_ci
-      commit_sha: ${{ github.sha }}
-      pytest_marker: "flash_attn_test or flash_attn_3_test"
-    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -38,10 +38,6 @@ on:
        default: ""
        required: false
        type: string
-      pytest_marker:
-        required: false
-        type: string
-

 env:
  HF_HOME: /mnt/cache
@ -131,7 +127,6 @@ jobs:
      commit_sha: ${{ inputs.commit_sha || github.sha }}
      runner_type: ${{ inputs.runner_type }}
      report_repo_id: ${{ inputs.report_repo_id }}
-      pytest_marker: ${{ inputs.pytest_marker }}
    secrets: inherit

  run_trainer_and_fsdp_gpu:
@ -165,7 +160,7 @@ jobs:
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
-      image: huggingface/transformers-all-latest-gpu
+      image: huggingface/transformers-pytorch-gpu
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
--- a/AGENTS.md
+++ b/AGENTS.md
@ -14,7 +14,7 @@ This AGENTS.md file provides guidance for code agents working with this codebase

 - PRs should be as brief as possible. Bugfix PRs in particular can often be only one or two lines long, and do not need large comments, docstrings or new functions in this case. Aim to minimize the size of the diff.
 - When writing tests, they should be added to an existing file. The only exception is for PRs to add a new model, when a new test directory should be created for that model.
- Code style is enforced in the CI. You can install the style tools with `pip install -e ".[quality]"`. You can then run `make fixup` to apply style and consistency fixes to your code.
+- Code style is enforced in the CI. You can install the style tools with `pip install -e .[quality]`. You can then run `make fixup` to apply style and consistency fixes to your code.

 ## Copying and inheritance

@ -36,4 +36,4 @@ After making changes, you should usually run `make fixup` to ensure any copies a
 the model you made the changes in and any other models that were updated by `make fixup`. Tests can be run with `pytest tests/models/[name]/test_modeling_[name].py`
 If your changes affect code in other classes like tokenizers or processors, you should run those tests instead, like `test_processing_[name].py` or `test_tokenization_[name].py`.

-In order to run tests, you may need to install dependencies. You can do this with `pip install -e ".[testing]"`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
+In order to run tests, you may need to install dependencies. You can do this with `pip install -e .[testing]`. You will probably also need to `pip install torch accelerate` if your environment does not already have them.
--- a/README.md
+++ b/README.md
@ -64,8 +64,8 @@ limitations under the License.
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png"/>
 </h3>

-Transformers acts as the model-definition framework for state-of-the-art machine learning with text, computer
-vision, audio, video, and multimodal models, for both inference and training.
+Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
+vision, audio, video, and multimodal model, for both inference and training.

 It centralizes the model definition so that this definition is agreed upon across the ecosystem. `transformers` is the
 pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
--- a/awesome-transformers.md
+++ b/awesome-transformers.md
@ -9,12 +9,6 @@ In this list, we showcase incredibly impactful and novel projects that have push
 adding other projects to the list. If you believe a project should be here and it's not, then please, open a PR
 to add it.

-## [◉ Universal Intelligence](https://github.com/blueraai/universal-intelligence)
-
-[Universal Intelligence](https://github.com/blueraai/universal-intelligence) aims to standardize models, tools, and agents —transforming them into simple, composable, portable, interoperable, framework-agnostic, hardware-agnostic interfaces (through auto-negotiation and resource sharing); for fast and accessible development of AI applications.
-
-Keywords: Protocol, Open-source, LLMs, Large Language Models, Agents, Low-code
-
 ## [gpt4all](https://github.com/nomic-ai/gpt4all)

 [gpt4all](https://github.com/nomic-ai/gpt4all) is an ecosystem of open-source chatbots trained on massive collections of clean assistant data including code, stories and dialogue. It offers open-source, large language models such as LLaMA and GPT-J trained in an assistant-style.
--- a/conftest.py
+++ b/conftest.py
@ -87,8 +87,6 @@ def pytest_configure(config):
    config.addinivalue_line("markers", "not_device_test: mark the tests always running on cpu")
    config.addinivalue_line("markers", "torch_compile_test: mark test which tests torch compile functionality")
    config.addinivalue_line("markers", "torch_export_test: mark test which tests torch export functionality")
-    config.addinivalue_line("markers", "flash_attn_test: mark test which tests flash attention functionality")
-    config.addinivalue_line("markers", "flash_attn_3_test: mark test which tests flash attention 3 functionality")

    os.environ["DISABLE_SAFETENSORS_CONVERSION"] = "true"

--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
 RUN git lfs install
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -17,7 +17,7 @@ RUN make install -j 10

 WORKDIR /

-RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch<2.9' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer

--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"

--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"

--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -9,15 +9,10 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).

-ARG PYTORCH='2.9.0'
+ARG PYTORCH='2.8.0'
 # Example: `cu102`, `cu113`, etc.
 ARG CUDA='cu126'

-# This needs to be compatible with the above `PYTORCH`.
-ARG TORCHCODEC='0.8.0'
-
-ARG FLASH_ATTN='false'
-
 RUN apt update
 RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
 RUN git lfs install
@ -26,44 +21,11 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
 ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev]
-
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
-# 2. For `torchcodec`, use `cpu` as we don't have `libnvcuvid.so` on the host runner. See https://github.com/meta-pytorch/torchcodec/issues/912
-#    **Important**: We need to specify `torchcodec` version if the torch version is not the latest stable one.
-# 3. `set -e` means "exit immediately if any command fails".
-RUN set -e; \
-    # Determine torch version
-    if [ ${#PYTORCH} -gt 0 ] && [ "$PYTORCH" != "pre" ]; then \
-        VERSION="torch==${PYTORCH}.*"; \
-        TORCHCODEC_VERSION="torchcodec==${TORCHCODEC}.*"; \
-    else \
-        VERSION="torch"; \
-        TORCHCODEC_VERSION="torchcodec"; \
-    fi; \
-    \
-    # Log the version being installed
-    echo "Installing torch version: $VERSION"; \
-    \
-    # Install PyTorch packages
-    if [ "$PYTORCH" != "pre" ]; then \
-        python3 -m pip install --no-cache-dir -U \
-            $VERSION \
-            torchvision \
-            torchaudio \
-            --extra-index-url https://download.pytorch.org/whl/$CUDA; \
-        # We need to specify the version if the torch version is not the latest stable one.
-        python3 -m pip install --no-cache-dir -U \
-            $TORCHCODEC_VERSION --extra-index-url https://download.pytorch.org/whl/cpu; \
-    else \
-        python3 -m pip install --no-cache-dir -U --pre \
-            torch \
-            torchvision \
-            torchaudio \
-            --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA; \
-        python3 -m pip install --no-cache-dir -U --pre \
-            torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/cpu; \
-    fi
+# 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
+#    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
+# 3. For `torchcodec<0.8`: this is quickly added as torch 2.9.0 + torchcodec 0.8.0 fails on our CI env. Need to remove later once they work.
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio "torchcodec<0.8" --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

 RUN python3 -m pip install --no-cache-dir -U timm

@ -92,7 +54,7 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
 RUN python3 -m pip install --no-cache-dir quanto

 # After using A10 as CI runner, let's run FA2 tests
-RUN [ "$FLASH_ATTN" != "false" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
+RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"

 # TODO (ydshieh): check this again
 # `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,4 +1,4 @@
-FROM rocm/pytorch:rocm7.0.2_ubuntu24.04_py3.12_pytorch_release_2.7.1
+FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive
@ -10,8 +10,8 @@ RUN apt update && \

 RUN git lfs install

-RUN python3 -m pip install --no-cache-dir --upgrade pip numpy importlib-metadata setuptools wheel ninja pytesseract "itsdangerous<2.1.0"
-RUN python3 -m pip install --no-cache-dir --no-build-isolation git+https://github.com/facebookresearch/detectron2.git
+RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
+RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

 ARG REF=main
 WORKDIR /
@ -39,7 +39,6 @@ RUN python3 -m pip install --no-cache-dir "torchcodec==0.5"
 # Install flash attention from source. Tested with commit 6387433156558135a998d5568a9d74c1778666d8
 RUN git clone https://github.com/ROCm/flash-attention/ -b tridao && \
    cd flash-attention && \
-    GPU_ARCHS="gfx942;gfx950" python setup.py install  
-# GPU_ARCHS builds for MI300, MI325 and MI355
+    GPU_ARCHS="gfx942" python setup.py install

 RUN python3 -m pip install --no-cache-dir einops
--- a/docs/README.md
+++ b/docs/README.md
@ -24,7 +24,7 @@ pip install -e ".[dev]"
 ```

 > [!NOTE]
-> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to work around it.
+> This command might fail for some OS that are missing dependencies. Check step 4 in [Create a Pull Request](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#create-a-pull-request) to workaround it.

 Then you need to install our special tool that builds the documentation:

@ -38,7 +38,7 @@ pip install git+https://github.com/huggingface/doc-builder

 ## Building the documentation

-Once you have set up the `doc-builder` and additional packages, you can generate the documentation by 
+Once you have setup the `doc-builder` and additional packages, you can generate the documentation by 
 typing the following command:

 ```bash
@ -295,11 +295,12 @@ Here's an example of a tuple return, comprising several objects:
 Due to the rapidly growing repository, it is important to make sure that no files that would significantly weigh down the repository are added. This includes images, videos, and other non-text files. We prefer to leverage a hf.co hosted `dataset` like
 the ones hosted on [`hf-internal-testing`](https://huggingface.co/hf-internal-testing) in which to place these files and reference
 them by URL. We recommend putting them in the following dataset: [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
-If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate them to this dataset.
+If an external contribution, feel free to add the images to your PR and ask a Hugging Face member to migrate your images
+to this dataset.

 ## Styling the docstring

-We have an automatic script running with the `make style` command that will make sure that:
+We have an automatic script running with the `make style` comment that will make sure that:
 - the docstrings fully take advantage of the line width
 - all code examples are formatted using black, like the code of the Transformers library

--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@ -258,6 +258,8 @@
 #       title: النماذج
 #     - local: main_classes/text_generation
 #       title: توليد النصوص
+#     - local: main_classes/onnx
+#       title: ONNX
 #     - local: main_classes/optimizer_schedules
 #       title: التحسين
 #     - local: main_classes/output
--- a/docs/source/ar/serialization.md
+++ b/docs/source/ar/serialization.md
@ -32,7 +32,7 @@
 لتصدير نموذج 🤗 Transformers إلى ONNX، قم أولاً بتثبيت اعتماد إضافي:

 ```bash
-pip install optimum-onnx
+pip install optimum[exporters]
 ```

 للاطلاع على جميع المعامﻻت المتاحة، يرجى الرجوع إلى [وثائق 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)، أو عرض المساعدة في سطر الأوامر:
@ -111,3 +111,60 @@ optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_s
 ### تصدير نموذج لهندسة غير مدعومة

 إذا كنت ترغب في المساهمة من خلال إضافة دعم لنموذج لا يُمكن تصديره حاليًا، فيجب عليك أولاً التحقق مما إذا كان مدعومًا في [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)، وإذا لم يكن مدعومًا، [فيمكنك المساهمة في 🤗 Optimum](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute) مُباشرةً.
+
+### تصدير نموذج باستخدام `transformers.onnx`
+
+<Tip warning={true}>
+
+لم يعد يتم دعم `transformers.onnx`  يُرجى تصدير النماذج باستخدام 🤗 Optimum كما هو موضح أعلاه. سيتم إزالة هذا القسم في الإصدارات القادمة.
+
+</Tip>
+
+لتصدير نموذج 🤗 Transformers إلى ONNX باستخدام `transformers.onnx`، ثبّت التبعيات الإضافية:
+
+```bash
+pip install transformers[onnx]
+```
+
+استخدم حزمة `transformers.onnx` كنموذج Python لتصدير نقطة حفظ باستخدام تكوين جاهز:
+
+```bash
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
+```
+
+يُصدّر هذا رسمًا بيانيًا ONNX لنقطة الحفظ المُحددة بواسطة وسيطة `--model`. مرر أي نقطة حفظ على 🤗 Hub أو نقطة حفظ مُخزنة محليًا.
+يُمكن بعد ذلك تشغيل ملف `model.onnx` الناتج على أحد المُسرعات العديدة التي تدعم معيار ONNX. على سبيل المثال، قم بتحميل وتشغيل النموذج باستخدام ONNX Runtime كما يلي:
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # يتوقع ONNX Runtime مصفوفات NumPy كمدخلات
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+يُمكن الحصول على أسماء المخرجات المطلوبة (مثل `["last_hidden_state"]`) من خلال إلقاء نظرة على تكوين ONNX لكل نموذج. على سبيل المثال، بالنسبة لـ DistilBERT، لدينا:
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+العمليات مُتطابقة لنقاط الحفظ TensorFlow على Hub. على سبيل المثال، صدّر نقطة حفظ TensorFlow خالصة كما يلي:
+
+```bash
+python -m transformers.onnx --model=keras-io/transformers-qa onnx/
+```
+
+لتصدير نموذج مُخزن محليًا، احفظ أوزان النموذج ومجزىء اللغوى في نفس الدليل (على سبيل المثال `local-pt-checkpoint`)، ثم قم بتصديره إلى ONNX عن طريق توجيه وسيط `--model` لحزمة `transformers.onnx` إلى الدليل المطلوب:
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -88,8 +88,6 @@
      title: Tool use
    - local: chat_templating_writing
      title: Writing a chat template
-    - local: chat_response_parsing
-      title: Response parsing
    title: Chat with models
  - sections:
    - local: serving
--- a/docs/source/en/chat_extras.md
+++ b/docs/source/en/chat_extras.md
@ -95,12 +95,9 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))

 The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.

-A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history. For
-models that support [response parsing](./chat_response_parsing), the response parsing will be handled automatically, and you can just use
-[`~PreTrainedTokenizer.parse_response] to extract the tool call. For other models, you'll need to manually translate the output
-string into a tool call dict.
+A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.

-Regardless of the approach you use, the tool call should go in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
+Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.

 > [!WARNING]
 > Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
--- a/docs/source/en/chat_response_parsing.md
+++ b/docs/source/en/chat_response_parsing.md
@ -1,233 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Response Parsing
-
-It is increasingly common for chat models to generate structured outputs, rather than just a single reply string. 
-The most common uses for structured outputs are [tool calling](./chat_extras) and [reasoning models](https://huggingface.co/reasoning-course).
-Tool calling models can output tool calls, containing the name of the tool to call and any arguments to be passed to it,
-while reasoning models often output reasoning steps as a "chain of thought". Some recent models even use both of these,
-and may output reasoning and/or one or more tool calls before their final answer.
-
-Models with structured outputs pose a challenge for chat templating, because the output needs to be parsed before it
-can be appended to the chat. For a concrete example, let's say we ask [GPT-OSS](https://huggingface.co/openai/gpt-oss-120b)
-what the weather is like, and it thinks and decides to call a tool. Here's what the raw model output might look like:
-
-```txt
-<|start|>analysis<|message|>The user asks: "What is the weather like in SF?" We need to get the location of the user? The user explicitly asks about SF (San Francisco).
-So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data.
-So we should call get_current_weather with location "San Francisco, CA". Let's do that.
-We will call function get_current_weather.<|end|><|start|>commentary to=functions.get_current_weather<|channel|>commentary <|constrain|>json<|message|>{"location":"San Francisco, CA"}<|call|>
-}
-```
-
-But if you want to append this to a chat, you'll need to format it as a chat message dict, like this:
-
-```json
-{
-  "role": "assistant",
-  "thinking": "The user asks: \"What is the weather like in SF?\" We need to get the location of the user? The user explicitly asks about SF (San Francisco). So we need to get the current weather in San Francisco, CA. We need to call get_current_weather function. But we need to call function to get weather data. So we should call get_current_weather with location \"San Francisco, CA\". Let's do that.",
-  "tool_calls": [
-    {
-      "name": "get_current_weather",
-      "arguments": {
-        "location": "San Francisco, CA"
-      }
-    }
-  ]
-}
-```
-
-Chat **templates** give us a way to turn messages into formatted input for a model, but we need something else to
-parse model output back into a standard message dict. This is what chat **parsing** is for.
-
-## The [parse_response](~PreTrainedTokenizerBase.parse_response) method
-
-Parsing a chat response on a model that supports it is straightforward. Simply take the raw, decoded output from
-[generate](`~generation.GenerationMixin.generate`), and pass it to the tokenizer's [parse_response](~PreTrainedTokenizerBase.parse_response) method:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-checkpoint = "HuggingFaceTB/SmolLM3-3B"
-
-tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
-
-messages = [
-    {
-        "role": "user",
-        "content": "Hey! Can you summarize the end of the Cold War as briefly as possible? Like, comically briefly. It should really leave out almost most of the relevant information."
-    }
-]
-
-input_ids = tokenizer.apply_chat_template(
-    messages,
-    add_generation_prompt=True,
-    tokenize=True,
-    return_tensors="pt"
-).to(model.device)
-
-outputs = model.generate(input_ids, max_new_tokens=1024)[0, input_ids.shape[1]:]
-out_text = tokenizer.decode(outputs)
-parsed = tokenizer.parse_response(out_text)
-print(parsed.keys())
-```
-
-And you should get:
-
-```text
-dict_keys(['thinking', 'content'])
-```
-
-And that's all you need to start using response parsing! `parse_response` should return a complete message dict that is ready to be appended to the chat history. 
-When the tokenizer does not support response parsing, `parse_response` will throw an error. We hope to add support
-to more tokenizers over time.
-
-## Developers: Understanding a simple response schema
-
-Under the hood, `parse_response` uses a **JSON schema** to parse the model output. A JSON schema represents
-the structure of the output message dict. The schema is augmented with additional fields that indicate how the 
-output message string should be parsed into the expected format. Let's take a look at the schema for a SmolLM response,
-excluding tool calls for now:
-
-```python
-{
-    "x-regex": "(?:<think>\n?(?P<thinking>.+?)\n?</think>)?\s*(?P<content>.+?)?\s*(?:<\|im_end\|>|$)",
-    "type": "object",
-    "properties": {
-        "role": {"const": "assistant"},
-        "content": {"type": "string"},
-        "thinking": {"type": "string"}
-    }
-}
-```
-
-We can see that the schema describes a JSON "object" (a `dict`, in other words) with three keys: `role`, `content`, and `thinking`.
-Because all assistant responses have the role "assistant", the `role` key is a `const`(ant). The other two keys are strings, extracted
-from the named groups in the regex in the `x-regex` field.
-
-Like chat templates, response schemas are set as a property of the tokenizer. To enable response parsing, all you need
-to do is set `tokenizer.response_schema` to a valid schema dict, and `tokenizer.parse_response()` will work! Again, like
-chat templates, this schema will be saved with the processor, so once you set it, you can use `save_pretrained()` or `push_to_hub()` to
-save and share the schema. 
-
-## Developers: Complex schemas
-
-Now, let's look at a more complex schema, which includes tool calls, to gain more of an understanding of the parser
-internals. For this, we'll use the `GPT-OSS` schema. GPT-OSS emits both tool calls and thinking blocks, and it uses
-an unusual format where model responses are tagged with one of three "channels": `commentary` for things like
-tool calls, `analysis` for chain of thought blocks, and `final` for messages intended to be sent to the user. 
-A full message where the model calls a tool named `get_current_weather` might look like this, with some extra linebreaks added for clarity:
-
-```text
-<|channel|>analysis<|message|>
-The user asks: "What is the weather like in SF?" So we need to get the current weather in San Francisco, CA. 
-We need to call get_current_weather function. So we should call get_current_weather with location "San Francisco, CA".
-<|end|>
-<|start|>assistant<|channel|>commentary 
-to=functions.get_current_weather <|constrain|>json<|message|>
-{
-  "location": "San Francisco, CA"
-}
-<|call|>
-```
-
-Parsing proceeds recursively; the output of a regex (or other parser) at one level becomes the input to the nodes below it.
-In other words, don't feel like you have to parse the entire output in one enormous regex! Instead, start with the schema,
-and then add regexes to extract the relevant chunks as you go. Here's a schema that will parse it, with some
-explanatory comments:
-
-```python
-{
-    "type": "object",
-    "properties": {
-        "role": {"const": "assistant"},
-        # "content" and "thinking" are both similar to the previous example, and just extract a single string
-        # However, rather than using a single regex with named groups to extract both, we use a regex in each subkey.
-        # When an object node has no parser/regex, the entire input string is passed to all of its children, so 
-        # parsing can either be done with named groups at the object level, or with separate regexes at the property level.
-        "content": {"type": "string", "x-regex": r"<\|channel\|>final<\|message\|>(.*?)(?:<\|end\|>|$)"},
-        "thinking": {"type": "string", "x-regex": r"<\|channel\|>analysis<\|message\|>(.*?)<\|end\|>"},
-        "tool_calls": {
-            # "x-regex-iterator" uses re.findall to find multiple possible manages, and returns them as an
-            # array/list. You don't need to worry about array handling, though - each item in the array will be
-            # parsed by the `items` schema, so just write the schema for a single item.
-            "x-regex-iterator": r"<\|channel\|>commentary (to=functions\..*?<\|message\|>.*?)(?:<\|call\|>|$)",
-            "type": "array",
-            "items": {
-                "type": "object",
-                "properties": {
-                    # A const property is a fixed value, and the input has no effect on it.
-                    "type": {"const": "function"},
-                    # Here, we wrap the entire tool call dict in a `{"function": ...}` block. The input string is passed through to it unchanged.
-                    "function": {
-                        "type": "object",
-                        "properties": {
-                            "name": {"type": "string", "x-regex": r"^to=functions\.(\w+)"},
-                            "arguments": {
-                                "type": "object",
-                                "x-regex": "<\|message\|>(.*)",
-                                # The "x-parser" field indicates that the extracted string should be parsed as JSON.
-                                # The output is then passed to the schema nodes below and recursive parsing continues.
-                                "x-parser": "json",
-                                "additionalProperties": {"type": "any"},
-                            },
-                        },
-                    },
-                },
-            },
-        },
-    },
-}
-```
-
-## Developers: Understanding the parser logic
-
-The parser follows a few simple rules:
-
-1. Each level of the schema receives input from the level above, applies any regex or parser it has, and then passes the output to its children.
-2. The root level receives the entire decoded model output string as input.
-3. If a node has structured content after parsing (for example, if the regex has named groups and returns a dict, or if the parser returns a dict or list),
-   then that structured content is mapped to the node's children, and each child node receives its corresponding value as input.
-4. If an `object` (dict) node has unstructured (string) output, then the entire string is passed to all of its children. This allows child nodes
-   to handle parsing individually rather than requiring a single parent regex to extract all keys at once.
-5. If an `array` (list) node has unstructured (string) output, then this throws an error.
-
-There is a small set of allowable `x-` keys that indicate how parsing should be done at each node:
- `x-regex`: A regex string to apply to the input. If the regex has named groups, the output is a dict of group names to values. Named groups should only be used in `object` nodes.
-  Otherwise, the regex must have exactly one unnamed capturing group, and the output is the value of that group as a string.
- `x-regex-iterator`: A regex string to apply to the input using `re.findall()`. The output is a list of all matches.
-  This should only be used in `array` nodes, and the regex must have exactly one unnamed capturing group. The output is distributed to
-  the node's `items` schema.
- `x-parser`: Calls a built-in parser to apply to the input. Currently, the only supported parser is `json`, which parses the input string as JSON.
-  The output is passed to the child nodes for further parsing. Note that the `json` parser can return deeply nested output - in this case, the output
-  will be progressively unwrapped as it is passed through child nodes. The child nodes do not need additional `x-parser` or `x-regex` fields in this case, 
-  but their structure must match the structure of the parsed JSON.
- `x-parser-args`: Only allowed in conjunction with `x-parser`. This is a dict of additional arguments that control parsing. Right now, the only supported
-  argument is `transform`, which specifies a `jmespath` transformation to apply to the output. This is useful when the JSON parser returns a structure
-  that needs to be modified to match the schema.
- `x-regex-key-value`: This is rarely necessary, but it can be useful when parsing key-value pairs in non-JSON format where the names of the keys are not known
-  in advance, such as when a model emits XML tool calls with arbitrary argument names. The regex must have exactly two named capturing groups, 
-  `key` and `value`, and the output is a dict mapping keys to values. This should only be used in `object` nodes.
-
-In general, multiple regexes/parsers cannot be combined at the same level. The exception is that `x-regex`, returning a single string, can be combined with the other parsers. In this case,
-`x-regex` is applied first, and then the output is passed to the other parser, either `x-regex-iterator`, `x-parser`, or `x-regex-key-value`.
-
-Putting these ideas together, you can see that the input flows through the schema, being parsed at each level and then distributed to child nodes. Each level
-only needs to extract the input content that is relevant for that part of the schema, and can then let its child nodes handle the rest. Internally, this is handled
-with a parser function that receives input, applies any regexes/parsers at the current level, then maps the result to its child nodes before recursively calling itself on each of them.
-Recursion terminates when it reaches leaf nodes, usually primitive types like `string` or `number`, which simply return the input they receive.
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    import torch
    from PIL import Image
    import requests
-
+    
    processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
    model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
-
+    
    # LightGlue requires pairs of images
    images = [image1, image2]
    inputs = processor(images, return_tensors="pt")
    with torch.inference_mode():
        outputs = model(**inputs)
-
+    
    # Extract matching information
    keypoints0 = outputs.keypoints0  # Keypoints in first image
    keypoints1 = outputs.keypoints1  # Keypoints in second image
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    # Process outputs for visualization
    image_sizes = [[(image.height, image.width) for image in images]]
    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-
+    
    for i, output in enumerate(processed_outputs):
        print(f"For the image pair {i}")
        for keypoint0, keypoint1, matching_score in zip(
@ -147,13 +147,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    - post_process_keypoint_matching
    - visualize_keypoint_matching

-## LightGlueImageProcessorFast
-
-[[autodoc]] LightGlueImageProcessorFast
-    - preprocess
-    - post_process_keypoint_matching
-    - visualize_keypoint_matching
-
 ## LightGlueForKeypointMatching

 [[autodoc]] LightGlueForKeypointMatching
--- a/docs/source/en/optimizers.md
+++ b/docs/source/en/optimizers.md
@ -154,7 +154,7 @@ pip install schedulefree

 [Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682) replaces the base optimizers momentum with a combination of averaging and interpolation. Unlike a traditional scheduler, SFO completely removes the need to anneal the learning rate.

-SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps`.
+SFO supports the RAdam (`schedule_free_radam`), AdamW (`schedule_free_adamw`) and SGD (`schedule_free_sgd`) optimizers. The RAdam scheduler doesn't require `warmup_steps` or `warmup_ratio`.

 By default, it is recommended to set `lr_scheduler_type="constant"`. Other `lr_scheduler_type` values may also work, but combining SFO optimizers with other learning rate schedules could affect SFOs intended behavior and performance.

--- a/docs/source/en/pr_checks.md
+++ b/docs/source/en/pr_checks.md
@ -38,7 +38,7 @@ pip install transformers[dev]
 or for an editable install:

 ```bash
-pip install -e ".[dev]"
+pip install -e .[dev]
 ```

 inside the Transformers repo. Since the number of optional dependencies of Transformers has grown a lot, it's possible you don't manage to get all of them. If the dev install fails, make sure to install PyTorch then do
@ -50,7 +50,7 @@ pip install transformers[quality]
 or for an editable install:

 ```bash
-pip install -e ".[quality]"
+pip install -e .[quality]
 ```

 ## Tests
--- a/docs/source/en/serialization.md
+++ b/docs/source/en/serialization.md
@ -33,7 +33,7 @@ Export a Transformers model to ONNX with the Optimum CLI or the `optimum.onnxrun
 Run the command below to install Optimum and the [exporters](https://huggingface.co/docs/optimum/exporters/overview) module.

 ```bash
-pip install optimum-onnx
+pip install optimum[exporters]
 ```

 > [!TIP]
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -383,30 +383,6 @@ transformers serve \
  --attn_implementation "sdpa"
 ```

-### Quantization
-
-transformers serve is compatible with all [quantization methods](https://huggingface.co/docs/transformers/main/quantization/overview) supported in transformers. Quantization can significantly reduce memory usage and improve inference speed, with two main workflows: pre-quantized models and on-the-fly quantization.
-
-#### Pre-quantized Models
-
-For models that are already quantized (e.g., GPTQ, AWQ, bitsandbytes), simply choose a quantized model name for serving.
-Make sure to install the required libraries listed in the quantization documentation.
-
-> [!TIP]
-> Pre-quantized models generally provide the best balance of performance and accuracy.
-
-#### On the fly quantization
-
-If you want to quantize a model at runtime, you can specify the --quantization flag in the CLI. Note that not all quantization methods support on-the-fly conversion. The full list of supported methods is available in the quantization [overview](https://huggingface.co/docs/transformers/main/quantization/overview). 
-
-Currently, with transformers serve, we only supports some methods: ["bnb-4bit", "bnb-8bit"]
-
-For example, to enable 4-bit quantization with bitsandbytes, you need to pass add `--quantization bnb-4bit`: 
-
-```sh
-transformers serve --quantization bnb-4bit
-```
-
 ### Performance tips

 - Use an efficient attention backend when available:
@ -421,4 +397,6 @@ transformers serve \

 - `--dtype {bfloat16|float16}` typically improve throughput and memory use vs. `float32`

+- `--load_in_4bit`/`--load_in_8bit` can reduce memory footprint for LoRA setups
+
 - `--force-model <repo_id>` avoids per-request model hints and helps produce stable, repeatable runs
--- a/docs/source/en/tasks/audio_classification.md
+++ b/docs/source/en/tasks/audio_classification.md
@ -220,7 +220,7 @@ At this point, only three steps remain:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/en/tasks/image_classification.md
+++ b/docs/source/en/tasks/image_classification.md
@ -211,7 +211,7 @@ At this point, only three steps remain:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/en/tasks/video_classification.md
+++ b/docs/source/en/tasks/video_classification.md
@ -378,7 +378,7 @@ Most of the training arguments are self-explanatory, but one that is quite impor
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/es/pr_checks.md
+++ b/docs/source/es/pr_checks.md
@ -37,7 +37,7 @@ pip install transformers[dev]
 o una instalación editable:

 ```bash
-pip install -e ".[dev]"
+pip install -e .[dev]
 ```

 del repositorio de Transformers.
--- a/docs/source/es/tasks/audio_classification.md
+++ b/docs/source/es/tasks/audio_classification.md
@ -220,7 +220,7 @@ Al llegar a este punto, solo quedan tres pasos:
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/it/pr_checks.md
+++ b/docs/source/it/pr_checks.md
@ -37,7 +37,7 @@ pip install transformers[dev]
 o un'installazione modificabile:

 ```bash
-pip install -e ".[dev]"
+pip install -e .[dev]
 ```

 all'interno del repo Transformers.
--- a/docs/source/ja/_toctree.yml
+++ b/docs/source/ja/_toctree.yml
@ -200,6 +200,8 @@
      title: モデル
    - local: main_classes/text_generation
      title: テキストの生成
+    - local: main_classes/onnx
+      title: ONNX
    - local: main_classes/optimizer_schedules
      title: 最適化
    - local: main_classes/output
--- a/docs/source/ja/main_classes/deepspeed.md
+++ b/docs/source/ja/main_classes/deepspeed.md
@ -1292,7 +1292,7 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL
  したがって、スケジューラを設定しない場合、これがデフォルトで設定されるスケジューラになります。

 設定ファイルで `scheduler` エントリを設定しない場合、[`Trainer`] は
-`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` の値を設定します。
+`--lr_scheduler_type`、`--learning_rate`、および `--warmup_steps` または `--warmup_ratio` の値を設定します。
 🤗 それのトランスフォーマーバージョン。

 以下は、`WarmupLR`の自動構成された`scheduler`エントリの例です。
@ -1316,7 +1316,8 @@ DeepSpeed は、`LRRangeTest`、`OneCycle`、`WarmupLR`、および`WarmupDecayL

 - `warmup_min_lr` の値は `0` です。
 - `warmup_max_lr` と `--learning_rate` の値。
- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)
+- `warmup_num_steps` と `--warmup_steps` の値 (指定されている場合)。それ以外の場合は `--warmup_ratio` を使用します
+  トレーニング ステップの数を乗算し、切り上げます。
 - `total_num_steps` には `--max_steps` の値を指定するか、指定されていない場合は実行時に自動的に導出されます。
  環境、データセットのサイズ、およびその他のコマンド ライン引数 (
  `WarmupDecayLR`)。
--- a/docs/source/ja/main_classes/onnx.md
+++ b/docs/source/ja/main_classes/onnx.md
@ -0,0 +1,50 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Exporting 🤗 Transformers models to ONNX
+
+🤗 Transformers は `transformers.onnx` パッケージを提供します。
+設定オブジェクトを利用することで、モデルのチェックポイントをONNXグラフに変換することができます。
+
+詳細は[ガイド](../serialization) を参照してください。
+を参照してください。
+
+## ONNX Configurations
+
+以下の3つの抽象クラスを提供しています。
+エクスポートしたいモデルアーキテクチャのタイプに応じて、継承すべき3つの抽象クラスを提供します：
+
+* エンコーダーベースのモデルは [`~onnx.config.OnnxConfig`] を継承します。
+* デコーダーベースのモデルは [`~onnx.config.OnnxConfigWithPast`] を継承します。
+* エンコーダー・デコーダーモデルは [`~onnx.config.OnnxSeq2SeqConfigWithPast`] を継承しています。
+
+
+### OnnxConfig
+
+[[autodoc]] onnx.config.OnnxConfig
+
+### OnnxConfigWithPast
+
+[[autodoc]] onnx.config.OnnxConfigWithPast
+
+### OnnxSeq2SeqConfigWithPast
+
+[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
+
+## ONNX Features
+
+各 ONNX 構成は、次のことを可能にする一連の _機能_ に関連付けられています。
+さまざまなタイプのトポロジまたはタスクのモデルをエクスポートします。
--- a/docs/source/ja/pr_checks.md
+++ b/docs/source/ja/pr_checks.md
@ -40,7 +40,7 @@ pip install transformers[dev]


 ```bash
-pip install -e ".[dev]"
+pip install -e .[dev]
 ```

 トランスフォーマーズのリポジトリ内で作業しています。トランスフォーマーズのオプションの依存関係の数が増えたため、すべてを取得できない可能性があります。開発用インストールが失敗した場合、作業しているディープラーニングフレームワーク（PyTorch、TensorFlow、および/またはFlax）をインストールし、次の手順を実行してください。
@ -53,7 +53,7 @@ pip install transformers[quality]
 または編集可能なインストールの場合：

 ```bash
-pip install -e ".[quality]"
+pip install -e .[quality]
 ```

 ## Tests
--- a/docs/source/ja/serialization.md
+++ b/docs/source/ja/serialization.md
@ -47,7 +47,7 @@ ONNX形式にエクスポートされたモデルは、以下のように使用
 🤗 TransformersモデルをONNXにエクスポートするには、まず追加の依存関係をインストールしてください：

 ```bash
-pip install optimum-onnx
+pip install optimum[exporters]
 ```

 すべての利用可能な引数を確認するには、[🤗 Optimumドキュメント](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)を参照してください。または、コマンドラインでヘルプを表示することもできます：
@ -128,3 +128,64 @@ CLIの代わりに、🤗 TransformersモデルをONNXにプログラム的に
 ### Exporting a model for an unsupported architecture

 現在エクスポートできないモデルをサポートするために貢献したい場合、まず[`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)でサポートされているかどうかを確認し、サポートされていない場合は[🤗 Optimumに貢献](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)してください。
+
+### Exporting a model with `transformers.onnx`
+
+<Tip warning={true}>
+
+`transformers.onnx`はもはやメンテナンスされていないため、モデルを上記で説明したように🤗 Optimumでエクスポートしてください。このセクションは将来のバージョンで削除されます。
+
+</Tip>
+
+🤗 TransformersモデルをONNXにエクスポートするには、追加の依存関係をインストールしてください：
+
+
+```bash
+pip install transformers[onnx]
+```
+
+`transformers.onnx`パッケージをPythonモジュールとして使用して、事前に用意された設定を使用してチェックポイントをエクスポートする方法は以下の通りです：
+
+```bash
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
+```
+
+この方法は、`--model`引数で定義されたチェックポイントのONNXグラフをエクスポートします。🤗 Hubのいずれかのチェックポイントまたはローカルに保存されたチェックポイントを渡すことができます。エクスポートされた`model.onnx`ファイルは、ONNX標準をサポートする多くのアクセラレータで実行できます。例えば、ONNX Runtimeを使用してモデルを読み込んで実行する方法は以下の通りです：
+
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # ONNX Runtime expects NumPy arrays as input
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+必要な出力名（例: `["last_hidden_state"]`）は、各モデルのONNX構成を確認することで取得できます。例えば、DistilBERTの場合、次のようになります：
+
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+ハブから純粋なTensorFlowのチェックポイントをプログラム的にエクスポートするプロセスは、以下のように同様です：
+
+```bash
+python -m transformers.onnx --model=keras-io/transformers-qa onnx/
+```
+
+ローカルに保存されたモデルをエクスポートする場合、モデルの重みとトークナイザのファイルを同じディレクトリに保存してください（例： `local-pt-checkpoint`）。その後、`transformers.onnx`パッケージの `--model`引数を希望するディレクトリに向けて設定して、ONNXにエクスポートします：
+
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
+
--- a/docs/source/ja/tasks/audio_classification.md
+++ b/docs/source/ja/tasks/audio_classification.md
@ -219,7 +219,7 @@ MInDS-14 データセットのサンプリング レートは 8khz です (こ
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/ja/tasks/image_classification.md
+++ b/docs/source/ja/tasks/image_classification.md
@ -216,7 +216,7 @@ Datasets、🤗 データセット ライブラリから Food-101 データセ
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/ja/tasks/video_classification.md
+++ b/docs/source/ja/tasks/video_classification.md
@ -360,7 +360,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -406,6 +406,8 @@
      title: Models
    - local: main_classes/text_generation
      title: 텍스트 생성
+    - local: main_classes/onnx
+      title: ONNX
    - local: main_classes/optimizer_schedules
      title: 최적화
    - local: main_classes/output
--- a/docs/source/ko/main_classes/onnx.md
+++ b/docs/source/ko/main_classes/onnx.md
@ -0,0 +1,45 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 🤗 Transformers 모델을 ONNX로 내보내기[[exporting--transformers-models-to-onnx]]
+
+🤗 트랜스포머는 `transformers.onnx` 패키지를 제공하며, 이 패키지는 설정 객체를 활용하여 모델 체크포인트를 ONNX 그래프로 변환할 수 있게 합니다.
+
+🤗 Transformers에 대한 자세한 내용은 [이 가이드](../serialization)를 참조하세요.
+
+## ONNX 설정[[onnx-configurations]]
+
+내보내려는(export) 모델 아키텍처의 유형에 따라 상속받아야 할 세 가지 추상 클래스를 제공합니다:
+
+* 인코더 기반 모델은 [`~onnx.config.OnnxConfig`]을 상속받습니다.
+* 디코더 기반 모델은 [`~onnx.config.OnnxConfigWithPast`]을 상속받습니다.
+* 인코더-디코더 기반 모델은 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]을 상속받습니다.
+
+### OnnxConfig[[transformers.onnx.OnnxConfig]]
+
+[[autodoc]] onnx.config.OnnxConfig
+
+### OnnxConfigWithPast[[transformers.onnx.OnnxConfigWithPast]]
+
+[[autodoc]] onnx.config.OnnxConfigWithPast
+
+### OnnxSeq2SeqConfigWithPast[[OnnxSeq2SeqConfigWithPast]]
+
+[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
+
+## ONNX 특징[[onnx-features]]
+
+각 ONNX 설정은 다양한 유형의 토폴로지나 작업에 대해 모델을 내보낼 수 있게(exporting) 해주는 _features_ 세트와 연관되어 있습니다.
--- a/docs/source/ko/optimizers.md
+++ b/docs/source/ko/optimizers.md
@ -154,7 +154,7 @@ pip install schedulefree

 [Schedule Free optimizer (SFO)](https://hf.co/papers/2405.15682)는 기본 옵티마이저의 모멘텀 대신 평균화(averaging)와 보간(interpolation)을 조합하여 사용합니다. 덕분에 기존의 학습률 스케줄러와 달리, SFO는 학습률을 점진적으로 낮추는 절차가 아예 필요 없습니다.

-SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps`.
+SFO는 RAdam(`schedule_free_radam`), AdamW(`schedule_free_adamw`), SGD(`schedule_free_sgd`) 옵티마이저를 지원합니다. RAdam 스케줄러는 `warmup_steps`나 `warmup_ratio` 설정이 필요하지 않습니다. 

 기본적으로 `lr_scheduler_type="constant"`로 설정하는 것을 권장합니다. 다른 `lr_scheduler_type` 값도 동작할 순 있으나, SFO 옵티마이저와 다른 학습률 스케줄을 함께 사용하면 SFO의 의도된 동작과 성능에 영향을 줄 수 있습니다. 

--- a/docs/source/ko/pr_checks.md
+++ b/docs/source/ko/pr_checks.md
@ -37,7 +37,7 @@ pip install transformers[dev]
 또는 Transformers 저장소 내에 편집 가능한 설치가 필요합니다:

 ```bash
-pip install -e ".[dev]"
+pip install -e .[dev]
 ```

 Transformers의 선택적 종속성 수가 많이 늘어났기 때문에 개발 설치를 실패할 수도 있습니다. 개발 설치가 실패하는 경우, 작업 중인 Deep Learning 프레임워크 (PyTorch, TensorFlow 및/또는 Flax)를 설치하고 다음 명령을 실행하세요.
@ -49,7 +49,7 @@ pip install transformers[quality]
 편집 가능한 설치의 경우는 다음 명령을 실행하세요.

 ```bash
-pip install -e ".[quality]"
+pip install -e .[quality]
 ```


--- a/docs/source/ko/serialization.md
+++ b/docs/source/ko/serialization.md
@ -47,7 +47,7 @@ ONNX 형식으로 내보낸 모델은 다음과 같이 사용할 수 있습니
 🤗 Transformers 모델을 ONNX로 내보내려면 먼저 추가 종속성을 설치하세요:

 ```bash
-pip install optimum-onnx
+pip install optimum[exporters]
 ```

 사용 가능한 모든 인수를 확인하려면 [🤗 Optimum 문서](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli)를 참조하거나 명령줄에서 도움말을 보세요.
@ -123,3 +123,59 @@ CLI 대신에 `optimum.onnxruntime`을 사용하여 프로그래밍 방식으로
 ### 지원되지 않는 아키텍처의 모델 내보내기 [[exporting-a-model-for-an-unsupported-architecture]]

 현재 내보낼 수 없는 모델을 지원하기 위해 기여하려면, 먼저 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview)에서 지원되는지 확인한 후 지원되지 않는 경우에는 [🤗 Optimum에 기여](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)하세요.
+
+### `transformers.onnx`를 사용하여 모델 내보내기 [[exporting-a-model-with-transformersonnx]]
+
+<Tip warning={true}>
+
+`tranformers.onnx`는 더 이상 유지되지 않습니다. 위에서 설명한 대로 🤗 Optimum을 사용하여 모델을 내보내세요. 이 섹션은 향후 버전에서 제거될 예정입니다.
+
+</Tip>
+
+🤗 Transformers 모델을 ONNX로 내보내려면 추가 종속성을 설치하세요:
+
+```bash
+pip install transformers[onnx]
+```
+
+`transformers.onnx` 패키지를 Python 모듈로 사용하여 준비된 구성을 사용하여 체크포인트를 내보냅니다:
+
+```bash
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
+```
+
+이렇게 하면 `--model` 인수에 정의된 체크포인트의 ONNX 그래프가 내보내집니다. 🤗 Hub에서 제공하는 체크포인트나 로컬에 저장된 체크포인트를 전달할 수 있습니다. 결과로 생성된 `model.onnx` 파일은 ONNX 표준을 지원하는 많은 가속기 중 하나에서 실행할 수 있습니다. 예를 들어, 다음과 같이 ONNX Runtime을 사용하여 모델을 로드하고 실행할 수 있습니다:
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # ONNX Runtime expects NumPy arrays as input
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+필요한 출력 이름(예: `["last_hidden_state"]`)은 각 모델의 ONNX 구성을 확인하여 얻을 수 있습니다. 예를 들어, DistilBERT의 경우 다음과 같습니다:
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+Hub의 TensorFlow 체크포인트에 대해서도 동일한 프로세스가 적용됩니다. 예를 들어, 다음과 같이 순수한 TensorFlow 체크포인트를 내보냅니다:
+
+```bash
+python -m transformers.onnx --model=keras-io/transformers-qa onnx/
+```
+
+로컬에 저장된 모델을 내보내려면 모델의 가중치 파일과 토크나이저 파일을 동일한 디렉토리에 저장한 다음, transformers.onnx 패키지의 --model 인수를 원하는 디렉토리로 지정하여 ONNX로 내보냅니다:
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
--- a/docs/source/ko/tasks/audio_classification.md
+++ b/docs/source/ko/tasks/audio_classification.md
@ -221,7 +221,7 @@ MinDS-14 데이터 세트의 샘플링 속도는 8khz이므로(이 정보는 [
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=32,
 ...     num_train_epochs=10,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/ko/tasks/image_classification.md
+++ b/docs/source/ko/tasks/image_classification.md
@ -212,7 +212,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티에
 ...     gradient_accumulation_steps=4,
 ...     per_device_eval_batch_size=16,
 ...     num_train_epochs=3,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/ko/tasks/video_classification.md
+++ b/docs/source/ko/tasks/video_classification.md
@ -357,7 +357,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it
 ...     learning_rate=5e-5,
 ...     per_device_train_batch_size=batch_size,
 ...     per_device_eval_batch_size=batch_size,
-...     warmup_steps=0.1,
+...     warmup_ratio=0.1,
 ...     logging_steps=10,
 ...     load_best_model_at_end=True,
 ...     metric_for_best_model="accuracy",
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@ -107,6 +107,8 @@
      title: 模型
    - local: main_classes/text_generation
      title: 文本生成
+    - local: main_classes/onnx
+      title: ONNX
    - local: main_classes/optimizer_schedules
      title: Optimization
    - local: main_classes/output
--- a/docs/source/zh/main_classes/deepspeed.md
+++ b/docs/source/zh/main_classes/deepspeed.md
@ -1206,7 +1206,7 @@ DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习
 - 通过 `--lr_scheduler_type constant_with_warmup` 实现 `WarmupLR`
 - 通过 `--lr_scheduler_type linear` 实现 `WarmupDecayLR`。这也是 `--lr_scheduler_type` 的默认值，因此，如果不配置调度器，这将是默认配置的调度器。

-如果在配置文件中不配置 `scheduler` 条目，[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 的值来配置其🤗 Transformers 版本。
+如果在配置文件中不配置 `scheduler` 条目，[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 或 `--warmup_ratio` 的值来配置其🤗 Transformers 版本。

 以下是 `WarmupLR` 的自动配置示例：

@ -1227,7 +1227,7 @@ DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习

 - `warmup_min_lr` 的值为 `0`。
 - `warmup_max_lr` 的值为 `--learning_rate`。
- `warmup_num_steps` 的值为 `--warmup_steps`（如果提供）。
+- `warmup_num_steps` 的值为 `--warmup_steps`（如果提供）。否则，将使用 `--warmup_ratio` 乘以训练步骤的数量，并四舍五入。
 - `total_num_steps` 的值为 `--max_steps` 或者如果没有提供，将在运行时根据环境、数据集的大小和其他命令行参数（对于 `WarmupDecayLR` 来说需要）自动推导。

 当然，您可以接管任何或所有的配置值，并自行设置这些值：
--- a/docs/source/zh/main_classes/onnx.md
+++ b/docs/source/zh/main_classes/onnx.md
@ -0,0 +1,45 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 导出 🤗 Transformers 模型到 ONNX
+
+🤗 Transformers提供了一个`transformers.onnx`包，通过利用配置对象，您可以将模型checkpoints转换为ONNX图。
+
+有关更多详细信息，请参阅导出 🤗 Transformers 模型的[指南](../serialization)。
+
+## ONNX Configurations
+
+我们提供了三个抽象类，取决于您希望导出的模型架构类型：
+
+* 基于编码器的模型继承 [`~onnx.config.OnnxConfig`]
+* 基于解码器的模型继承 [`~onnx.config.OnnxConfigWithPast`]
+* 编码器-解码器模型继承 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
+
+### OnnxConfig
+
+[[autodoc]] onnx.config.OnnxConfig
+
+### OnnxConfigWithPast
+
+[[autodoc]] onnx.config.OnnxConfigWithPast
+
+### OnnxSeq2SeqConfigWithPast
+
+[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
+
+## ONNX Features
+
+每个ONNX配置与一组 _特性_ 相关联，使您能够为不同类型的拓扑结构或任务导出模型。
--- a/docs/source/zh/serialization.md
+++ b/docs/source/zh/serialization.md
@ -47,7 +47,7 @@ rendered properly in your Markdown viewer.
 要将 🤗 Transformers 模型导出为 ONNX，首先需要安装额外的依赖项：

 ```bash
-pip install optimum-onnx
+pip install optimum[exporters]
 ```

 请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 以查看所有可用参数，或者在命令行中查看帮助：
@ -117,3 +117,53 @@ optimum-cli export onnx --model local_path --task question-answering distilbert_
 ### 导出尚未支持的架构的模型

 如果你想要为当前无法导出的模型添加支持，请先检查 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview) 是否支持该模型，如果不支持，你可以 [直接为 🤗 Optimum 贡献代码](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)。
+
+### 使用 `transformers.onnx` 导出模型
+
+<Tip warning={true}>
+
+`transformers.onnx` 不再进行维护，请如上所述，使用 🤗 Optimum 导出模型。这部分内容将在未来版本中删除。
+
+</Tip>
+
+要使用 `transformers.onnx` 将 🤗 Transformers 模型导出为 ONNX，请安装额外的依赖项：
+
+```bash
+pip install transformers[onnx]
+```
+
+将 `transformers.onnx` 包作为 Python 模块使用，以使用现成的配置导出检查点：
+
+```bash
+python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/
+```
+
+以上代码将导出由 `--model` 参数定义的检查点的 ONNX 图。传入任何 🤗 Hub 上或者存储与本地的检查点。生成的 `model.onnx` 文件可以在支持 ONNX 标准的众多加速引擎上运行。例如，使用 ONNX Runtime 加载并运行模型，如下所示：
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # ONNX Runtime expects NumPy arrays as input
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+可以通过查看每个模型的 ONNX 配置来获取所需的输出名（例如 `["last_hidden_state"]`）。例如，对于 DistilBERT，可以用以下代码获取输出名称：
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+要导出本地存储的模型，请将模型的权重和分词器文件保存在同一目录中（例如 `local-pt-checkpoint`），然后通过将 `transformers.onnx` 包的 `--model` 参数指向该目录，将其导出为 ONNX：
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
--- a/examples/modular-transformers/modeling_new_task_model.py
+++ b/examples/modular-transformers/modeling_new_task_model.py
@ -125,23 +125,15 @@ def token_type_ids_mask_function(
        # If it's 1 for both query and key/value, we are in an image block
        # NOTE: static cache shape goes beyond input seq length, while token_type_ids.shape[1] == input seq length
        # Since vmap doesn't support `if statement` we workaround it with `torch.where`
-        safe_q_idx = torch.where(q_idx < token_type_ids.shape[1], q_idx, 0)
-        safe_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
-
-        token_type_ids_at_q_idx = token_type_ids[batch_idx, safe_q_idx]
-        token_type_ids_at_q_idx = torch.where(q_idx < token_type_ids.shape[1], token_type_ids_at_q_idx, 0)
-
-        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_kv_idx]
+        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
+        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)

-        image_group_ids_at_q_idx = image_group_ids[batch_idx, safe_q_idx]
-        image_group_ids_at_q_idx = torch.where(q_idx < image_group_ids.shape[1], image_group_ids_at_q_idx, -1)
-
-        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_kv_idx]
+        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)

-        is_image_block = (token_type_ids_at_q_idx == 1) & (token_type_ids_at_kv_idx == 1)
-        same_image_block = image_group_ids_at_q_idx == image_group_ids_at_kv_idx
+        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
+        same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx

        # This is bidirectional attention whenever we are dealing with image tokens
        return is_image_block & same_image_block
--- a/examples/pytorch/audio-classification/README.md
+++ b/examples/pytorch/audio-classification/README.md
@ -41,7 +41,7 @@ python run_audio_classification.py \
    --learning_rate 3e-5 \
    --max_length_seconds 1 \
    --attention_mask False \
-    --warmup_steps 0.1 \
+    --warmup_ratio 0.1 \
    --num_train_epochs 5 \
    --per_device_train_batch_size 32 \
    --gradient_accumulation_steps 4 \
@ -82,7 +82,7 @@ python run_audio_classification.py \
    --learning_rate 3e-4 \
    --max_length_seconds 16 \
    --attention_mask False \
-    --warmup_steps 0.1 \
+    --warmup_ratio 0.1 \
    --num_train_epochs 10 \
    --per_device_train_batch_size 8 \
    --gradient_accumulation_steps 4 \
--- a/examples/pytorch/continuous_batching.py
+++ b/examples/pytorch/continuous_batching.py
@ -44,7 +44,7 @@ def generate_simple(
        "eager": "eager",
        "paged_attention": "eager",  # TODO: this does not work on AMD docker
        "flash_paged": "flash_attention_2",  # TODO: this does not work on AMD docker
-        "kernels-community/flash-attn2": "eager",
+        "kernels-community/flash-attn": "eager",
    }[attn_impl]

    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, dtype=torch.bfloat16, attn_implementation=attn_impl)
@ -187,7 +187,7 @@ if __name__ == "__main__":
    parser.add_argument("--num-blocks", "-n", type=int, default=None)
    parser.add_argument("--max-batch-tokens", "-b", type=int, default=None)

-    parser.add_argument("--attn", type=str, default="kernels-community/flash-attn2", help="Attention implementation")
+    parser.add_argument("--attn", type=str, default="kernels-community/flash-attn", help="Attention implementation")
    parser.add_argument("--matmul-precision", "-mp", type=str, default="high")  # set to "none" to disable
    parser.add_argument("--cuda-graph", "-cg", help="Use cuda graphs", type=str, default=None)
    parser.add_argument("--compile", action="store_true", help="Compile the model using torch.compile")
--- a/examples/pytorch/continuous_batching_simple.py
+++ b/examples/pytorch/continuous_batching_simple.py
@ -31,7 +31,7 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--num-blocks", "-n", type=int, default=None)
    parser.add_argument("--max-batch-tokens", "-b", type=int, default=None)
-    parser.add_argument("--attn", type=str, default="kernels-community/flash-attn2", help="Attention implementation")
+    parser.add_argument("--attn", type=str, default="kernels-community/flash-attn", help="Attention implementation")
    parser.add_argument("--samples", type=int, default=500)
    parser.add_argument("--max-new-tokens", type=int, default=32)

--- a/examples/pytorch/image-pretraining/README.md
+++ b/examples/pytorch/image-pretraining/README.md
@ -165,7 +165,7 @@ python run_mae.py \
    --lr_scheduler_type cosine \
    --weight_decay 0.05 \
    --num_train_epochs 800 \
-    --warmup_steps 0.05 \
+    --warmup_ratio 0.05 \
    --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
    --logging_strategy steps \
--- a/notebooks/README.md
+++ b/notebooks/README.md
@ -33,9 +33,9 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
 | [Quicktour of the library](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)  | A presentation of the various APIs in Transformers |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/quicktour.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/en/transformers_doc/quicktour.ipynb)| |
 | [Summary of the tasks](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)  | How to run the models of the Transformers library task by task |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/task_summary.ipynb)| |
 | [Preprocessing data](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)  | How to use a tokenizer to preprocess your data |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb) | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/preprocessing.ipynb)||
-| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)  | How to use the Trainer to fine-tune a pretrained model |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)|
-| [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)  | The differences between the tokenizers algorithm |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb )|
-| [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)  | How to use the multilingual models of the library |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|
+| [Fine-tuning a pretrained model](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)  | How to use the Trainer to fine-tune a pretrained model |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/training.ipynb)| |
+| [Summary of the tokenizers](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)  | The differences between the tokenizers algorithm |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/tokenizer_summary.ipynb )|
+| [Multilingual models](https://github.com/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)  | How to use the multilingual models of the library |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg)](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/transformers_doc/en/multilingual.ipynb)|

 ### PyTorch Examples

@ -43,14 +43,14 @@ You can open any page of the documentation as a notebook in Colab (there is a bu

 | Notebook     |      Description      |   |   |   |
 |:----------|:-------------|:-------------|:-------------|------:|
-| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)  | How to train and use your very own tokenizer  |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|
-| [Train your language model](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)   | How to easily start using transformers  |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|
+| [Train your tokenizer](https://github.com/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)  | How to train and use your very own tokenizer  |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/tokenizer_training.ipynb)|
+| [Train your language model](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)   | How to easily start using transformers  |[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling_from_scratch.ipynb)|
 | [How to fine-tune a model on text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)| |
-| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)|
-| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)|
-| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)|
-| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)|
-| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)|[![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/translation.ipynb)|
+| [How to fine-tune a model on language modeling](https://github.com/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a causal or masked LM task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)| [![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](http://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)|
+| [How to fine-tune a model on token classification](https://github.com/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on a token classification task (NER, PoS). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)| |
+| [How to fine-tune a model on question answering](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SQUAD. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)| |
+| [How to fine-tune a model on multiple choice](https://github.com/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on SWAG. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)| |
+| [How to fine-tune a model on translation](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on WMT. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/translation.ipynb)| |
 | [How to fine-tune a model on summarization](https://github.com/huggingface/notebooks/blob/main/examples/summarization.ipynb)| Show how to preprocess the data and fine-tune a pretrained model on XSUM. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)| |
 | [How to train a language model from scratch](https://github.com/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| Highlight all the steps to effectively train Transformer model on custom data | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/01_how_to_train.ipynb)| |
 | [How to generate text](https://github.com/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| How to use different decoding methods for language generation with transformers | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/blog/blob/main/notebooks/02_how_to_generate.ipynb)| |
@ -58,16 +58,16 @@ You can open any page of the documentation as a notebook in Colab (there is a bu

 #### Computer Vision[[pytorch-cv]]

-| Notebook                                                                                                                                                                   | Description                                                                                                            |                                                                                                                                                                                                            |   |   |
-|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------|------:|
-| [How to fine-tune a model on image classification (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                   | Show how to preprocess the data using Torchvision and fine-tune any pretrained Vision model on Image Classification    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                 | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)| [![Open in AMD Dev Cloud](https://oneclickamd.ai/static/amd.svg?v=2)](https://oneclickamd.ai/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)|
-| [How to fine-tune a model on image classification (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | Show how to preprocess the data using Albumentations and fine-tune any pretrained Vision model on Image Classification | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)  | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)| |
-| [How to fine-tune a model on image classification (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)                 | Show how to preprocess the data using Kornia and fine-tune any pretrained Vision model on Image Classification         | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)          | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)| |
-| [How to perform zero-shot object detection with OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)          | Show how to perform zero-shot object detection on images with text queries                                             | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| |
-| [How to fine-tune an image captioning model](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                                      | Show how to fine-tune BLIP for image captioning on a custom dataset                                                    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)| |
-| [How to build an image similarity system with Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                            | Show how to build an image similarity system                                                                           | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                     | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)| |
-| [How to fine-tune a SegFormer model on semantic segmentation](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                     | Show how to preprocess the data and fine-tune a pretrained SegFormer model on Semantic Segmentation                    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)| |
-| [How to fine-tune a VideoMAE model on video classification](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb)          | Show how to preprocess the data and fine-tune a pretrained VideoMAE model on Video Classification                      | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)| |
+| Notebook                                                                                                                                                                   | Description                                                                                                            |                                                                                                                                                                                                            |   |
+|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------|:-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------:|
+| [How to fine-tune a model on image classification (Torchvision)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                   | Show how to preprocess the data using Torchvision and fine-tune any pretrained Vision model on Image Classification    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)                 | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)|
+| [How to fine-tune a model on image classification (Albumentations)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb) | Show how to preprocess the data using Albumentations and fine-tune any pretrained Vision model on Image Classification | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)  | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)|
+| [How to fine-tune a model on image classification (Kornia)](https://github.com/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)                 | Show how to preprocess the data using Kornia and fine-tune any pretrained Vision model on Image Classification         | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)          | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)|
+| [How to perform zero-shot object detection with OWL-ViT](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)          | Show how to perform zero-shot object detection on images with text queries                                             | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb)|
+| [How to fine-tune an image captioning model](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                                      | Show how to fine-tune BLIP for image captioning on a custom dataset                                                    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb)|
+| [How to build an image similarity system with Transformers](https://github.com/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                            | Show how to build an image similarity system                                                                           | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)                     | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/image_similarity.ipynb)|
+| [How to fine-tune a SegFormer model on semantic segmentation](https://github.com/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                     | Show how to preprocess the data and fine-tune a pretrained SegFormer model on Semantic Segmentation                    | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/semantic_segmentation.ipynb)|
+| [How to fine-tune a VideoMAE model on video classification](https://github.com/huggingface/notebooks/blob/main/examples/video_classification.ipynb)          | Show how to preprocess the data and fine-tune a pretrained VideoMAE model on Video Classification                      | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)                | [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/video_classification.ipynb)|

 #### Audio[[pytorch-audio]]

--- a/setup.py
+++ b/setup.py
@ -104,7 +104,7 @@ _deps = [
    "deepspeed>=0.9.3",
    "diffusers",
    "dill<0.3.5",
-    "evaluate>=0.4.6",
+    "evaluate>=0.2.0",
    "faiss-cpu",
    "fastapi",
    "filelock",
@ -117,7 +117,6 @@ _deps = [
    "importlib_metadata",
    "ipadic>=1.0.0,<2.0",
    "jinja2>=3.1.0",
-    "jmespath>=1.0.1",
    "kenlm",
    "kernels>=0.10.2,<0.11",
    "librosa",
@ -170,7 +169,7 @@ _deps = [
    "tiktoken",
    "timm<=1.0.19,!=1.0.18",
    "tokenizers>=0.22.0,<=0.23.0",
-    "torch>=2.2",
+    "torch>=2.2,<2.9",
    "torchaudio",
    "torchvision",
    "pyctcdecode>=0.4.0",
@ -295,7 +294,7 @@ extras["num2words"] = deps_list("num2words")
 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
 extras["tiktoken"] = deps_list("tiktoken", "blobfile")
 extras["mistral-common"] = deps_list("mistral-common[opencv]")
-extras["chat_template"] = deps_list("jinja2", "jmespath")
+extras["chat_template"] = deps_list("jinja2")
 extras["testing"] = (
    deps_list(
        "pytest",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -129,6 +129,8 @@ _import_structure = {
    ],
    "loss": [],
    "modelcard": ["ModelCard"],
+    # Models
+    "onnx": [],
    "pipelines": [
        "AudioClassificationPipeline",
        "AutomaticSpeechRecognitionPipeline",
--- a/src/transformers/cli/run.py
+++ b/src/transformers/cli/run.py
@ -51,7 +51,7 @@ def run(
        Optional[str],
        typer.Option(help="Name of the column to use as input. For multi columns input use 'column1,columns2'"),
    ] = None,
-    format: Annotated[FormatEnum, typer.Option(help="Input format to read from", case_sensitive=False)] = "pipe",  # type: ignore
+    format: Annotated[FormatEnum, typer.Option(help="Input format to read from", case_sensitive=False)] = "infer",  # type: ignore
    device: Annotated[
        int, typer.Option(help="Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU.")
    ] = -1,
--- a/src/transformers/cli/serve.py
+++ b/src/transformers/cli/serve.py
@ -377,10 +377,14 @@ class Serve:
                help="Which attention implementation to use; you can run --attn_implementation=flash_attention_2, in which case you must install this manually by running `pip install flash-attn --no-build-isolation`."
            ),
        ] = None,
-        quantization: Annotated[
-            Optional[str],
-            typer.Option(help="Which quantization method to use. choices: 'bnb-4bit', 'bnb-8bit'"),
-        ] = None,
+        load_in_8bit: Annotated[
+            bool, typer.Option(help="Whether to use 8 bit precision for the base model - works only with LoRA.")
+        ] = False,
+        load_in_4bit: Annotated[
+            bool, typer.Option(help="Whether to use 4 bit precision for the base model - works only with LoRA.")
+        ] = False,
+        bnb_4bit_quant_type: Annotated[str, typer.Option(help="Quantization type.")] = "nf4",
+        use_bnb_nested_quant: Annotated[bool, typer.Option(help="Whether to use nested quantization.")] = False,
        host: Annotated[str, typer.Option(help="Interface the server will listen to.")] = "localhost",
        port: Annotated[int, typer.Option(help="Port the server will listen to.")] = 8000,
        model_timeout: Annotated[
@ -420,7 +424,10 @@ class Serve:
        self.dtype = dtype
        self.trust_remote_code = trust_remote_code
        self.attn_implementation = attn_implementation
-        self.quantization = quantization
+        self.load_in_8bit = load_in_8bit
+        self.load_in_4bit = load_in_4bit
+        self.bnb_4bit_quant_type = bnb_4bit_quant_type
+        self.use_bnb_nested_quant = use_bnb_nested_quant
        self.host = host
        self.port = port
        self.model_timeout = model_timeout
@ -1681,20 +1688,22 @@ class Serve:
        Returns:
            `Optional[BitsAndBytesConfig]`: The quantization config.
        """
-        if self.quantization == "bnb-4bit":
+        if self.load_in_4bit:
            quantization_config = BitsAndBytesConfig(
                load_in_4bit=True,
-                bnb_4bit_quant_type="nf4",
-                bnb_4bit_use_double_quant=True,
+                # For consistency with model weights, we use the same value as `dtype`
+                bnb_4bit_compute_dtype=self.dtype,
+                bnb_4bit_quant_type=self.bnb_4bit_quant_type,
+                bnb_4bit_use_double_quant=self.use_bnb_nested_quant,
+                bnb_4bit_quant_storage=self.dtype,
+            )
+        elif self.load_in_8bit:
+            quantization_config = BitsAndBytesConfig(
+                load_in_8bit=True,
            )
-        elif self.quantization == "bnb-8bit":
-            quantization_config = BitsAndBytesConfig(load_in_8bit=True)
        else:
            quantization_config = None

-        if quantization_config is not None:
-            logger.info(f"Quantization applied with the following config: {quantization_config}")
-
        return quantization_config

    def process_model_name(self, model_id: str) -> str:
@ -1741,6 +1750,7 @@ class Serve:
            revision=revision,
            trust_remote_code=self.trust_remote_code,
        )
+
        dtype = self.dtype if self.dtype in ["auto", None] else getattr(torch, self.dtype)
        quantization_config = self.get_quantization_config()

@ -1748,15 +1758,19 @@ class Serve:
            "revision": revision,
            "attn_implementation": self.attn_implementation,
            "dtype": dtype,
-            "device_map": self.device,
+            "device_map": "auto",
            "trust_remote_code": self.trust_remote_code,
-            "quantization_config": quantization_config,
        }
+        if quantization_config is not None:
+            model_kwargs["quantization_config"] = quantization_config

        config = AutoConfig.from_pretrained(model_id, **model_kwargs)
        architecture = getattr(transformers, config.architectures[0])
        model = architecture.from_pretrained(model_id, **model_kwargs)

+        if getattr(model, "hf_device_map", None) is None:
+            model = model.to(self.device)
+
        has_default_max_length = (
            model.generation_config.max_new_tokens is None and model.generation_config.max_length == 20
        )
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@ -14,7 +14,7 @@ deps = {
    "deepspeed": "deepspeed>=0.9.3",
    "diffusers": "diffusers",
    "dill": "dill<0.3.5",
-    "evaluate": "evaluate>=0.4.6",
+    "evaluate": "evaluate>=0.2.0",
    "faiss-cpu": "faiss-cpu",
    "fastapi": "fastapi",
    "filelock": "filelock",
@ -27,7 +27,6 @@ deps = {
    "importlib_metadata": "importlib_metadata",
    "ipadic": "ipadic>=1.0.0,<2.0",
    "jinja2": "jinja2>=3.1.0",
-    "jmespath": "jmespath>=1.0.1",
    "kenlm": "kenlm",
    "kernels": "kernels>=0.10.2,<0.11",
    "librosa": "librosa",
@ -77,7 +76,7 @@ deps = {
    "tiktoken": "tiktoken",
    "timm": "timm<=1.0.19,!=1.0.18",
    "tokenizers": "tokenizers>=0.22.0,<=0.23.0",
-    "torch": "torch>=2.2",
+    "torch": "torch>=2.2,<2.9",
    "torchaudio": "torchaudio",
    "torchvision": "torchvision",
    "pyctcdecode": "pyctcdecode>=0.4.0",
--- a/src/transformers/generation/continuous_batching/requests.py
+++ b/src/transformers/generation/continuous_batching/requests.py
@ -27,6 +27,7 @@ from ...utils.metrics import traced
 logger = logging.getLogger("ContinuousBatchingLogger")


+@staticmethod
 def get_device_and_memory_breakdown() -> tuple[torch.device, int, int, int]:
    if torch.cuda.is_available():
        device = torch.device("cuda")
--- a/src/transformers/image_transforms.py
+++ b/src/transformers/image_transforms.py
@ -442,6 +442,75 @@ def normalize(
    return image


+def unnormalize(
+    image: Union[np.ndarray, "torch.Tensor"],
+    mean: Union[float, Collection[float]],
+    std: Union[float, Collection[float]],
+    data_format: Optional[ChannelDimension] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> np.ndarray:
+    """
+    Inverse of `normalize`:
+
+        image = image * std + mean
+
+    Args:
+        image (`np.ndarray` or `torch.Tensor`):
+            The image to unnormalize.
+        mean (`float` or `Collection[float]`):
+            The mean to use for unnormalization.
+        std (`float` or `Collection[float]`):
+            The standard deviation to use for unnormalization.
+        data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the output image. If unset, will use the inferred format from the input.
+        input_data_format (`ChannelDimension`, *optional*):
+            The channel dimension format of the input image. If unset, will use the inferred format from the input.
+
+    Returns:
+        `np.ndarray`: The unnormalized image.
+    """
+    is_torch_input = isinstance(image, torch.Tensor)
+    if is_torch_input:
+        image = image.detach().cpu().numpy()
+    elif not isinstance(image, np.ndarray):
+        raise TypeError("image must be a numpy array or a torch tensor")
+
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(image)
+
+    if not np.issubdtype(image.dtype, np.floating):
+        image = image.astype(np.float32)
+
+    channel_axis = get_channel_dimension_axis(image, input_data_format=input_data_format)
+    num_channels = image.shape[channel_axis]
+
+    if isinstance(mean, Collection):
+        if len(mean) != num_channels:
+            raise ValueError(f"mean must have {num_channels} elements if it is an iterable, got {len(mean)}")
+    else:
+        mean = [mean] * num_channels
+    mean = np.array(mean, dtype=image.dtype)
+
+    if isinstance(std, Collection):
+        if len(std) != num_channels:
+            raise ValueError(f"std must have {num_channels} elements if it is an iterable, got {len(std)}")
+    else:
+        std = [std] * num_channels
+    std = np.array(std, dtype=image.dtype)
+
+    if input_data_format == ChannelDimension.LAST:
+        image = image * std + mean
+    else:
+        shape = [1] * image.ndim
+        shape[channel_axis] = num_channels
+        mean = mean.reshape(shape)
+        std = std.reshape(shape)
+        image = image * std + mean
+
+    image = to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image
+    return image
+
+
 def center_crop(
    image: np.ndarray,
    size: tuple[int, int],
--- a/src/transformers/integrations/deepspeed.py
+++ b/src/transformers/integrations/deepspeed.py
@ -314,14 +314,13 @@ def _load_state_dict_into_zero3_model(model_to_load, state_dict):
        args = (state_dict, prefix, local_metadata, True, [], [], error_msgs)
        # Parameters of module and children will start with prefix. We can exit early if there are none in this
        # state_dict
-        if is_deepspeed_zero3_enabled():
+        if is_deepspeed_zero3_enabled() and len([key for key in state_dict if key.startswith(prefix)]) > 0:
            import deepspeed

            # In sharded models, each shard has only part of the full state_dict, so only gather
            # parameters that are in the current state_dict.
            named_parameters = dict(module.named_parameters(prefix=prefix[:-1], recurse=False))
-            params_to_gather = [named_parameters[k] for k in named_parameters if k in state_dict]
-
+            params_to_gather = [named_parameters[k] for k in state_dict if k in named_parameters]
            if len(params_to_gather) > 0:
                # because zero3 puts placeholders in model params, this context
                # manager gathers (unpartitions) the params of the current layer, then loads from
--- a/src/transformers/integrations/peft.py
+++ b/src/transformers/integrations/peft.py
@ -628,7 +628,7 @@ def maybe_load_adapters(
    **adapter_kwargs,
 ):
    if pretrained_model_name_or_path is None or not is_peft_available():
-        return None, pretrained_model_name_or_path, adapter_kwargs
+        return None, pretrained_model_name_or_path

    token = download_kwargs.get("token")

@ -651,15 +651,13 @@ def maybe_load_adapters(

    _adapter_model_path = adapter_kwargs.pop("_adapter_model_path", None)

-    token_from_adapter_kwargs = adapter_kwargs.pop("token", None)
-
    if _adapter_model_path is None:
        _adapter_model_path = find_adapter_config_file(
            pretrained_model_name_or_path,
            cache_dir=download_kwargs.get("cache_dir"),
            force_download=bool(download_kwargs.get("force_download", False)),
            proxies=download_kwargs.get("proxies"),
-            token=token or token_from_adapter_kwargs,
+            token=token,
            revision=download_kwargs.get("revision"),
            local_files_only=bool(download_kwargs.get("local_files_only", False)),
            subfolder=download_kwargs.get("subfolder", ""),
@ -672,4 +670,4 @@ def maybe_load_adapters(
            _adapter_model_path = pretrained_model_name_or_path
            pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"]

-    return _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs
+    return _adapter_model_path, pretrained_model_name_or_path
--- a/src/transformers/modelcard.py
+++ b/src/transformers/modelcard.py
@ -752,6 +752,8 @@ def extract_hyperparameters_from_trainer(trainer):
            hyperparameters["optimizer"] = f"Use {optimizer_name} and the args are:\n{optimizer_args}"

    hyperparameters["lr_scheduler_type"] = trainer.args.lr_scheduler_type.value
+    if trainer.args.warmup_ratio != 0.0:
+        hyperparameters["lr_scheduler_warmup_ratio"] = trainer.args.warmup_ratio
    if trainer.args.warmup_steps != 0.0:
        hyperparameters["lr_scheduler_warmup_steps"] = trainer.args.warmup_steps
    if trainer.args.max_steps != -1:
--- a/src/transformers/modeling_flash_attention_utils.py
+++ b/src/transformers/modeling_flash_attention_utils.py
@ -97,7 +97,7 @@ def _lazy_imports(implementation: Optional[str]):
            if flash_attn_varlen_func is None or flash_attn_func is None:
                raise ValueError(
                    f"Could not find the currently requested flash attention implementation at `{implementation}`."
-                    f"Make sure that you request a valid kernel from the hub, e.g. `kernels-community/flash-attn2`."
+                    f"Make sure that you request a valid kernel from the hub, e.g. `kernels-community/flash-attn`."
                )

    return flash_attn_func, flash_attn_varlen_func, pad_input, unpad_input
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@ -2381,7 +2381,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
            and not is_torch_npu_available()
        ):
            if attn_implementation.endswith("2"):
-                applicable_attn_implementation = "kernels-community/flash-attn2"
+                applicable_attn_implementation = "kernels-community/flash-attn"
            else:
                applicable_attn_implementation = "kernels-community/vllm-flash-attn3"

@ -4353,7 +4353,7 @@ class PreTrainedModel(nn.Module, EmbeddingAccessMixin, ModuleUtilsMixin, PushToH
        if adapter_kwargs is None:
            adapter_kwargs = {}

-        _adapter_model_path, pretrained_model_name_or_path, adapter_kwargs = maybe_load_adapters(
+        _adapter_model_path, pretrained_model_name_or_path = maybe_load_adapters(
            pretrained_model_name_or_path,
            download_kwargs_with_commit,
            **adapter_kwargs,
--- a/src/transformers/models/albert/configuration_albert.py
+++ b/src/transformers/models/albert/configuration_albert.py
@ -15,7 +15,11 @@
 # limitations under the License.
 """ALBERT model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig


 class AlbertConfig(PreTrainedConfig):
@ -138,4 +142,21 @@ class AlbertConfig(PreTrainedConfig):
        self.classifier_dropout_prob = classifier_dropout_prob


-__all__ = ["AlbertConfig"]
+# Copied from transformers.models.bert.configuration_bert.BertOnnxConfig with Roberta->Albert
+class AlbertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["AlbertConfig", "AlbertOnnxConfig"]
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@ -121,7 +121,7 @@ else:
            ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")),
            ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")),
            ("lfm2_vl", (None, "Lfm2VlImageProcessorFast")),
-            ("lightglue", ("LightGlueImageProcessor", "LightGlueImageProcessorFast")),
+            ("lightglue", ("LightGlueImageProcessor", None)),
            ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")),
            ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")),
            ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")),
--- a/src/transformers/models/bamba/modeling_bamba.py
+++ b/src/transformers/models/bamba/modeling_bamba.py
@ -486,11 +486,13 @@ def segment_sum(input_tensor):
    return tensor_segsum


+is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
+
+
 def apply_mask_to_padding_states(hidden_states, attention_mask):
    """
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    """
-    # NOTE: attention mask is a 2D boolean tensor
    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
@ -498,9 +500,6 @@ def apply_mask_to_padding_states(hidden_states, attention_mask):
    return hidden_states


-is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update))
-
-
 # Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
 class BambaMixer(nn.Module):
    """
--- a/src/transformers/models/bamba/modular_bamba.py
+++ b/src/transformers/models/bamba/modular_bamba.py
@ -36,7 +36,6 @@ from transformers.models.llama.modeling_llama import (
 )
 from transformers.models.mamba2.modeling_mamba2 import (
    MambaRMSNormGated,
-    apply_mask_to_padding_states,
    pad_tensor_by_size,
    reshape_into_chunks,
    segment_sum,
@ -204,6 +203,17 @@ class BambaRMSNormGated(MambaRMSNormGated):
    pass


+def apply_mask_to_padding_states(hidden_states, attention_mask):
+    """
+    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
+    """
+    if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1:
+        dtype = hidden_states.dtype
+        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
+
+    return hidden_states
+
+
 # Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer
 class BambaMixer(nn.Module):
    """
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@ -1318,7 +1318,7 @@ class BarkFineModel(BarkPreTrainedModel):
    output sound according to specific predefined voice.
    """
 )
-class BarkModel(BarkPreTrainedModel, GenerationMixin):
+class BarkModel(BarkPreTrainedModel):
    config: BarkConfig

    def __init__(self, config):
--- a/src/transformers/models/bart/configuration_bart.py
+++ b/src/transformers/models/bart/configuration_bart.py
@ -15,9 +15,15 @@
 """BART model configuration"""

 import warnings
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any

+from ... import PreTrainedTokenizer
 from ...configuration_utils import PreTrainedConfig
-from ...utils import logging
+from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
+from ...onnx.utils import compute_effective_axis_dimension
+from ...utils import is_torch_available, logging


 logger = logging.get_logger(__name__)
@ -174,4 +180,223 @@ class BartConfig(PreTrainedConfig):
            )


-__all__ = ["BartConfig"]
+class BartOnnxConfig(OnnxSeq2SeqConfigWithPast):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+
+            if self.use_past:
+                common_inputs["decoder_input_ids"] = {0: "batch"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+            else:
+                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
+
+            if self.use_past:
+                self.fill_with_past_key_values_(common_inputs, direction="inputs")
+        elif self.task == "causal-lm":
+            # TODO: figure this case out.
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        else:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
+                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
+                ]
+            )
+
+        return common_inputs
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_outputs = super().outputs
+        else:
+            common_outputs = super(OnnxConfigWithPast, self).outputs
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        return common_outputs
+
+    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        # Generate decoder inputs
+        decoder_seq_length = seq_length if not self.use_past else 1
+        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, decoder_seq_length, is_pair
+        )
+        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
+        common_inputs = dict(**encoder_inputs, **decoder_inputs)
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, encoder_seq_length = common_inputs["input_ids"].shape
+            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
+            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
+            encoder_shape = (
+                batch,
+                num_encoder_attention_heads,
+                encoder_seq_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            decoder_past_length = decoder_seq_length + 3
+            decoder_shape = (
+                batch,
+                num_decoder_attention_heads,
+                decoder_past_length,
+                self._config.hidden_size // num_decoder_attention_heads,
+            )
+
+            common_inputs["decoder_attention_mask"] = torch.cat(
+                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
+            )
+
+            common_inputs["past_key_values"] = []
+            # If the number of encoder and decoder layers are present in the model configuration, both are considered
+            num_encoder_layers, num_decoder_layers = self.num_layers
+            min_num_layers = min(num_encoder_layers, num_decoder_layers)
+            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
+            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
+
+            for _ in range(min_num_layers):
+                common_inputs["past_key_values"].append(
+                    (
+                        torch.zeros(decoder_shape),
+                        torch.zeros(decoder_shape),
+                        torch.zeros(encoder_shape),
+                        torch.zeros(encoder_shape),
+                    )
+                )
+            # TODO: test this.
+            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
+            for _ in range(min_num_layers, max_num_layers):
+                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
+        return common_inputs
+
+    def _generate_dummy_inputs_for_causal_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, seqlen = common_inputs["input_ids"].shape
+            # Not using the same length for past_key_values
+            past_key_values_length = seqlen + 2
+            num_encoder_layers, _ = self.num_layers
+            num_encoder_attention_heads, _ = self.num_attention_heads
+            past_shape = (
+                batch,
+                num_encoder_attention_heads,
+                past_key_values_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+
+            mask_dtype = common_inputs["attention_mask"].dtype
+            common_inputs["attention_mask"] = torch.cat(
+                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+            common_inputs["past_key_values"] = [
+                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
+            ]
+        return common_inputs
+
+    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        # Copied from OnnxConfig.generate_dummy_inputs
+        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
+        common_inputs = dict(tokenizer(dummy_input, return_tensors="pt"))
+        return common_inputs
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        elif self.task == "causal-lm":
+            common_inputs = self._generate_dummy_inputs_for_causal_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+        else:
+            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        return common_inputs
+
+    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
+        if self.task in ["default", "seq2seq-lm"]:
+            flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
+        else:
+            flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
+                flattened_output, name, idx, t
+            )
+
+
+__all__ = ["BartConfig", "BartOnnxConfig"]
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@ -538,12 +538,12 @@ class BartEncoder(BartPreTrainedModel):
        self.max_source_positions = config.max_position_embeddings
        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

+        self.embed_tokens = BartScaledWordEmbedding(
+            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+        )
+
        if embed_tokens is not None:
-            self.embed_tokens = embed_tokens
-        else:
-            self.embed_tokens = BartScaledWordEmbedding(
-                config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
-            )
+            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
@ -682,12 +682,12 @@ class BartDecoder(BartPreTrainedModel):
        self.max_target_positions = config.max_position_embeddings
        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0

+        self.embed_tokens = BartScaledWordEmbedding(
+            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
+        )
+
        if embed_tokens is not None:
-            self.embed_tokens = embed_tokens
-        else:
-            self.embed_tokens = BartScaledWordEmbedding(
-                config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
-            )
+            self.embed_tokens.weight = embed_tokens.weight

        self.embed_positions = BartLearnedPositionalEmbedding(
            config.max_position_embeddings,
--- a/src/transformers/models/beit/configuration_beit.py
+++ b/src/transformers/models/beit/configuration_beit.py
@ -15,8 +15,13 @@
 """BEiT model configuration"""

 import warnings
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version

 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices


@ -204,4 +209,21 @@ class BeitConfig(BackboneConfigMixin, PreTrainedConfig):
        self.reshape_hidden_states = reshape_hidden_states


-__all__ = ["BeitConfig"]
+# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
+class BeitOnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+
+__all__ = ["BeitConfig", "BeitOnnxConfig"]
--- a/src/transformers/models/bert/configuration_bert.py
+++ b/src/transformers/models/bert/configuration_bert.py
@ -15,7 +15,11 @@
 # limitations under the License.
 """BERT model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@ -123,4 +127,20 @@ class BertConfig(PreTrainedConfig):
        self.classifier_dropout = classifier_dropout


-__all__ = ["BertConfig"]
+class BertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+                ("token_type_ids", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["BertConfig", "BertOnnxConfig"]
--- a/src/transformers/models/big_bird/configuration_big_bird.py
+++ b/src/transformers/models/big_bird/configuration_big_bird.py
@ -14,7 +14,11 @@
 # limitations under the License.
 """BigBird model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@ -154,4 +158,19 @@ class BigBirdConfig(PreTrainedConfig):
        self.classifier_dropout = classifier_dropout


-__all__ = ["BigBirdConfig"]
+class BigBirdOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["BigBirdConfig", "BigBirdOnnxConfig"]
--- a/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py
@ -14,8 +14,15 @@
 # limitations under the License.
 """BigBirdPegasus model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any
+
+from ... import PreTrainedTokenizer
 from ...configuration_utils import PreTrainedConfig
-from ...utils import logging
+from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
+from ...onnx.utils import compute_effective_axis_dimension
+from ...utils import is_torch_available, logging


 logger = logging.get_logger(__name__)
@ -179,4 +186,224 @@ class BigBirdPegasusConfig(PreTrainedConfig):
        )


-__all__ = ["BigBirdPegasusConfig"]
+# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig with Bart->BigBirdPegasus
+class BigBirdPegasusOnnxConfig(OnnxSeq2SeqConfigWithPast):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+
+            if self.use_past:
+                common_inputs["decoder_input_ids"] = {0: "batch"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+            else:
+                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
+
+            if self.use_past:
+                self.fill_with_past_key_values_(common_inputs, direction="inputs")
+        elif self.task == "causal-lm":
+            # TODO: figure this case out.
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        else:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
+                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
+                ]
+            )
+
+        return common_inputs
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_outputs = super().outputs
+        else:
+            common_outputs = super(OnnxConfigWithPast, self).outputs
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        return common_outputs
+
+    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        # Generate decoder inputs
+        decoder_seq_length = seq_length if not self.use_past else 1
+        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, decoder_seq_length, is_pair
+        )
+        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
+        common_inputs = dict(**encoder_inputs, **decoder_inputs)
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, encoder_seq_length = common_inputs["input_ids"].shape
+            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
+            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
+            encoder_shape = (
+                batch,
+                num_encoder_attention_heads,
+                encoder_seq_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            decoder_past_length = decoder_seq_length + 3
+            decoder_shape = (
+                batch,
+                num_decoder_attention_heads,
+                decoder_past_length,
+                self._config.hidden_size // num_decoder_attention_heads,
+            )
+
+            common_inputs["decoder_attention_mask"] = torch.cat(
+                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
+            )
+
+            common_inputs["past_key_values"] = []
+            # If the number of encoder and decoder layers are present in the model configuration, both are considered
+            num_encoder_layers, num_decoder_layers = self.num_layers
+            min_num_layers = min(num_encoder_layers, num_decoder_layers)
+            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
+            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
+
+            for _ in range(min_num_layers):
+                common_inputs["past_key_values"].append(
+                    (
+                        torch.zeros(decoder_shape),
+                        torch.zeros(decoder_shape),
+                        torch.zeros(encoder_shape),
+                        torch.zeros(encoder_shape),
+                    )
+                )
+            # TODO: test this.
+            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
+            for _ in range(min_num_layers, max_num_layers):
+                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
+        return common_inputs
+
+    def _generate_dummy_inputs_for_causal_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, seqlen = common_inputs["input_ids"].shape
+            # Not using the same length for past_key_values
+            past_key_values_length = seqlen + 2
+            num_encoder_layers, _ = self.num_layers
+            num_encoder_attention_heads, _ = self.num_attention_heads
+            past_shape = (
+                batch,
+                num_encoder_attention_heads,
+                past_key_values_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+
+            mask_dtype = common_inputs["attention_mask"].dtype
+            common_inputs["attention_mask"] = torch.cat(
+                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+            common_inputs["past_key_values"] = [
+                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
+            ]
+        return common_inputs
+
+    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        # Copied from OnnxConfig.generate_dummy_inputs
+        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
+        common_inputs = dict(tokenizer(dummy_input, return_tensors="pt"))
+        return common_inputs
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        elif self.task == "causal-lm":
+            common_inputs = self._generate_dummy_inputs_for_causal_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+        else:
+            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        return common_inputs
+
+    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
+        if self.task in ["default", "seq2seq-lm"]:
+            flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
+        else:
+            flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
+                flattened_output, name, idx, t
+            )
+
+
+__all__ = ["BigBirdPegasusConfig", "BigBirdPegasusOnnxConfig"]
--- a/src/transformers/models/blenderbot/configuration_blenderbot.py
+++ b/src/transformers/models/blenderbot/configuration_blenderbot.py
@ -14,7 +14,15 @@
 # limitations under the License.
 """Blenderbot model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any
+
+from ... import PreTrainedTokenizer
 from ...configuration_utils import PreTrainedConfig
+from ...file_utils import is_torch_available
+from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
+from ...onnx.utils import compute_effective_axis_dimension
 from ...utils import logging


@ -158,4 +166,227 @@ class BlenderbotConfig(PreTrainedConfig):
        )


-__all__ = ["BlenderbotConfig"]
+class BlenderbotOnnxConfig(OnnxSeq2SeqConfigWithPast):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+            if self.use_past:
+                common_inputs["decoder_input_ids"] = {0: "batch"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+            else:
+                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
+            if self.use_past:
+                self.fill_with_past_key_values_(common_inputs, direction="inputs")
+        elif self.task == "causal-lm":
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+            if self.use_past:
+                _, num_decoder_layers = self.num_layers
+                for i in range(num_decoder_layers):
+                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        else:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
+                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
+                ]
+            )
+
+        return common_inputs
+
+    @property
+    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.outputs
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_outputs = super().outputs
+        else:
+            common_outputs = super(OnnxConfigWithPast, self).outputs
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        return common_outputs
+
+    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+        # Generate decoder inputs
+        decoder_seq_length = seq_length if not self.use_past else 1
+        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, decoder_seq_length, is_pair
+        )
+        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
+        common_inputs = dict(**encoder_inputs, **decoder_inputs)
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, encoder_seq_length = common_inputs["input_ids"].shape
+            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
+            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
+            encoder_shape = (
+                batch,
+                num_encoder_attention_heads,
+                encoder_seq_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            decoder_past_length = decoder_seq_length
+            decoder_shape = (
+                batch,
+                num_decoder_attention_heads,
+                decoder_past_length,
+                self._config.hidden_size // num_decoder_attention_heads,
+            )
+            common_inputs["decoder_attention_mask"] = torch.cat(
+                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
+            )
+            common_inputs["past_key_values"] = []
+            _, num_decoder_layers = self.num_layers
+
+            for _ in range(num_decoder_layers):
+                common_inputs["past_key_values"].append(
+                    (
+                        torch.zeros(decoder_shape),
+                        torch.zeros(decoder_shape),
+                        torch.zeros(encoder_shape),
+                        torch.zeros(encoder_shape),
+                    )
+                )
+        return common_inputs
+
+    def _generate_dummy_inputs_for_causal_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, seqlen = common_inputs["input_ids"].shape
+            past_key_values_length = seqlen
+            _, num_decoder_layers = self.num_layers
+            num_encoder_attention_heads, _ = self.num_attention_heads
+            past_shape = (
+                batch,
+                num_encoder_attention_heads,
+                past_key_values_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            mask_dtype = common_inputs["attention_mask"].dtype
+            common_inputs["attention_mask"] = torch.cat(
+                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+            common_inputs["past_key_values"] = [
+                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_decoder_layers)
+            ]
+        return common_inputs
+
+    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering
+    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        # Copied from OnnxConfig.generate_dummy_inputs
+        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
+        common_inputs = dict(tokenizer(dummy_input, return_tensors="pt"))
+        return common_inputs
+
+    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.generate_dummy_inputs
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        elif self.task == "causal-lm":
+            common_inputs = self._generate_dummy_inputs_for_causal_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+        else:
+            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        return common_inputs
+
+    # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._flatten_past_key_values_
+    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
+        if self.task in ["default", "seq2seq-lm"]:
+            flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
+        else:
+            flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
+                flattened_output, name, idx, t
+            )
+
+    def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str):
+        if direction not in ["inputs", "outputs"]:
+            raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given')
+
+        name = "past_key_values" if direction == "inputs" else "present"
+        _, num_decoder_layers = self.num_layers
+
+        encoder_sequence = "past_encoder_sequence"
+        decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence"
+
+        for i in range(num_decoder_layers):
+            inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence}
+            inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence}
+            inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence}
+            inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence}
+
+
+__all__ = ["BlenderbotConfig", "BlenderbotOnnxConfig"]
--- a/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/configuration_blenderbot_small.py
@ -14,7 +14,15 @@
 # limitations under the License.
 """BlenderbotSmall model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any
+
+from ... import PreTrainedTokenizer
 from ...configuration_utils import PreTrainedConfig
+from ...file_utils import is_torch_available
+from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast
+from ...onnx.utils import compute_effective_axis_dimension
 from ...utils import logging


@ -156,4 +164,224 @@ class BlenderbotSmallConfig(PreTrainedConfig):
        )


-__all__ = ["BlenderbotSmallConfig"]
+# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig with Bart->BlenderbotSmall
+class BlenderbotSmallOnnxConfig(OnnxSeq2SeqConfigWithPast):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+
+            if self.use_past:
+                common_inputs["decoder_input_ids"] = {0: "batch"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"}
+            else:
+                common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"}
+                common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"}
+
+            if self.use_past:
+                self.fill_with_past_key_values_(common_inputs, direction="inputs")
+        elif self.task == "causal-lm":
+            # TODO: figure this case out.
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                ]
+            )
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        else:
+            common_inputs = OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "encoder_sequence"}),
+                    ("attention_mask", {0: "batch", 1: "encoder_sequence"}),
+                    ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}),
+                    ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}),
+                ]
+            )
+
+        return common_inputs
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_outputs = super().outputs
+        else:
+            common_outputs = super(OnnxConfigWithPast, self).outputs
+            if self.use_past:
+                num_encoder_layers, _ = self.num_layers
+                for i in range(num_encoder_layers):
+                    common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"}
+                    common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"}
+        return common_outputs
+
+    def _generate_dummy_inputs_for_default_and_seq2seq_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        # Generate decoder inputs
+        decoder_seq_length = seq_length if not self.use_past else 1
+        decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, decoder_seq_length, is_pair
+        )
+        decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()}
+        common_inputs = dict(**encoder_inputs, **decoder_inputs)
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, encoder_seq_length = common_inputs["input_ids"].shape
+            decoder_seq_length = common_inputs["decoder_input_ids"].shape[1]
+            num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads
+            encoder_shape = (
+                batch,
+                num_encoder_attention_heads,
+                encoder_seq_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+            decoder_past_length = decoder_seq_length + 3
+            decoder_shape = (
+                batch,
+                num_decoder_attention_heads,
+                decoder_past_length,
+                self._config.hidden_size // num_decoder_attention_heads,
+            )
+
+            common_inputs["decoder_attention_mask"] = torch.cat(
+                [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1
+            )
+
+            common_inputs["past_key_values"] = []
+            # If the number of encoder and decoder layers are present in the model configuration, both are considered
+            num_encoder_layers, num_decoder_layers = self.num_layers
+            min_num_layers = min(num_encoder_layers, num_decoder_layers)
+            max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers
+            remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder"
+
+            for _ in range(min_num_layers):
+                common_inputs["past_key_values"].append(
+                    (
+                        torch.zeros(decoder_shape),
+                        torch.zeros(decoder_shape),
+                        torch.zeros(encoder_shape),
+                        torch.zeros(encoder_shape),
+                    )
+                )
+            # TODO: test this.
+            shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape
+            for _ in range(min_num_layers, max_num_layers):
+                common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape)))
+        return common_inputs
+
+    def _generate_dummy_inputs_for_causal_lm(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+            tokenizer, batch_size, seq_length, is_pair
+        )
+
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+            batch, seqlen = common_inputs["input_ids"].shape
+            # Not using the same length for past_key_values
+            past_key_values_length = seqlen + 2
+            num_encoder_layers, _ = self.num_layers
+            num_encoder_attention_heads, _ = self.num_attention_heads
+            past_shape = (
+                batch,
+                num_encoder_attention_heads,
+                past_key_values_length,
+                self._config.hidden_size // num_encoder_attention_heads,
+            )
+
+            mask_dtype = common_inputs["attention_mask"].dtype
+            common_inputs["attention_mask"] = torch.cat(
+                [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+            common_inputs["past_key_values"] = [
+                (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers)
+            ]
+        return common_inputs
+
+    def _generate_dummy_inputs_for_sequence_classification_and_question_answering(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        # Copied from OnnxConfig.generate_dummy_inputs
+        # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity.
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size
+        common_inputs = dict(tokenizer(dummy_input, return_tensors="pt"))
+        return common_inputs
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        if self.task in ["default", "seq2seq-lm"]:
+            common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        elif self.task == "causal-lm":
+            common_inputs = self._generate_dummy_inputs_for_causal_lm(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+        else:
+            common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering(
+                tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+            )
+
+        return common_inputs
+
+    def _flatten_past_key_values_(self, flattened_output, name, idx, t):
+        if self.task in ["default", "seq2seq-lm"]:
+            flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t)
+        else:
+            flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_(
+                flattened_output, name, idx, t
+            )
+
+
+__all__ = ["BlenderbotSmallConfig", "BlenderbotSmallOnnxConfig"]
--- a/src/transformers/models/bloom/configuration_bloom.py
+++ b/src/transformers/models/bloom/configuration_bloom.py
@ -14,8 +14,19 @@
 # limitations under the License.
 """Bloom configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from packaging import version
+
+
+if TYPE_CHECKING:
+    from ... import PreTrainedTokenizer
+
 from ...configuration_utils import PreTrainedConfig
-from ...utils import logging
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import is_torch_available, logging


 logger = logging.get_logger(__name__)
@ -131,4 +142,99 @@ class BloomConfig(PreTrainedConfig):
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)


-__all__ = ["BloomConfig"]
+class BloomOnnxConfig(OnnxConfigWithPast):
+    torch_onnx_minimum_version = version.parse("1.12")
+
+    def __init__(
+        self,
+        config: PreTrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            # BLOOM stores values on dynamic axis 2. For more details see: https://github.com/huggingface/transformers/pull/18344
+            self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True)
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-3
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: "PreTrainedTokenizer",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer,
+            batch_size=batch_size,
+            seq_length=seq_length,
+            is_pair=is_pair,
+        )
+
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                head_dim = self._config.hidden_size // self.num_attention_heads
+                past_key_shape = (
+                    batch * self.num_attention_heads,
+                    head_dim,
+                    past_key_values_length,
+                )
+                past_value_shape = (
+                    batch * self.num_attention_heads,
+                    past_key_values_length,
+                    head_dim,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            mask_dtype = ordered_inputs["attention_mask"].dtype
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
+
+
+__all__ = ["BloomConfig", "BloomOnnxConfig"]
--- a/src/transformers/models/camembert/configuration_camembert.py
+++ b/src/transformers/models/camembert/configuration_camembert.py
@ -15,7 +15,11 @@
 # limitations under the License.
 """CamemBERT configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@ -125,4 +129,19 @@ class CamembertConfig(PreTrainedConfig):
        self.classifier_dropout = classifier_dropout


-__all__ = ["CamembertConfig"]
+class CamembertOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["CamembertConfig", "CamembertOnnxConfig"]
--- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py
+++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py
@ -14,7 +14,16 @@
 # limitations under the License.
 """Chinese-CLIP model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@ -359,4 +368,52 @@ class ChineseCLIPConfig(PreTrainedConfig):
        super().__init__(**kwargs)


-__all__ = ["ChineseCLIPConfig", "ChineseCLIPTextConfig", "ChineseCLIPVisionConfig"]
+class ChineseCLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+    ) -> Mapping[str, Any]:
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer,
+            batch_size=batch_size,
+            seq_length=seq_length,
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor,
+            batch_size=batch_size,
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
+
+
+__all__ = ["ChineseCLIPConfig", "ChineseCLIPOnnxConfig", "ChineseCLIPTextConfig", "ChineseCLIPVisionConfig"]
--- a/src/transformers/models/clip/configuration_clip.py
+++ b/src/transformers/models/clip/configuration_clip.py
@ -14,7 +14,16 @@
 # limitations under the License.
 """CLIP model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfig
 from ...utils import logging


@ -355,4 +364,52 @@ class CLIPConfig(PreTrainedConfig):
        super().__init__(**kwargs)


-__all__ = ["CLIPConfig", "CLIPTextConfig", "CLIPVisionConfig"]
+class CLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+    ) -> Mapping[str, Any]:
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer,
+            batch_size=batch_size,
+            seq_length=seq_length,
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor,
+            batch_size=batch_size,
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
+
+
+__all__ = ["CLIPConfig", "CLIPOnnxConfig", "CLIPTextConfig", "CLIPVisionConfig"]
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@ -22,21 +22,11 @@ import torch
 from torch import nn

 from ...activations import ACT2FN
-from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
-from ...processing_utils import Unpack
-from ...utils import (
-    ModelOutput,
-    TransformersKwargs,
-    auto_docstring,
-    can_return_tuple,
-    filter_out_non_signature_kwargs,
-    logging,
-    torch_int,
-)
-from ...utils.generic import check_model_inputs
+from ...utils import ModelOutput, auto_docstring, can_return_tuple, filter_out_non_signature_kwargs, logging, torch_int
 from .configuration_clip import CLIPConfig, CLIPTextConfig, CLIPVisionConfig


@ -270,7 +260,8 @@ def eager_attention_forward(
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
-    **kwargs: Unpack[TransformersKwargs],
+    output_attentions: bool = True,
+    **kwargs,
 ):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
@ -280,6 +271,8 @@ def eager_attention_forward(

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()
+    if not output_attentions:
+        attn_weights = None
    return attn_output, attn_weights


@ -310,7 +303,8 @@ class CLIPAttention(nn.Module):
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

@ -323,6 +317,15 @@ class CLIPAttention(nn.Module):
        queries = queries.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, -1, self.head_dim).transpose(1, 2)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
+        if self.config._attn_implementation == "flash_attention_2":
+            self.is_causal = causal_attention_mask is not None
+        else:
+            if attention_mask is not None and causal_attention_mask is not None:
+                attention_mask = attention_mask + causal_attention_mask
+            elif causal_attention_mask is not None:
+                attention_mask = causal_attention_mask

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
@ -334,14 +337,17 @@ class CLIPAttention(nn.Module):
            keys,
            values,
            attention_mask,
+            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
-            **kwargs,
+            output_attentions=output_attentions,
        )

-        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

+        if not output_attentions:
+            attn_weights = None
        return attn_output, attn_weights


@ -373,15 +379,27 @@ class CLIPEncoderLayer(GradientCheckpointingLayer):
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
-        **kwargs: Unpack[TransformersKwargs],
-    ) -> torch.FloatTensor:
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
-        hidden_states, _ = self.self_attn(
+        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
-            **kwargs,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

@ -390,7 +408,12 @@ class CLIPEncoderLayer(GradientCheckpointingLayer):
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

-        return hidden_states
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs


@auto_docstring
@ -403,10 +426,6 @@ class CLIPPreTrainedModel(PreTrainedModel):
    _supports_flash_attn = True
    _supports_flex_attn = True
    _supports_attention_backend = True
-    _can_record_outputs = {
-        "hidden_states": CLIPEncoderLayer,
-        "attentions": CLIPAttention,
-    }

    def _init_weights(self, module):
        """Initialize the weights"""
@ -484,7 +503,9 @@ class CLIPEncoder(nn.Module):
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
@ -499,17 +520,53 @@ class CLIPEncoder(nn.Module):
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
        hidden_states = inputs_embeds
-        for encoder_layer in self.layers:
-            hidden_states = encoder_layer(
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
-                **kwargs,
+                causal_attention_mask,
+                output_attentions=output_attentions,
            )

+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
        return BaseModelOutput(
            last_hidden_state=hidden_states,
+            hidden_states=encoder_states,
+            attentions=all_attentions,
        )


@ -531,8 +588,14 @@ class CLIPTextTransformer(nn.Module):
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

@ -541,20 +604,23 @@ class CLIPTextTransformer(nn.Module):

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

-        attention_mask = create_causal_mask(
-            config=self.config,
-            input_embeds=hidden_states,
-            attention_mask=attention_mask,
-            cache_position=torch.arange(hidden_states.shape[1], device=hidden_states.device),
-            past_key_values=None,
+        # CLIP's text model uses causal mask, prepare it here.
+        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
+        causal_attention_mask = _create_4d_causal_attention_mask(
+            input_shape, hidden_states.dtype, device=hidden_states.device
        )

-        kwargs.pop("is_causal", None)
+        # expand attention_mask
+        if attention_mask is not None and self.config._attn_implementation != "flash_attention_2":
+            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
-            is_causal=True,
-            **kwargs,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
@ -585,6 +651,8 @@ class CLIPTextTransformer(nn.Module):
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
        )


@ -598,6 +666,7 @@ class CLIPTextModel(CLIPPreTrainedModel):
    input_modalities = "text"

    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]
+    _supports_flash_attn = False  # mask creation only accounts for sdpa/eager

    def __init__(self, config: CLIPTextConfig):
        super().__init__(config)
@ -611,14 +680,15 @@ class CLIPTextModel(CLIPPreTrainedModel):
    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

-    @check_model_inputs(tie_last_hidden_states=False)
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:
@ -640,7 +710,8 @@ class CLIPTextModel(CLIPPreTrainedModel):
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )


@ -659,9 +730,15 @@ class CLIPVisionTransformer(nn.Module):
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
-        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

@ -670,7 +747,8 @@ class CLIPVisionTransformer(nn.Module):

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
@ -680,6 +758,8 @@ class CLIPVisionTransformer(nn.Module):
        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
        )


@ -703,13 +783,14 @@ class CLIPVisionModel(CLIPPreTrainedModel):
    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

-    @check_model_inputs(tie_last_hidden_states=False)
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
-        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Example:
@ -734,8 +815,9 @@ class CLIPVisionModel(CLIPPreTrainedModel):

        return self.vision_model(
            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
-            **kwargs,
        )


@ -743,6 +825,7 @@ class CLIPVisionModel(CLIPPreTrainedModel):
 class CLIPModel(CLIPPreTrainedModel):
    config: CLIPConfig
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer", "CLIPVisionEmbeddings"]
+    _supports_flash_attn = False  # mask creation only accounts for sdpa/eager

    def __init__(self, config: CLIPConfig):
        super().__init__(config)
@ -864,8 +947,9 @@ class CLIPModel(CLIPPreTrainedModel):
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
-        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPOutput:
        r"""
        return_loss (`bool`, *optional*):
@ -893,17 +977,25 @@ class CLIPModel(CLIPPreTrainedModel):
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
+        # Use CLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
-            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
@ -942,6 +1034,7 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    config: CLIPTextConfig
    input_modalities = "text"

+    _supports_flash_attn = False
    _no_split_modules = ["CLIPTextEmbeddings", "CLIPEncoderLayer"]

    def __init__(self, config: CLIPTextConfig):
@ -961,14 +1054,15 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

-    @check_model_inputs(tie_last_hidden_states=False)
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> CLIPTextModelOutput:
        r"""
        Examples:
@ -991,7 +1085,8 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )
        pooled_output = text_outputs.pooler_output
        text_embeds = self.text_projection(pooled_output)
@ -999,6 +1094,8 @@ class CLIPTextModelWithProjection(CLIPPreTrainedModel):
        return CLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
+            hidden_states=text_outputs.hidden_states,
+            attentions=text_outputs.attentions,
        )


@ -1022,13 +1119,14 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

-    @check_model_inputs(tie_last_hidden_states=False)
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
-        **kwargs: Unpack[TransformersKwargs],
    ) -> CLIPVisionModelOutput:
        r"""
        Examples:
@ -1053,8 +1151,9 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
-            **kwargs,
        )
        pooled_output = vision_outputs.pooler_output
        image_embeds = self.visual_projection(pooled_output)
@ -1062,6 +1161,8 @@ class CLIPVisionModelWithProjection(CLIPPreTrainedModel):
        return CLIPVisionModelOutput(
            image_embeds=image_embeds,
            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
        )


@ -1090,13 +1191,14 @@ class CLIPForImageClassification(CLIPPreTrainedModel):
        # Initialize weights and apply final processing
        self.post_init()

-    @check_model_inputs(tie_last_hidden_states=False)
+    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
-        **kwargs: Unpack[TransformersKwargs],
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
@ -1104,14 +1206,22 @@ class CLIPForImageClassification(CLIPPreTrainedModel):
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
-            **kwargs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

+        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output[:, 1:, :], dim=1)
+        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
@ -1121,6 +1231,8 @@ class CLIPForImageClassification(CLIPPreTrainedModel):
        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
        )


--- a/src/transformers/models/codegen/configuration_codegen.py
+++ b/src/transformers/models/codegen/configuration_codegen.py
@ -14,7 +14,13 @@
 # limitations under the License.
 """CodeGen model configuration"""

+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, is_torch_available
 from ...configuration_utils import PreTrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
 from ...utils import logging


@ -140,4 +146,85 @@ class CodeGenConfig(PreTrainedConfig):
        )


-__all__ = ["CodeGenConfig"]
+# Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig with GPT2->CodeGen
+class CodeGenOnnxConfig(OnnxConfigWithPast):
+    def __init__(
+        self,
+        config: PreTrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair
+        )
+
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                past_shape = (
+                    batch,
+                    self.num_attention_heads,
+                    past_key_values_length,
+                    self._config.hidden_size // self.num_attention_heads,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            mask_dtype = ordered_inputs["attention_mask"].dtype
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
+
+
+__all__ = ["CodeGenConfig", "CodeGenOnnxConfig"]
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Pablo Montalvo	557ecce22e	push working qwen vl viz	2025-10-21 18:04:08 +02:00
Pablo Montalvo	f3b187027a	fixup	2025-10-21 16:33:01 +02:00
Pablo Montalvo	2767a59df9	biig update for ragged inputs why not	2025-10-21 16:17:45 +02:00
Pablo Montalvo	c9f1003c70	Merge branch 'main' into vision_visualizer	2025-10-21 14:19:35 +02:00
molbap	b356fce1da	solve unequal cropping	2025-08-11 19:20:28 +02:00
molbap	af7f75e682	use existing methods, add default image	2025-08-11 16:44:06 +02:00
molbap	34ba5909a2	add an unnormalize image method	2025-08-11 16:43:27 +02:00
Pablo Montalvo	fbec904fb0	Merge branch 'main' into vision_visualizer	2025-08-06 19:19:09 +02:00
molbap	a1263dfe7b	fixup	2025-08-06 19:17:38 +02:00
molbap	1878d6c4ff	add captions and better tiling detection	2025-08-06 19:16:14 +02:00
molbap	a6a18efe53	better namings	2025-08-05 17:30:05 +02:00
Pablo Montalvo	e581d2f2ce	fixup	2025-07-25 08:02:39 +00:00
Pablo Montalvo	1f6822d114	move processor visualizer	2025-07-25 07:58:35 +00:00
Pablo Montalvo	edb70ae15c	Merge branch 'main' into vision_visualizer	2025-07-24 12:50:27 +00:00
Pablo Montalvo	27bc371bea	Merge branch 'main' into vision_visualizer	2025-07-22 13:01:45 +02:00
Pablo Montalvo	58c619e809	draft the vision visualizer	2025-03-21 18:53:04 +01:00