Compare commits


2 Commits

SHA1 Message Date
756c2edae2 run_ci_manually 2024-11-15 22:01:07 +01:00
3441993d3b run_ci_manually 2024-11-15 19:12:25 +01:00
159 changed files with 1977 additions and 7598 deletions

View File

@@ -3,7 +3,7 @@ name: Build docker images (scheduled)
on:
push:
branches:
- test_docker_run_quantization
- build_ci_docker_image*
repository_dispatch:
workflow_call:
inputs:
@@ -18,341 +18,341 @@ concurrency:
cancel-in-progress: false
jobs:
# latest-docker:
# name: "Latest PyTorch + TensorFlow [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-all-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-all-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-all-latest-gpu-push-ci
latest-docker:
name: "Latest PyTorch + TensorFlow [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-all-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-all-latest-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-torch-deepspeed-docker:
# name: "Latest PyTorch + DeepSpeed"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
latest-torch-deepspeed-docker:
name: "Latest PyTorch + DeepSpeed"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
# latest-torch-deepspeed-docker-for-push-ci-daily-build:
# name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-latest-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
# Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
latest-torch-deepspeed-docker-for-push-ci-daily-build:
name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-latest-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# doc-builder:
# name: "Doc builder"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-doc-builder
# push: true
# tags: huggingface/transformers-doc-builder
doc-builder:
name: "Doc builder"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-doc-builder
push: true
tags: huggingface/transformers-doc-builder
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-doc-builder docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-doc-builder docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch:
# name: "Latest PyTorch [dev]"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-gpu
latest-pytorch:
name: "Latest PyTorch [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-gpu
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-gpu docker build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch-amd:
# name: "Latest PyTorch (AMD) [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-amd-gpu-push-ci
latest-pytorch-amd:
name: "Latest PyTorch (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-amd-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-tensorflow:
# name: "Latest TensorFlow [dev]"
# # Push CI doesn't need this image
# if: inputs.image_postfix != '-push-ci'
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-tensorflow-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-tensorflow-gpu
latest-tensorflow:
name: "Latest TensorFlow [dev]"
# Push CI doesn't need this image
if: inputs.image_postfix != '-push-ci'
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-tensorflow-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-tensorflow-gpu
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
# latest-pytorch-deepspeed-amd:
# name: "PyTorch + DeepSpeed (AMD) [dev]"
# runs-on:
# group: aws-general-8-plus
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v3
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v3
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# # Push CI images still need to be re-built daily
# -
# name: Build and push (for Push CI) on a daily basis
# # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# # The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
# if: inputs.image_postfix != '-push-ci'
# uses: docker/build-push-action@v5
# with:
# context: ./docker/transformers-pytorch-deepspeed-amd-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
latest-pytorch-deepspeed-amd:
name: "PyTorch + DeepSpeed (AMD) [dev]"
runs-on:
group: aws-general-8-plus
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
# Push CI images still need to be re-built daily
-
name: Build and push (for Push CI) on a daily basis
# This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
# The latter case is useful for manual image building for debugging purposes. Use another tag in this case!
if: inputs.image_postfix != '-push-ci'
uses: docker/build-push-action@v5
with:
context: ./docker/transformers-pytorch-deepspeed-amd-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
# - name: Post to Slack
# if: always()
# uses: huggingface/hf-workflows/.github/actions/post-slack@main
# with:
# slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
# title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
# status: ${{ job.status }}
# slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
latest-quantization-torch-docker:
name: "Latest Pytorch + Quantization [dev]"

View File

@@ -7,7 +7,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- run_scheduled_ci*
- run_ci_manually
jobs:
model-ci:
@@ -20,59 +20,3 @@ jobs:
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
secrets: inherit
torch-pipeline:
name: Torch pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
runner: daily-ci
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
secrets: inherit
tf-pipeline:
name: TF pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_tf_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-tf"
runner: daily-ci
docker: huggingface/transformers-tensorflow-gpu
ci_event: Daily CI
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_examples_gpu
slack_report_channel: "#transformers-ci-daily-examples"
runner: daily-ci
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-deepspeed"
runner: daily-ci
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
secrets: inherit
quantization-ci:
name: Quantization CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
runner: daily-ci
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
secrets: inherit

View File

@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG`s are mainly used to specify the versions explicitly & directly in this docker file, and are not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.5.1'
ARG PYTORCH='2.4.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

View File

@@ -218,8 +218,6 @@
title: CPU inference
- local: perf_infer_gpu_one
title: GPU inference
- local: perf_infer_gpu_multi
title: Multi-GPU inference
title: Optimizing inference
- local: big_models
title: Instantiate a big model
@@ -516,8 +514,6 @@
title: Nyströmformer
- local: model_doc/olmo
title: OLMo
- local: model_doc/olmo_1124
title: OLMo November 2024
- local: model_doc/olmoe
title: OLMoE
- local: model_doc/open-llama

View File

@@ -416,6 +416,16 @@ Assisted decoding assumes the main and assistant models have the same tokenizer,
Currently, only greedy search and sampling are supported with assisted decoding, and assisted decoding doesn't support batched inputs.
To learn more about assisted decoding, check [this blog post](https://huggingface.co/blog/assisted-generation).
#### Universal Assisted Decoding
Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
to ensure the new tokens include the correct prompt suffix.
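To make the re-encoding step concrete, here is a rough sketch of the idea (an illustration only, not the actual implementation; the helper names are made up):
```python
from difflib import SequenceMatcher

def reencode(ids, source_tokenizer, target_tokenizer):
    """Re-encode token ids from one tokenizer's vocabulary into another's."""
    text = source_tokenizer.decode(ids, skip_special_tokens=True)
    return target_tokenizer.encode(text, add_special_tokens=False)

def align_on_longest_match(source_ids, target_ids):
    """Keep the target tokens covered by the longest common block, so the
    candidate sequence continues from the correct prompt suffix."""
    match = SequenceMatcher(None, source_ids, target_ids).find_longest_match(
        0, len(source_ids), 0, len(target_ids)
    )
    return target_ids[match.b : match.b + match.size]
```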
To enable assisted decoding, set the `assistant_model` argument with a model.
```python
@@ -435,6 +445,26 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
If the main and assistant models have different tokenizers, use Universal Assisted Decoding.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "google/gemma-2-9b"
>>> assistant_checkpoint = "double7/vicuna-68m"
>>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint)
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness,
just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.
@@ -456,63 +486,9 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
['Alice and Bob, a couple of friends of mine, who are both in the same office as']
```
#### Universal Assisted Decoding
Universal Assisted Decoding (UAD) adds support for main and assistant models with different tokenizers.
To use it, simply pass the tokenizers using the `tokenizer` and `assistant_tokenizer` arguments (see below).
Internally, the main model input tokens are re-encoded into assistant model tokens, then candidate tokens are generated in the assistant encoding, which are
in turn re-encoded into main model candidate tokens. Validation then proceeds as explained above.
The re-encoding steps involve decoding token ids into text and then encoding the text using a different tokenizer.
Since re-encoding the tokens may result in tokenization discrepancies, UAD finds the longest common subsequence between the source and target encodings,
to ensure the new tokens include the correct prompt suffix.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "google/gemma-2-9b"
>>> assistant_checkpoint = "double7/vicuna-68m"
>>> assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_checkpoint)
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
#### Prompt Lookup
Alternatively, you can also set `prompt_lookup_num_tokens` to trigger n-gram-based assisted decoding, as opposed
to model-based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
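A minimal usage sketch (the checkpoint is illustrative; any decoder-only model works):
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer

>>> checkpoint = "gpt2"  # illustrative checkpoint
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> inputs = tokenizer("Alice and Bob", return_tensors="pt")
>>> outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, max_new_tokens=20)
```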
#### Self-Speculative Decoding
An LLM can be trained to also use its language modeling head with earlier hidden states as input, effectively
skipping layers to yield a lower-quality output -- a technique called early exiting.
We use the lower-quality early exit output as an assistant output, and apply self-speculation to fix the output using the remaining layers. The final generation of that self-speculative solution is the same (or has the same distribution) as the original model's generation.
If the model you're using was trained to do early exit, you can pass
`assistant_early_exit` (integer). In this case, the assistant model will be the same model but exiting early, hence the
"self-speculative" name. Because the assistant model is a portion of the target model, caches and weights can be shared, which results in lower memory requirements. As in other assisted generation methods, the final generated result has the same quality as if no assistant had been used.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> prompt = "Alice and Bob"
>>> checkpoint = "facebook/layerskip-llama3.2-1B"
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> inputs = tokenizer(prompt, return_tensors="pt")
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
### DoLa Decoding
**D**ecoding by C**o**ntrasting **La**yers (DoLa) is a contrastive decoding strategy to improve the factuality and reduce the

View File

@@ -87,7 +87,6 @@ For now the supported model architectures are the architectures that have been v
- Starcoder2
- T5
- Mamba
- Nemotron
## Example usage

View File

@@ -240,7 +240,6 @@ Flax), PyTorch, and/or TensorFlow.
| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ |
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OLMo](model_doc/olmo) | ✅ | ❌ | ❌ |
| [OLMo November 2024](model_doc/olmo_1124) | ✅ | ❌ | ❌ |
| [OLMoE](model_doc/olmoe) | ✅ | ❌ | ❌ |
| [OmDet-Turbo](model_doc/omdet-turbo) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |

View File

@@ -40,10 +40,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/5
- BLIP-2 can be used for conditional text generation given an image and an optional text prompt. At inference time, it's recommended to use the [`generate`] method.
- One can use [`Blip2Processor`] to prepare images for the model, and decode the predicted token IDs back to text.
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
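A rough sketch of the steps described in the note (illustrative only; the linked gist has the exact procedure, and the checkpoint here is just an example):
```python
from transformers import AutoProcessor, Blip2ForConditionalGeneration

checkpoint = "Salesforce/blip2-opt-2.7b"  # example checkpoint
processor = AutoProcessor.from_pretrained(checkpoint)
model = Blip2ForConditionalGeneration.from_pretrained(checkpoint)

# Attach the attribute the warning asks for, read from the model config
processor.num_query_tokens = model.config.num_query_tokens

# Add the special `<image>` token and resize the embeddings to match
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
model.resize_token_embeddings(len(processor.tokenizer))
```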
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2.

View File

@@ -54,12 +54,6 @@ If you're interested in submitting a resource to be included here, please feel f
- preprocess
- post_process_object_detection
## DeformableDetrImageProcessorFast
[[autodoc]] DeformableDetrImageProcessorFast
- preprocess
- post_process_object_detection
## DeformableDetrFeatureExtractor
[[autodoc]] DeformableDetrFeatureExtractor

View File

@@ -33,10 +33,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
InstructBLIP uses the same architecture as [BLIP-2](blip2) with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former.
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
## InstructBlipConfig
[[autodoc]] InstructBlipConfig

View File

@@ -35,10 +35,6 @@ The original code can be found [here](https://github.com/salesforce/LAVIS/tree/m
- The model was trained by sampling 4 frames per video, so it's recommended to sample 4 frames
> [!NOTE]
> BLIP models after release v4.46 will raise warnings about adding `processor.num_query_tokens = {{num_query_tokens}}` and expanding the model embeddings layer to add the special `<image>` token. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you. Adding these attributes means that BLIP will add the number of query tokens required per image and expand the text with as many `<image>` placeholders as there will be query tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config as `model.config.num_query_tokens`, and the model embeddings expansion can be done by following [this link](https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042).
## InstructBlipVideoConfig
[[autodoc]] InstructBlipVideoConfig

View File

@@ -40,13 +40,6 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
- Note that the model has not been explicitly trained to process multiple images in the same prompt; although this is technically possible, you may experience inaccurate results.
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
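A minimal sketch of adding those attributes (illustrative only; `num_additional_image_tokens = 1` assumes the vision backbone adds a CLS token):
```python
from transformers import AutoProcessor, LlavaForConditionalGeneration

checkpoint = "llava-hf/llava-1.5-7b-hf"
model = LlavaForConditionalGeneration.from_pretrained(checkpoint)
processor = AutoProcessor.from_pretrained(checkpoint)

# Read the values from the model config, as described above
processor.patch_size = model.config.vision_config.patch_size
processor.vision_feature_select_strategy = model.config.vision_feature_select_strategy
processor.num_additional_image_tokens = 1  # assumption: the vision backbone adds a CLS token
```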
### Single image inference
For best results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
@@ -92,10 +85,10 @@ LLaVa also supports batched inference. Here is how you can do it:
import requests
from PIL import Image
import torch
from transformers import AutoProcessor, LlavaForConditionalGeneration
from transformers import AutoProcessor, LLavaForConditionalGeneration
# Load the model in half-precision
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
model = LLavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
# Get two different images

View File

@@ -53,12 +53,6 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
</Tip>
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history; passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.
We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:

View File

@@ -50,12 +50,6 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre
</Tip>
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows:

View File

@@ -1,46 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# OLMo November 2024
## Overview
The OLMo November 2024 model is a successor of the OLMo model, which was proposed in
[OLMo: Accelerating the Science of Language Models](https://arxiv.org/abs/2402.00838).
The architectural changes from the original OLMo model to this model (see the sketch after this list) are:
- RMSNorm is used instead of standard layer norm.
- Norm is applied to attention queries and keys.
- Norm is applied after attention/feedforward layers rather than before.
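A minimal PyTorch sketch of a decoder block with these three changes (an illustration of the ideas only, not the official OLMo code; assumes a recent PyTorch with `nn.RMSNorm`):
```python
import torch
from torch import nn

class PostNormBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.q_norm = nn.RMSNorm(dim)     # RMSNorm applied to attention queries
        self.k_norm = nn.RMSNorm(dim)     # RMSNorm applied to attention keys
        self.attn_norm = nn.RMSNorm(dim)  # norm applied *after* attention
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))
        self.ff_norm = nn.RMSNorm(dim)    # norm applied *after* the feedforward

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        q, k = self.q_norm(x), self.k_norm(x)
        attn_out, _ = self.attn(q, k, x, need_weights=False)
        x = x + self.attn_norm(attn_out)  # post-norm residual connection
        return x + self.ff_norm(self.ff(x))
```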
This model was contributed by [shanearora](https://huggingface.co/shanearora).
The original code can be found [here](https://github.com/allenai/OLMo/tree/main/olmo).
## Olmo1124Config
[[autodoc]] Olmo1124Config
## Olmo1124Model
[[autodoc]] Olmo1124Model
- forward
## Olmo1124ForCausalLM
[[autodoc]] Olmo1124ForCausalLM
- forward

View File

@@ -57,7 +57,7 @@ Initially, an image is processed using a pre-trained convolutional neural networ
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
>>> for result in results:
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):

View File

@@ -54,12 +54,6 @@ This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanT
The original code can be found [here](https://github.com/PKU-YuanGroup/Video-LLaVA).
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
## Usage example
### Single Media Mode

View File

@@ -39,12 +39,6 @@ This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
- Note that the model has not been explicitly trained to process multiple images in the same prompt; although this is technically possible, you may experience inaccurate results.
> [!NOTE]
> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or to open a PR if it is not owned by you.
Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated, as otherwise there will be a failure when merging the embeddings.
The attributes can be obtained from the model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token, or `0` if nothing extra is added to the vision patches.
- For better results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
```python

View File

@@ -1,68 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Multi-GPU inference
Built-in Tensor Parallelism (TP) is now available with certain models using PyTorch. Tensor parallelism shards a model onto multiple GPUs, enabling larger model sizes, and parallelizes computations such as matrix multiplication.
To enable tensor parallelism, pass the argument `tp_plan="auto"` to [`~AutoModelForCausalLM.from_pretrained`]:
```python
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# Initialize distributed
rank = int(os.environ["RANK"])
device = torch.device(f"cuda:{rank}")
torch.distributed.init_process_group("nccl", device_id=device)
# Retrieve tensor parallel model
model = AutoModelForCausalLM.from_pretrained(
model_id,
tp_plan="auto",
)
# Prepare input tokens
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompt = "Can I help"
inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Distributed run
outputs = model(inputs)
```
You can use `torchrun` to launch the above script with multiple processes, each mapping to a GPU:
```
torchrun --nproc-per-node 4 demo.py
```
PyTorch tensor parallel is currently supported for the following models:
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
You can request to add tensor parallel support for another model by opening a GitHub Issue or Pull Request.
### Expected speedups
You can benefit from considerable speedups for inference, especially for inputs with large batch size or long sequences.
For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
<div style="text-align: center">
<img src="huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct, seqlen = 512, python, w_ compile.png">
</div>

View File

@@ -77,7 +77,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
@@ -261,7 +260,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMo November 2024](https://huggingface.co/docs/transformers/model_doc/olmo_1124#transformers.Olmo1124Model)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/en/model_doc/opt)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)

View File

@@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se
* [Inference on a single CPU](perf_infer_cpu)
* [Inference on a single GPU](perf_infer_gpu_one)
* [Multi-GPU inference](perf_infer_gpu_multi)
* [Multi-GPU inference](perf_infer_gpu_one)
* [XLA Integration for TensorFlow Models](tf_xla)

View File

@@ -45,19 +45,19 @@ In short, supporting a wide range of quantization methods allows you to pick the
Use the table below to help you decide which quantization method to use.
| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | Intel GPU | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-----------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🟡 * | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| Quantization method | On the fly quantization | CPU | CUDA GPU | RoCm GPU (AMD) | Metal (Apple Silicon) | torch.compile() support | Number of bits | Supports fine-tuning (through PEFT) | Serializable with 🤗 transformers | 🤗 transformers support | Link to library |
|-------------------------------------|-------------------------|-----|----------|----------------|-----------------------|-------------------------|----------------|-------------------------------------|--------------|------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 / 2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 * | 🟢 | 🟡 * | 🔴 ** | 🔴 (soon!) | 4 / 8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [Quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/quanto |
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
<Tip>

View File

@@ -14,21 +14,21 @@ rendered properly in your Markdown viewer.
-->
# Optimum-quanto
# Quanto
<Tip>
Try optimum-quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
Try Quanto + transformers with this [notebook](https://colab.research.google.com/drive/16CXfVmtdQvciSh9BopZUDYcmXCDpvgrT?usp=sharing)!
</Tip>
The [🤗 optimum-quanto](https://github.com/huggingface/optimum-quanto) library is a versatile PyTorch quantization toolkit. The quantization method used is linear quantization. Quanto provides several unique features such as:
The [🤗 Quanto](https://github.com/huggingface/quanto) library is a versatile PyTorch quantization toolkit. The quantization method used is linear quantization. Quanto provides several unique features such as:
- weights quantization (`float8`,`int8`,`int4`,`int2`)
- activation quantization (`float8`,`int8`)
- modality agnostic (e.g. CV, LLM)
- device agnostic (e.g. CUDA, XPU, MPS, CPU)
- device agnostic (e.g. CUDA, MPS, CPU)
- compatibility with `torch.compile`
- easy to add custom kernel for specific device
- supports quantization aware training
@@ -37,12 +37,12 @@ Try optimum-quanto + transformers with this [notebook](https://colab.research.go
Before you begin, make sure the following libraries are installed:
```bash
pip install optimum-quanto accelerate transformers
pip install quanto accelerate transformers
```
Now you can quantize a model by passing a [`QuantoConfig`] object to the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it contains `torch.nn.Linear` layers.
The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [optimum-quanto](https://github.com/huggingface/optimum-quanto) library instead.
The integration with transformers only supports weights quantization. For more complex use cases such as activation quantization, calibration, and quantization-aware training, you should use the [quanto](https://github.com/huggingface/quanto) library instead.
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig
@@ -55,7 +55,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cud
Note that serialization is not yet supported with transformers, but it is coming soon! If you want to save the model, you can use the quanto library instead.
The optimum-quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/optimum-quanto/tree/main/bench/generation)
The Quanto library uses a linear quantization algorithm. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on the perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
<div class="flex gap-4">
<div>

View File

@@ -26,7 +26,7 @@ after a natural disaster, monitoring crop health, or helping screen medical imag
This guide illustrates how to:
1. Fine-tune [ViT](../model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
1. Fine-tune [ViT](model_doc/vit) on the [Food-101](https://huggingface.co/datasets/food101) dataset to classify a food item in an image.
2. Use your fine-tuned model for inference.
<Tip>
View File
@ -120,46 +120,6 @@ print(generated_texts)
## ['User: What do we see in this image? \nAssistant: In this image we can see two cats on the nets. \nUser: And how about this image? \nAssistant: In this image we can see flowers, plants and insect.']
```
## Pipeline
The fastest way to get started is to use the [`Pipeline`] API. Specify the `"image-text-to-text"` task and the model you want to use.
```python
from transformers import pipeline
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
```
The example below uses chat templates to format the text inputs.
```python
messages = [
{
"role": "user",
"content": [
{
"type": "image",
"image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
},
{"type": "text", "text": "Describe this image."},
],
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "There's a pink flower"},
],
},
]
```
Pass the chat template formatted text and image to [`Pipeline`] and set `return_full_text=False` to remove the input from the generated output.
```python
outputs = pipe(text=messages, max_new_tokens=20, return_full_text=False)
outputs[0]["generated_text"]
# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
```
## Streaming
We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.
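A minimal sketch of the streaming loop, assuming `model`, `processor`, and `inputs` were prepared in the earlier steps of this guide:
```python
from threading import Thread
from transformers import TextIteratorStreamer

# assumed to exist from earlier steps: `model`, `processor`, `inputs`
streamer = TextIteratorStreamer(processor.tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(**inputs, streamer=streamer, max_new_tokens=100)

# generate in a background thread so the main thread can consume tokens as they arrive
thread = Thread(target=model.generate, kwargs=generation_kwargs)
thread.start()
for new_text in streamer:
    print(new_text, end="")
thread.join()
```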
View File
@ -182,7 +182,7 @@ There are three main components to Mask2Former:
The mask predictions are generated by combining the pixel embeddings with the final decoder hidden states. The sigmoid cross-entropy and dice losses are calculated between the logits and the ground truth mask to find the most likely mask.
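Schematically, that combination amounts to a dot product between each query's decoder embedding and every pixel embedding (a hedged sketch with made-up shapes, not the library's actual implementation):
```python
import torch

batch, queries, channels, height, width = 1, 100, 256, 96, 96
decoder_states = torch.randn(batch, queries, channels)      # one embedding per mask query
pixel_embeddings = torch.randn(batch, channels, height, width)

# each query scores every pixel; a sigmoid turns the scores into per-query soft masks
mask_logits = torch.einsum("bqc,bchw->bqhw", decoder_states, pixel_embeddings)
soft_masks = mask_logits.sigmoid()
```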
Ready to try your hand at image segmentation? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
Ready to try your hand at object detection? Check out our complete [image segmentation guide](tasks/semantic_segmentation) to learn how to finetune SegFormer and use it for inference!
### Depth estimation
@ -292,4 +292,4 @@ Ready to try your hand at translation? Check out our complete [translation guide
For more information about text generation, check out the [text generation strategies](generation_strategies) guide!
</Tip>
</Tip>
View File
@ -428,7 +428,7 @@ pytest --instafail
### To GPU or not to GPU
On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""` for CUDA GPUs:
On a GPU-enabled setup, to test in CPU-only mode add `CUDA_VISIBLE_DEVICES=""`:
```bash
CUDA_VISIBLE_DEVICES="" pytest tests/utils/test_logging.py
@ -441,12 +441,10 @@ second gpu if you have gpus `0` and `1`, you can run:
CUDA_VISIBLE_DEVICES="1" pytest tests/utils/test_logging.py
```
For Intel GPUs, use `ZE_AFFINITY_MASK` instead of `CUDA_VISIBLE_DEVICES` in the above example.
This is handy when you want to run different tasks on different GPUs.
Some tests must be run on CPU only, others on either CPU, GPU, or TPU, and yet others on multiple GPUs. The following skip
decorators are used to set the requirements of tests CPU/GPU/XPU/TPU-wise:
decorators are used to set the requirements of tests CPU/GPU/TPU-wise:
- `require_torch` - this test will run only under torch
- `require_torch_gpu` - as `require_torch` plus requires at least 1 GPU
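For instance, a test can be gated on hardware with one of these decorators (a short sketch; the test body is illustrative):
```python
import torch
from transformers.testing_utils import require_torch_gpu

@require_torch_gpu
def test_runs_on_gpu():
    # skipped automatically when no GPU is visible, e.g. with CUDA_VISIBLE_DEVICES=""
    assert torch.cuda.is_available()
```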
View File
@ -174,7 +174,7 @@ trainer = Trainer(
processing_class=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics,
callbacks=[EarlyStoppingCallback()],
callback=[EarlyStoppingCallback()],
)
```
View File
@ -287,10 +287,9 @@ model.fit(tf_dataset)
At this point, you may need to restart your notebook or execute the following code to free some memory:
```py
from accelerate.utils.memory import clear_device_cache
del model
del trainer
clear_device_cache()
torch.cuda.empty_cache()
```
Next, manually postprocess `tokenized_dataset` to prepare it for training.
@ -365,9 +364,8 @@ Lastly, specify `device` to use a GPU if you have access to one. Otherwise, trai
```py
>>> import torch
>>> from accelerate.test_utils.testing import get_backend
>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
>>> model.to(device)
```
View File
@ -86,8 +86,6 @@
title: What 🤗 Transformers can do
- local: tokenizer_summary
title: Summary of the tokenizers
- local: attention
title: Attention mechanisms
title: Conceptual guides
- sections:
- sections:
View File
@ -1,37 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not render properly in your Markdown viewer.
-->
# Attention mechanisms
Most transformer models use full attention, in the sense that the attention matrix is square. This can be a big computational bottleneck when you have long input texts. Longformer and Reformer are models that try to be more efficient and use a sparse version of the attention matrix to speed up training.
## LSH attention
[Reformer](model_doc/reformer) uses LSH (locality-sensitive hashing) attention. In softmax(QK^t), only the largest elements of the matrix QK^t make a useful contribution along the softmax dimension. So for each query q in Q, we only need to consider the keys k in K that are close to q; a hash function is used to determine whether q and k are close. The attention mask is modified to mask the current token (except at the first position), because it would give a query and a key that are equal, hence very similar to each other. Since the hash can be a bit random, several hash functions are used in practice (determined by an n_rounds parameter), and their results are averaged together.
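As an illustration, here is a minimal sketch of the bucketing idea with random rotations (a simplification, not Reformer's actual implementation):
```python
import torch

def lsh_buckets(x, n_buckets=16, n_rounds=4):
    # project onto random hyperplanes; vectors that fall on the same side of
    # every hyperplane share a bucket and are allowed to attend to each other
    projections = torch.randn(n_rounds, x.shape[-1], n_buckets // 2)
    rotated = torch.einsum("ld,rdb->rlb", x, projections)
    return torch.argmax(torch.cat([rotated, -rotated], dim=-1), dim=-1)  # (n_rounds, seq_len)

queries = torch.randn(128, 64)  # 128 tokens with hidden size 64
buckets = lsh_buckets(queries)  # one bucket id per token and hash round
```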
## Local attention
[Longformer](model_doc/longformer) uses local attention: often, the local context (e.g., what are the two tokens to the left and right?) is enough for a given token. Also, by stacking attention layers that have a small window, the last layer will have a receptive field that extends beyond the tokens in its window, which lets the model build a representation of the whole sentence.
Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access all tokens, and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in their local window). This is shown in Figure 2d of the paper; see below for a sample attention mask:
<div class="flex justify-center">
<img scale="50 %" align="center" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/local_attention_mask.png"/>
</div>
Using attention matrices with fewer parameters allows the model to handle longer input sequences.
## Other tricks
### Axial positional encodings
[Reformer](model_doc/reformer) uses axial positional encodings: in traditional transformer models, the positional encoding matrix E has size \\(l\\) by \\(d\\), where \\(l\\) is the sequence length and \\(d\\) is the dimension of the hidden state. For very long texts, this matrix can be huge and take up far too much GPU memory. To alleviate that, axial positional encodings factorize the big matrix E into two smaller matrices E1 and E2, with dimensions \\(l_{1} \times d_{1}\\) and \\(l_{2} \times d_{2}\\), such that \\(l_{1} \times l_{2} = l\\) and \\(d_{1} + d_{2} = d\\); the resulting matrices are much smaller. The embedding for timestep \\(j\\) in E is obtained by concatenating the embedding for timestep \\(j \% l1\\) in E1 and the embedding for timestep \\(j // l1\\) in E2.
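A minimal numeric sketch of the factorization (the sizes are assumptions for illustration):
```python
import torch

l1, d1 = 64, 32    # l = l1 * l2 = 1024, d = d1 + d2 = 128
l2, d2 = 16, 96
E1 = torch.randn(l1, d1)
E2 = torch.randn(l2, d2)

def axial_position_embedding(j):
    # the embedding for timestep j concatenates E1[j % l1] and E2[j // l1]
    return torch.cat([E1[j % l1], E2[j // l1]])

emb = axial_position_embedding(500)  # shape (128,), without storing a full (1024, 128) table
```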
View File
@ -130,16 +130,6 @@ class MyNewModelConfig(PretrainedConfig):
model_type = "my_new_model"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `MyNewModelModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,
View File
@ -33,16 +33,6 @@ class MyNewModel2Config(PretrainedConfig):
model_type = "my_new_model2"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `MyNewModel2Model`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,
View File
@ -8,6 +8,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from ...activations import ACT2FN
@ -149,7 +150,25 @@ class DummyMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
@ -245,14 +264,31 @@ class DummyAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -294,7 +330,12 @@ class DummyAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@ -467,10 +508,9 @@ class DummySdpaAttention(DummyAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -754,10 +794,7 @@ class DummyModel(DummyPreTrainedModel):
)
self.norm = DummyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = DummyRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -837,7 +874,7 @@ class DummyModel(DummyPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
View File
@ -667,10 +667,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
[MyNewModel2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = MyNewModel2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -755,7 +752,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
View File
@ -8,6 +8,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
from torch import nn
from ...activations import ACT2FN
@ -149,7 +150,25 @@ class SuperMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
@ -245,14 +264,31 @@ class SuperAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -294,7 +330,12 @@ class SuperAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
@ -467,10 +508,9 @@ class SuperSdpaAttention(SuperAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -754,10 +794,7 @@ class SuperModel(SuperPreTrainedModel):
)
self.norm = SuperRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = SuperRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
View File
@ -331,7 +331,7 @@ def main():
config = AutoConfig.from_pretrained(
args.model_name_or_path,
num_labels=len(labels),
id2label=id2label,
i2label=id2label,
label2id=label2id,
finetuning_task="image-classification",
trust_remote_code=args.trust_remote_code,
View File
@ -148,7 +148,7 @@ with torch.no_grad():
outputs = model(**inputs)
# Post-process outputs
outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[(image.height, image.width)])
outputs = image_processor.post_process_instance_segmentation(outputs, target_sizes=[image.size[::-1]])
print("Mask shape: ", outputs[0]["segmentation"].shape)
print("Mask values: ", outputs[0]["segmentation"].unique())
View File
@ -1,5 +1,5 @@
absl-py==1.0.0
aiohttp==3.10.11
aiohttp==3.10.2
aiosignal==1.2.0
alembic==1.7.7
appdirs==1.4.4
View File
@ -117,7 +117,7 @@ _deps = [
"fugashi>=1.0",
"GitPython<3.1.19",
"hf-doc-builder>=0.3.0",
"huggingface-hub>=0.24.0,<1.0",
"huggingface-hub>=0.23.2,<1.0",
"importlib_metadata",
"ipadic>=1.0.0,<2.0",
"isort>=5.5.4",
View File
@ -620,7 +620,6 @@ _import_structure = {
"models.nougat": ["NougatProcessor"],
"models.nystromformer": ["NystromformerConfig"],
"models.olmo": ["OlmoConfig"],
"models.olmo_1124": ["Olmo1124Config"],
"models.olmoe": ["OlmoeConfig"],
"models.omdet_turbo": [
"OmDetTurboConfig",
@ -1186,7 +1185,7 @@ else:
)
_import_structure["models.convnext"].extend(["ConvNextFeatureExtractor", "ConvNextImageProcessor"])
_import_structure["models.deformable_detr"].extend(
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast"]
["DeformableDetrFeatureExtractor", "DeformableDetrImageProcessor"]
)
_import_structure["models.deit"].extend(["DeiTFeatureExtractor", "DeiTImageProcessor"])
_import_structure["models.deprecated.deta"].append("DetaImageProcessor")
@ -2920,13 +2919,6 @@ else:
"OlmoPreTrainedModel",
]
)
_import_structure["models.olmo_1124"].extend(
[
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
]
)
_import_structure["models.olmoe"].extend(
[
"OlmoeForCausalLM",
@ -5514,7 +5506,6 @@ if TYPE_CHECKING:
NystromformerConfig,
)
from .models.olmo import OlmoConfig
from .models.olmo_1124 import Olmo1124Config
from .models.olmoe import OlmoeConfig
from .models.omdet_turbo import (
OmDetTurboConfig,
@ -6100,7 +6091,6 @@ if TYPE_CHECKING:
from .models.deformable_detr import (
DeformableDetrFeatureExtractor,
DeformableDetrImageProcessor,
DeformableDetrImageProcessorFast,
)
from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
from .models.deprecated.deta import DetaImageProcessor
@ -7533,11 +7523,6 @@ if TYPE_CHECKING:
OlmoModel,
OlmoPreTrainedModel,
)
from .models.olmo_1124 import (
Olmo1124ForCausalLM,
Olmo1124Model,
Olmo1124PreTrainedModel,
)
from .models.olmoe import (
OlmoeForCausalLM,
OlmoeModel,
View File
@ -23,7 +23,6 @@ import json
import os
import tempfile
from functools import lru_cache, wraps
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union
from huggingface_hub import create_repo, get_collection, hf_hub_download, metadata_update, upload_folder
@ -46,7 +45,7 @@ from ..utils import (
is_vision_available,
logging,
)
from .agent_types import ImageType, handle_agent_inputs, handle_agent_outputs
from .agent_types import handle_agent_inputs, handle_agent_outputs
logger = logging.get_logger(__name__)
@ -419,9 +418,7 @@ class Tool:
)
@staticmethod
def from_space(
space_id: str, name: str, description: str, api_name: Optional[str] = None, token: Optional[str] = None
):
def from_space(space_id, name, description):
"""
Creates a [`Tool`] from a Space given its id on the Hub.
@ -432,73 +429,34 @@ class Tool:
The name of the tool.
description (`str`):
The description of the tool.
api_name (`str`, *optional*):
The specific api_name to use, if the space has several tabs. If not specified, it will default to the first available API.
token (`str`, *optional*):
Add your token to access private spaces or increase your GPU quotas.
Returns:
[`Tool`]:
The Space, as a tool.
The created tool.
Examples:
Example:
```
image_generator = Tool.from_space(
space_id="black-forest-labs/FLUX.1-schnell",
name="image-generator",
description="Generate an image from a prompt"
)
image = image_generator("Generate an image of a cool surfer in Tahiti")
```
```
face_swapper = Tool.from_space(
"tuan2308/face-swap",
"face_swapper",
"Tool that puts the face shown on the first image on the second image. You can give it paths to images.",
)
image = face_swapper('./aymeric.jpeg', './ruth.jpg')
tool = Tool.from_space("black-forest-labs/FLUX.1-schnell", "image-generator", "Generate an image from a prompt")
```
"""
from gradio_client import Client, handle_file
from gradio_client.utils import is_http_url_like
from gradio_client import Client
class SpaceToolWrapper(Tool):
def __init__(
self,
space_id: str,
name: str,
description: str,
api_name: Optional[str] = None,
token: Optional[str] = None,
):
self.client = Client(space_id, hf_token=token)
def __init__(self, space_id, name, description):
self.client = Client(space_id)
self.name = name
self.description = description
space_description = self.client.view_api(return_format="dict", print_info=False)["named_endpoints"]
# If api_name is not defined, take the first of the available APIs for this space
if api_name is None:
api_name = list(space_description.keys())[0]
logger.warning(
f"Since `api_name` was not defined, it was automatically set to the first avilable API: `{api_name}`."
)
self.api_name = api_name
try:
space_description_api = space_description[api_name]
except KeyError:
raise KeyError(f"Could not find specified {api_name=} among available api names.")
space_description = self.client.view_api(return_format="dict")["named_endpoints"]
route = list(space_description.keys())[0]
space_description_route = space_description[route]
self.inputs = {}
for parameter in space_description_api["parameters"]:
for parameter in space_description_route["parameters"]:
if not parameter["parameter_has_default"]:
parameter_type = parameter["type"]["type"]
if parameter_type == "object":
parameter_type = "any"
self.inputs[parameter["parameter_name"]] = {
"type": parameter_type,
"type": parameter["type"]["type"],
"description": parameter["python_type"]["description"],
}
output_component = space_description_api["returns"][0]["component"]
output_component = space_description_route["returns"][0]["component"]
if output_component == "Image":
self.output_type = "image"
elif output_component == "Audio":
@ -506,33 +464,10 @@ class Tool:
else:
self.output_type = "any"
def sanitize_argument_for_prediction(self, arg):
if isinstance(arg, ImageType):
temp_file = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
arg.save(temp_file.name)
arg = temp_file.name
if (isinstance(arg, (str, Path)) and Path(arg).exists() and Path(arg).is_file()) or is_http_url_like(
arg
):
arg = handle_file(arg)
return arg
def forward(self, *args, **kwargs):
# Preprocess args and kwargs:
args = list(args)
for i, arg in enumerate(args):
args[i] = self.sanitize_argument_for_prediction(arg)
for arg_name, arg in kwargs.items():
kwargs[arg_name] = self.sanitize_argument_for_prediction(arg)
return self.client.predict(*args, **kwargs)[0] # Usually the first output is the result
output = self.client.predict(*args, api_name=self.api_name, **kwargs)
if isinstance(output, tuple) or isinstance(output, list):
return output[
0
] # Sometimes the space also returns the generation seed, in which case the result is at index 0
return output
return SpaceToolWrapper(space_id, name, description, api_name=api_name, token=token)
return SpaceToolWrapper(space_id, name, description)
@staticmethod
def from_gradio(gradio_tool):
View File
@ -433,22 +433,19 @@ class DynamicCache(Cache):
self._seen_tokens += key_states.shape[-2]
# Update the cache
if key_states is not None:
if len(self.key_cache) <= layer_idx:
# There may be skipped layers, fill them with empty lists
for _ in range(len(self.key_cache), layer_idx):
self.key_cache.append([])
self.value_cache.append([])
self.key_cache.append(key_states)
self.value_cache.append(value_states)
elif (
len(self.key_cache[layer_idx]) == 0
): # fills previously skipped layers; checking for tensor causes errors
self.key_cache[layer_idx] = key_states
self.value_cache[layer_idx] = value_states
else:
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
if len(self.key_cache) <= layer_idx:
# There may be skipped layers, fill them with empty lists
for _ in range(len(self.key_cache), layer_idx):
self.key_cache.append([])
self.value_cache.append([])
self.key_cache.append(key_states)
self.value_cache.append(value_states)
elif len(self.key_cache[layer_idx]) == 0: # fills previously skipped layers; checking for tensor causes errors
self.key_cache[layer_idx] = key_states
self.value_cache[layer_idx] = value_states
else:
self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
return self.key_cache[layer_idx], self.value_cache[layer_idx]
@ -528,7 +525,7 @@ class DynamicCache(Cache):
cache = cls()
for idx in range(len(splits[0])):
key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
value_cache = [current.value_cache[idx] for current in splits if current.value_cache[idx] != []]
value_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx] != []]
if key_cache != []:
layer_keys = torch.cat(key_cache, dim=0)
layer_values = torch.cat(value_cache, dim=0)
@ -784,11 +781,6 @@ class QuantoQuantizedCache(QuantizedCache):
super().__init__(cache_config)
if is_optimum_quanto_available():
optimum_quanto_version = version.parse(importlib.metadata.version("optimum-quanto"))
if optimum_quanto_version <= version.parse("0.2.5"):
raise ImportError(
f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}."
)
from optimum.quanto import MaxOptimizer, qint2, qint4
elif is_quanto_available():
logger.warning_once(
@ -821,8 +813,7 @@ class QuantoQuantizedCache(QuantizedCache):
if is_optimum_quanto_available():
from optimum.quanto import quantize_weight
scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
qtensor = quantize_weight(tensor, self.qtype, axis, self.q_group_size)
return qtensor
elif is_quanto_available():
logger.warning_once(
@ -1523,10 +1514,7 @@ class EncoderDecoderCache(Cache):
self.check_dynamic_cache(self.crop.__name__)
self.self_attention_cache.crop(maximum_length)
@deprecate_kwarg("num_hidden_layers", version="4.47.0")
def batch_split(
self, full_batch_size: int, split_size: int, num_hidden_layers: int = None
) -> "List[EncoderDecoderCache]":
def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]":
"""Split the current instance into a list of `DynamicCache` by the batch size. This will be used by
`_split_model_inputs()` in `generation.utils`"""
self.check_dynamic_cache(self.batch_split.__name__)
@ -1539,10 +1527,7 @@ class EncoderDecoderCache(Cache):
return out
@classmethod
@deprecate_kwarg("num_hidden_layers", version="4.47.0")
def from_batch_splits(
cls, splits: List["EncoderDecoderCache"], num_hidden_layers: int = None
) -> "EncoderDecoderCache":
def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache":
"""This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in
`generation.utils`"""
self_attention_cache = DynamicCache()
View File
@ -71,8 +71,6 @@ class PretrainedConfig(PushToHubMixin):
outputs of the model during inference.
- **attribute_map** (`Dict[str, str]`) -- A dict that maps model specific attribute names to the standardized
naming of attributes.
- **base_model_tp_plan** (`Dict[str, Any]`) -- A dict that maps sub-module FQNs of a base model to a tensor
parallel plan applied to the sub-module when `model.tensor_parallel` is called.
Common attributes (present in all subclasses):
@ -196,7 +194,6 @@ class PretrainedConfig(PushToHubMixin):
sub_configs: Dict[str, "PretrainedConfig"] = {}
is_composition: bool = False
attribute_map: Dict[str, str] = {}
base_model_tp_plan: Optional[Dict[str, Any]] = None
_auto_class: Optional[str] = None
def __setattr__(self, key, value):
@ -851,9 +848,6 @@ class PretrainedConfig(PushToHubMixin):
if "_attn_implementation_internal" in serializable_config_dict:
del serializable_config_dict["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_tp_plan"]
return serializable_config_dict
@ -873,9 +867,6 @@ class PretrainedConfig(PushToHubMixin):
del output["_commit_hash"]
if "_attn_implementation_internal" in output:
del output["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in output:
del output["base_model_tp_plan"]
# Transformers version when serializing the model
output["transformers_version"] = __version__
View File
@ -24,7 +24,7 @@ deps = {
"fugashi": "fugashi>=1.0",
"GitPython": "GitPython<3.1.19",
"hf-doc-builder": "hf-doc-builder>=0.3.0",
"huggingface-hub": "huggingface-hub>=0.24.0,<1.0",
"huggingface-hub": "huggingface-hub>=0.23.2,<1.0",
"importlib_metadata": "importlib_metadata",
"ipadic": "ipadic>=1.0.0,<2.0",
"isort": "isort>=5.5.4",
View File
@ -49,7 +49,6 @@ else:
_import_structure["candidate_generator"] = [
"AssistedCandidateGenerator",
"CandidateGenerator",
"EarlyExitCandidateGenerator",
"PromptLookupCandidateGenerator",
]
_import_structure["logits_process"] = [
@ -207,12 +206,7 @@ if TYPE_CHECKING:
else:
from .beam_constraints import Constraint, ConstraintListState, DisjunctiveConstraint, PhrasalConstraint
from .beam_search import BeamHypotheses, BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
from .candidate_generator import (
AssistedCandidateGenerator,
CandidateGenerator,
EarlyExitCandidateGenerator,
PromptLookupCandidateGenerator,
)
from .candidate_generator import AssistedCandidateGenerator, CandidateGenerator, PromptLookupCandidateGenerator
from .logits_process import (
AlternatingCodebooksLogitsProcessor,
ClassifierFreeGuidanceLogitsProcessor,
View File
@ -255,8 +255,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
"heuristic",
"heuristic_transient",
}:
# len(scores[0])-1 is the number of candidates according to the target tokenizer.
if num_matches == len(scores[0]) - 1:
if num_matches == int(self.num_assistant_tokens):
self.num_assistant_tokens += 2.0
else:
self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
@ -671,62 +670,6 @@ class PromptLookupCandidateGenerator(CandidateGenerator):
return
class EarlyExitCandidateGenerator(AssistedCandidateGenerator):
"""
`CandidateGenerator` class to be used for assisted generation and speculative decoding. This class generates
candidates through the use of **the model itself**, exiting early. Can only be used with models that support early
exit, e.g., `facebook/layerskip-llama3.2-1B`.
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
assistant_model (`PreTrainedModel`):
The original model. This model must support early exit (i.e. is trained to compute logits in earlier
layers).
generation_config (`~generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call.
logits_processor (`LogitsProcessorList`):
An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
used to modify the prediction scores of the language modeling head applied at each generation step.
model_kwargs (`Dict`):
The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
model as well.
inputs_tensor (`torch.Tensor`, *optional*):
The model input tensor. In encoder-decoder models, this is the encoder input.
"""
def __init__(
self,
input_ids: torch.LongTensor,
assistant_model: "PreTrainedModel",
generation_config: "GenerationConfig",
model_kwargs: Dict,
inputs_tensor: Optional[torch.Tensor] = None,
logits_processor: "LogitsProcessorList" = None,
):
super().__init__(
input_ids=input_ids,
assistant_model=assistant_model,
generation_config=generation_config,
model_kwargs=model_kwargs,
inputs_tensor=inputs_tensor,
logits_processor=logits_processor,
)
# We have to move early exit out of the generation config, otherwise the assistant will also call `generate`
# with early exit
self.assistant_early_exit = self.generation_config.assistant_early_exit
self.generation_config.assistant_early_exit = None
def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
# Temporarily sets the number of hidden layers to the early exit value
base_model = getattr(self.assistant_model, self.assistant_model.base_model_prefix)
original_num_hidden_layers = base_model.config.num_hidden_layers
base_model.config.num_hidden_layers = self.assistant_early_exit
candidate_ids, candidate_logits = super().get_candidates(input_ids)
base_model.config.num_hidden_layers = original_num_hidden_layers
return candidate_ids, candidate_logits
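# A hedged usage sketch (not part of this module): early exit is driven from
# `generate` via `assistant_early_exit`; the checkpoint name comes from the
# docstring above, and the generation arguments are assumptions.
#
#     from transformers import AutoModelForCausalLM, AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/layerskip-llama3.2-1B")
#     model = AutoModelForCausalLM.from_pretrained("facebook/layerskip-llama3.2-1B")
#     inputs = tokenizer("Alice and Bob", return_tensors="pt")
#     # draft candidates by exiting at layer 4, then verify with the full model
#     outputs = model.generate(**inputs, assistant_early_exit=4, max_new_tokens=20)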
def _crop_past_key_values(model, past_key_values, max_length):
"""Crops the past key values up to a certain maximum length."""
new_past = []
View File
@ -353,13 +353,10 @@ class GenerationConfig(PushToHubMixin):
than this threshold, the assistant model stops the current token generation iteration, even if the number of _speculative tokens_
(defined by `num_assistant_tokens`) is not yet reached. It is an unsupervised version of the dynamic speculation lookahead
from Dynamic Speculation Lookahead Accelerates Speculative Decoding of Large Language Models <https://arxiv.org/abs/2405.04304>.
prompt_lookup_num_tokens (`int`, *optional*):
prompt_lookup_num_tokens (`int`, *optional*, default to `None`):
The number of tokens to be output as candidate tokens.
max_matching_ngram_size (`int`, *optional*):
max_matching_ngram_size (`int`, *optional*, default to `None`):
The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided.
assistant_early_exit(`int`, *optional*):
If set to a positive integer, early exit of the model will be used as an assistant. Can only be used with
models that support early exit (i.e. models where logits from intermediate layers can be interpreted by the LM head).
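For illustration, these parameters are typically passed straight to `generate` (a hedged sketch; the checkpoint name and values are assumptions):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# prompt lookup decoding needs no separate assistant model
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The quick brown fox", return_tensors="pt")
outputs = model.generate(**inputs, prompt_lookup_num_tokens=10, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```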
> Wild card
@ -457,9 +454,10 @@ class GenerationConfig(PushToHubMixin):
self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 20)
self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "constant")
self.assistant_confidence_threshold = kwargs.pop("assistant_confidence_threshold", 0.4)
# Prompt lookup decoding
self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
self.max_matching_ngram_size = kwargs.pop("max_matching_ngram_size", None)
self.assistant_early_exit = kwargs.pop("assistant_early_exit", None)
# Wild card
self.generation_kwargs = kwargs.pop("generation_kwargs", {})
@ -536,11 +534,7 @@ class GenerationConfig(PushToHubMixin):
generation_mode = GenerationMode.BEAM_SEARCH
# Assisted generation may extend some generation modes
if (
assistant_model is not None
or self.prompt_lookup_num_tokens is not None
or self.assistant_early_exit is not None
):
if assistant_model is not None or self.prompt_lookup_num_tokens is not None:
if generation_mode in ("greedy_search", "sample"):
generation_mode = GenerationMode.ASSISTED_GENERATION
else:
View File
@ -398,11 +398,7 @@ class FlaxGenerationMixin:
)
generation_config.max_length = generation_config.max_new_tokens + input_ids_seq_length
else: # by default let's always generate 10 new tokens
if generation_config.max_length == GenerationConfig().max_length:
generation_config.max_length = generation_config.max_length + input_ids_seq_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
generation_config.max_length = generation_config.max_length + input_ids_seq_length
if generation_config.min_length is not None and generation_config.min_length > generation_config.max_length:
raise ValueError(
View File
@ -54,7 +54,6 @@ from .candidate_generator import (
AssistedCandidateGenerator,
AssistedCandidateGeneratorDifferentTokenizers,
CandidateGenerator,
EarlyExitCandidateGenerator,
PromptLookupCandidateGenerator,
_crop_past_key_values,
_prepare_attention_mask,
@ -823,16 +822,7 @@ class GenerationMixin:
"""
different_tokenizers = all(v is not None for v in (assistant_model, target_tokenizer, assistant_tokenizer))
if generation_config.assistant_early_exit is not None:
candidate_generator = EarlyExitCandidateGenerator(
input_ids=input_ids,
assistant_model=self,
generation_config=generation_config,
model_kwargs=model_kwargs,
inputs_tensor=inputs_tensor,
logits_processor=logits_processor,
)
elif generation_config.prompt_lookup_num_tokens is not None:
if generation_config.prompt_lookup_num_tokens is not None:
candidate_generator = PromptLookupCandidateGenerator(
eos_token_id=generation_config._eos_token_tensor,
num_output_tokens=generation_config.prompt_lookup_num_tokens,
@ -1462,11 +1452,10 @@ class GenerationMixin:
):
generation_config.max_length -= inputs_tensor.shape[1]
elif has_default_max_length: # by default let's always generate 20 new tokens
if generation_config.max_length == GenerationConfig().max_length:
generation_config.max_length = generation_config.max_length + input_ids_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
generation_config.max_length = generation_config.max_length + input_ids_length
max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
if max_position_embeddings is not None:
generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
# same for min length
if generation_config.min_new_tokens is not None:
@ -1646,10 +1635,7 @@ class GenerationMixin:
# This is needed here if we don't want to make changes in accelerate in order to save execution_device
# For offloaded case, we need to get the execution device, not just the device where it is offloaded
if hasattr(self, "hf_device_map"):
if set(self.hf_device_map.values()) == {"cpu"} or set(self.hf_device_map.values()) == {"cpu", "disk"}:
main_device = "cpu"
else:
main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
main_device = [d for d in self.hf_device_map.values() if d not in ["cpu", "disk"]][0]
execution_device_map = {
name: main_device if device in ["cpu", "disk"] else device
for name, device in self.hf_device_map.items()
@ -3246,7 +3232,7 @@ class GenerationMixin:
# Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration
# (the clone itself is always small)
next_token_logits = outputs.logits[:, -1, :].clone().float()
next_token_logits = outputs.logits.clone()[:, -1, :].float()
next_token_logits = next_token_logits.to(input_ids.device)
# pre-process distribution
View File
@ -248,20 +248,6 @@ GGUF_TENSOR_MAPPING = {
"output_norm": "backbone.norm_f",
"output.weight": "lm_head.weight",
},
"nemotron": {
"token_embd": "model.embed_tokens",
"blk": "model.layers",
"ffn_up": "mlp.up_proj",
"ffn_down": "mlp.down_proj",
"ffn_norm": "post_attention_layernorm",
"attn_norm": "input_layernorm",
"attn_q": "self_attn.q_proj",
"attn_v": "self_attn.v_proj",
"attn_k": "self_attn.k_proj",
"attn_output": "self_attn.o_proj",
"output.weight": "lm_head.weight",
"output_norm": "model.norm",
},
}
@ -411,18 +397,6 @@ GGUF_CONFIG_MAPPING = {
"ssm.time_step_rank": "time_step_rank",
"ssm.inner_size": "intermediate_size",
},
"nemotron": {
"context_length": "max_position_embeddings",
"block_count": "num_hidden_layers",
"feed_forward_length": "intermediate_size",
"embedding_length": "hidden_size",
"rope.dimension_count": None,
"rope.freq_base": "rope_theta",
"attention.head_count": "num_attention_heads",
"attention.head_count_kv": "num_key_value_heads",
"attention.layer_norm_rms_epsilon": "norm_eps",
"vocab_size": "vocab_size",
},
}
GGUF_TOKENIZER_MAPPING = {
@ -819,7 +793,6 @@ GGUF_TO_FAST_CONVERTERS = {
"starcoder2": GGUFGPTConverter,
"t5": GGUFT5Converter,
"mamba": GGUFGPTConverter,
"nemotron": GGUFGPTConverter,
}
View File
@ -208,7 +208,7 @@ def hp_params(trial):
if is_optuna_available():
import optuna
if isinstance(trial, optuna.trial.BaseTrial):
if isinstance(trial, optuna.Trial):
return trial.params
if is_ray_tune_available():
if isinstance(trial, dict):
@ -230,7 +230,7 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
if trainer.args.process_index == 0:
def _objective(trial: optuna.Trial, checkpoint_dir=None):
def _objective(trial, checkpoint_dir=None):
checkpoint = None
if checkpoint_dir:
for subdir in os.listdir(checkpoint_dir):
@ -240,11 +240,10 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
if trainer.args.world_size > 1:
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
trainer.hp_space(trial)
fixed_trial = optuna.trial.FixedTrial(trial.params, trial.number)
trial_main_rank_list = [fixed_trial]
torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
trainer._hp_search_setup(trial)
args_main_rank_list = [pickle.dumps(trainer.args)]
torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=checkpoint)
else:
trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
# If there hasn't been any evaluation during the training loop.
@ -269,11 +268,15 @@ def run_hp_search_optuna(trainer, n_trials: int, direction: str, **kwargs) -> Be
else:
for i in range(n_trials):
trainer.objective = None
trial_main_rank_list = [None]
args_main_rank_list = [None]
if trainer.args.parallel_mode != ParallelMode.DISTRIBUTED:
raise RuntimeError("only support DDP optuna HPO for ParallelMode.DISTRIBUTED currently.")
torch.distributed.broadcast_object_list(trial_main_rank_list, src=0)
trainer.train(resume_from_checkpoint=None, trial=trial_main_rank_list[0])
torch.distributed.broadcast_object_list(args_main_rank_list, src=0)
args = pickle.loads(bytes(args_main_rank_list[0]))
for key, value in asdict(args).items():
if key != "local_rank":
setattr(trainer.args, key, value)
trainer.train(resume_from_checkpoint=None)
# If there hasn't been any evaluation during the training loop.
if getattr(trainer, "objective", None) is None:
metrics = trainer.evaluate()
@ -915,7 +918,7 @@ class WandbCallback(TrainerCallback):
if self._log_model.is_enabled and self._initialized and state.is_world_process_zero:
from ..trainer import Trainer
fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer, eval_dataset=["fake"])
fake_trainer = Trainer(args=args, model=model, processing_class=tokenizer)
with tempfile.TemporaryDirectory() as temp_dir:
fake_trainer.save_model(temp_dir)
metadata = (
View File
@ -15,7 +15,7 @@
# limitations under the License.
import re
from typing import Dict, NamedTuple, Optional
from typing import Dict, Optional
import numpy as np
from tqdm import tqdm
@ -55,200 +55,6 @@ GGUF_TO_TRANSFORMERS_MAPPING = {
GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["tensors"].keys())
class GGUFTensor(NamedTuple):
weights: np.ndarray
name: str
metadata: dict
class TensorProcessor:
def __init__(self, config=None):
self.config = config or {}
def process(self, weights, name, **kwargs):
return GGUFTensor(weights, name, {})
class LlamaTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if ".attn_k." in name or ".attn_q." in name:
num_heads = self.config.get("num_attention_heads")
num_kv_heads = self.config.get("num_key_value_heads")
if None in (num_heads, num_kv_heads):
return GGUFTensor(weights, name, {})
if ".attn_q." in name:
weights = self._reverse_permute_weights(weights, num_heads, num_heads)
elif ".attn_k." in name:
weights = self._reverse_permute_weights(weights, num_heads, num_kv_heads)
return GGUFTensor(weights, name, {})
def _reverse_permute_weights(
self, weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None
) -> np.ndarray:
# Original permutation implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
if num_kv_heads is not None and n_head != num_kv_heads:
n_head = num_kv_heads
dim = weights.shape[0] // n_head // 2
w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
return w.swapaxes(2, 1).reshape(weights.shape)
class Qwen2MoeTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "_exp" in name:
tensor_key_mapping = kwargs.get("tensor_key_mapping")
parsed_parameters = kwargs.get("parsed_parameters")
if tensor_key_mapping:
self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
return GGUFTensor(weights, None, {})
if "ffn_gate_inp_shexp" in name:
# for compatibility tensor shared_expert_gate must be (1, 2048) dim,
# quantized one is (2048)
weights = np.expand_dims(weights, axis=0)
return GGUFTensor(weights, name, {})
def _split_moe_expert_tensor(
self, weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
):
# Original merge implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
exp_name = ""
if "ffn_gate_exps" in name:
exp_name = "gate_proj"
elif "ffn_down_exps" in name:
exp_name = "down_proj"
elif "ffn_up_exps" in name:
exp_name = "up_proj"
else:
raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
for tensor_name in tensor_key_mapping:
if tensor_name in name:
name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
w_counter = self.config.get("num_experts", 60)
for i in range(0, w_counter):
temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
exp_weight = weights[i]
parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))
class BloomTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "attn_qkv" in name:
num_heads = self.config["n_head"]
n_embed = self.config["hidden_size"]
if "weight" in name:
weights = self._reverse_reshape_weights(weights, num_heads, n_embed)
else:
weights = self._reverse_reshape_bias(weights, num_heads, n_embed)
return GGUFTensor(weights, name, {})
def _reverse_reshape_weights(self, weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
q, k, v = np.array_split(weights, 3, axis=0)
q = q.reshape(n_head, n_embed // n_head, n_embed)
k = k.reshape(n_head, n_embed // n_head, n_embed)
v = v.reshape(n_head, n_embed // n_head, n_embed)
qkv_weights = np.stack([q, k, v], axis=1)
return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
def _reverse_reshape_bias(self, weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
q_bias, k_bias, v_bias = np.array_split(weights, 3)
q_bias = q_bias.reshape(n_head, n_embed // n_head)
k_bias = k_bias.reshape(n_head, n_embed // n_head)
v_bias = v_bias.reshape(n_head, n_embed // n_head)
qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
return qkv_bias
class T5TensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
bid = None
for chunk in name.split("."):
if chunk.isdigit():
bid = int(chunk)
break
return GGUFTensor(weights, name, {"bid": bid})
class GPT2TensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
# Original transpose implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
if (
"attn_qkv.weight" in name
or "ffn_down.weight" in name
or "ffn_up.weight" in name
or "attn_output.weight" in name
):
weights = weights.T
# Handle special case for output.weight
if name == "output.weight":
# output.weight has conflicts with attn_output.weight in name checking
# Store the tensor directly and signal to skip further processing
name = "lm_head.weight"
parsed_parameters = kwargs.get("parsed_parameters", {})
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
name = None # Signal to skip further processing
return GGUFTensor(weights, name, {})
class MambaTensorProcessor(TensorProcessor):
def __init__(self, config=None):
super().__init__(config=config)
def process(self, weights, name, **kwargs):
if "ssm_d" in name and "bias" not in name and "weight" not in name:
# ssm_d has conflicts with ssm_dt in name checking
# we have to explicitly check that name is exactly ssm_d
name = name.replace("ssm_d", "mixer.D")
if "ssm_conv1d.weight" in name:
# for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
# quantized one is (5120, 4)
weights = np.expand_dims(weights, axis=1)
if "ssm_a" in name:
# Original exponential implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977
weights = np.log(-weights)
return GGUFTensor(weights, name, {})
TENSOR_PROCESSORS = {
"llama": LlamaTensorProcessor,
"qwen2moe": Qwen2MoeTensorProcessor,
"bloom": BloomTensorProcessor,
"t5": T5TensorProcessor,
"t5encoder": T5TensorProcessor,
"gpt2": GPT2TensorProcessor,
"mamba": MambaTensorProcessor,
}
def read_field(reader, field):
value = reader.fields[field]
return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
@ -371,28 +177,73 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
if return_tensors:
tensor_key_mapping = GGUF_TO_TRANSFORMERS_MAPPING["tensors"][architecture + model_size]
config = parsed_parameters.get("config", {})
ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor)
processor = ProcessorClass(config=config)
for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."):
name = tensor.name
weights = dequantize(tensor.data, tensor.tensor_type)
result = processor.process(
weights=weights,
name=name,
tensor_key_mapping=tensor_key_mapping,
parsed_parameters=parsed_parameters,
)
if architecture == "llama" and (".attn_k." in name or ".attn_q." in name):
num_heads = parsed_parameters["config"]["num_attention_heads"]
num_kv_heads = parsed_parameters["config"]["num_key_value_heads"]
if ".attn_q." in name:
weights = reverse_permute_weights(weights, num_heads, num_heads)
elif ".attn_k." in name:
weights = reverse_permute_weights(weights, num_heads, num_kv_heads)
weights = result.weights
name = result.name
bid = result.metadata.get("bid")
if architecture == "qwen2moe":
if "_exp" in name:
split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping)
continue
if "ffn_gate_inp_shexp" in name:
# for compatibility tensor shared_expert_gate must be (1, 2048) dim,
# quantized one is (2048)
weights = np.expand_dims(weights, axis=0)
if name is None:
continue
if architecture == "bloom" and "attn_qkv" in name:
num_heads = parsed_parameters["config"]["n_head"]
n_embed = parsed_parameters["config"]["hidden_size"]
if "weight" in name:
weights = reverse_reshape_weights(weights, num_heads, n_embed)
else:
weights = reverse_reshape_bias(weights, num_heads, n_embed)
bid = None
if architecture in ("t5", "t5encoder"):
for chunk in name.split("."):
if chunk.isdigit():
bid = int(chunk)
break
if architecture == "gpt2":
if (
"attn_qkv.weight" in name
or "ffn_down.weight" in name
or "ffn_up.weight" in name
or "attn_output.weight" in name
):
# Original transpose implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061
weights = weights.T
if name == "output.weight":
# output.weight has conflicts with attn_output.weight in name checking
# we have to explicitly check that name is exactly output.weight
name = "lm_head.weight"
parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights))
continue
if architecture == "mamba":
if "ssm_d" in name and "bias" not in name and "weight" not in name:
# ssm_d has conflicts with ssm_dt in name checking
# we have to explicitly check that name is exactly ssm_d
name = name.replace("ssm_d", "mixer.D")
if "ssm_conv1d.weight" in name:
# for compatibility tensor ssm_conv1d must be (5120, 1, 4) dim,
# quantized one is (5120, 4)
weights = np.expand_dims(weights, axis=1)
if "ssm_a" in name:
# Original exponential implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977
weights = np.log(-weights)
for tensor_name in tensor_key_mapping:
if tensor_name.format(bid=bid) in name:
@ -405,3 +256,64 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False):
logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}")
return parsed_parameters
def reverse_permute_weights(weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None) -> np.ndarray:
# Original permutation implementation
# https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408
if num_kv_heads is not None and n_head != num_kv_heads:
n_head = num_kv_heads
dim = weights.shape[0] // n_head // 2
w = weights.reshape(n_head, dim, 2, *weights.shape[1:])
return w.swapaxes(2, 1).reshape(weights.shape)
def reverse_reshape_weights(weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985
q, k, v = np.array_split(weights, 3, axis=0)
q = q.reshape(n_head, n_embed // n_head, n_embed)
k = k.reshape(n_head, n_embed // n_head, n_embed)
v = v.reshape(n_head, n_embed // n_head, n_embed)
qkv_weights = np.stack([q, k, v], axis=1)
return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed)
def reverse_reshape_bias(weights: np.ndarray, n_head: int, n_embed: int):
# Original reshape implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998
q_bias, k_bias, v_bias = np.array_split(weights, 3)
q_bias = q_bias.reshape(n_head, n_embed // n_head)
k_bias = k_bias.reshape(n_head, n_embed // n_head)
v_bias = v_bias.reshape(n_head, n_embed // n_head)
qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten()
return qkv_bias
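A quick shape check of the two Bloom helpers above with toy sizes (n_head=2, n_embed=8); the outputs use Bloom's fused, head-interleaved Q/K/V layout.

import numpy as np

n_head, n_embed = 2, 8
qkv_weight = np.random.randn(3 * n_embed, n_embed).astype(np.float32)  # [Q; K; V] stacked along axis 0
qkv_bias = np.random.randn(3 * n_embed).astype(np.float32)
print(reverse_reshape_weights(qkv_weight, n_head, n_embed).shape)  # (24, 8)
print(reverse_reshape_bias(qkv_bias, n_head, n_embed).shape)       # (24,)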
def split_moe_expert_tensor(
weights: np.ndarray, parsed_parameters: Dict[str, Dict], name: str, tensor_key_mapping: dict
):
# Original merge implementation
# https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022
exp_name = ""
if "ffn_gate_exps" in name:
exp_name = "gate_proj"
elif "ffn_down_exps" in name:
exp_name = "down_proj"
elif "ffn_up_exps" in name:
exp_name = "up_proj"
else:
raise ValueError(f"Cannot map expert tensor {name} in Qwen2Moe architecture.")
for tensor_name in tensor_key_mapping:
if tensor_name in name:
name = name.replace(tensor_name, tensor_key_mapping[tensor_name])
w_counter = parsed_parameters["config"].get("num_experts", 60)
for i in range(0, w_counter):
temp_name = name.replace(".weight", f".{i}.{exp_name}.weight")
exp_weight = weights[i]
parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight))

View File

@ -52,11 +52,9 @@ from .pytorch_utils import ( # noqa: F401
find_pruneable_heads_and_indices,
id_tensor_storage,
is_torch_greater_or_equal_than_1_13,
is_torch_greater_or_equal_than_2_4,
prune_conv1d_layer,
prune_layer,
prune_linear_layer,
translate_to_torch_parallel_style,
)
from .quantizers import AutoHfQuantizer, HfQuantizer
from .quantizers.quantizers_utils import get_module_from_name
@ -96,7 +94,7 @@ from .utils import (
replace_return_docstrings,
strtobool,
)
from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files
from .utils.hub import convert_file_size_to_int, create_and_tag_model_card, get_checkpoint_shard_files
from .utils.import_utils import (
ENV_VARS_TRUE_VALUES,
is_sagemaker_mp_enabled,
@ -139,7 +137,6 @@ logger = logging.get_logger(__name__)
_init_weights = True
_is_quantized = False
_is_ds_init_called = False
def is_fsdp_enabled():
@ -227,19 +224,6 @@ def set_quantized_state():
_is_quantized = False
# Skip recursive calls to deepspeed.zero.Init to avoid pinning errors.
# This issue occurs with ZeRO stage 3 when using NVMe offloading.
# For more details, refer to issue #34429.
@contextmanager
def set_zero3_state():
global _is_ds_init_called
_is_ds_init_called = True
try:
yield
finally:
_is_ds_init_called = False
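The context manager is a plain re-entrancy guard; a toy check of its contract (assuming module-level access to the flag):

with set_zero3_state():
    assert _is_ds_init_called      # nested deepspeed.zero.Init calls are skipped in here
assert not _is_ds_init_called      # the flag is restored even if the body raises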
def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
@ -375,9 +359,6 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
Note: We fully disable this if we are using `deepspeed`
"""
if model_to_load.device.type == "meta":
return False
if len([key for key in state_dict if key.startswith(start_prefix)]) == 0:
return False
@ -392,7 +373,7 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
# If the model does, the incoming `state_dict` and the `model_to_load` must be the same dtype
first_key = next(iter(model_to_load.state_dict().keys()))
first_key = list(model_to_load.state_dict().keys())[0]
if start_prefix + first_key in state_dict:
return state_dict[start_prefix + first_key].dtype == model_to_load.state_dict()[first_key].dtype
@ -400,6 +381,92 @@ def check_support_param_buffer_assignment(model_to_load, state_dict, start_prefi
return False
def shard_checkpoint(
state_dict: Dict[str, torch.Tensor], max_shard_size: Union[int, str] = "10GB", weights_name: str = WEIGHTS_NAME
):
"""
Splits a model state dictionary into sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
given size.
The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no
optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the
limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB],
[6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB].
<Tip warning={true}>
If one of the model's weights is bigger than `max_shard_size`, it will end up in its own sub-checkpoint, which will
have a size greater than `max_shard_size`.
</Tip>
Args:
state_dict (`Dict[str, torch.Tensor]`): The state dictionary of a model to save.
max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`):
The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit
(like `"5MB"`).
weights_name (`str`, *optional*, defaults to `"pytorch_model.bin"`):
The name of the model save file.
"""
logger.warning(
"Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using "
"split_torch_state_dict_into_shards from huggingface_hub library"
)
max_shard_size = convert_file_size_to_int(max_shard_size)
sharded_state_dicts = [{}]
last_block_size = 0
total_size = 0
storage_id_to_block = {}
for key, weight in state_dict.items():
# when bnb serialization is used the weights in the state dict can be strings
# check: https://github.com/huggingface/transformers/pull/24416 for more details
if isinstance(weight, str):
continue
else:
storage_id = id_tensor_storage(weight)
# If a `weight` shares the same underlying storage as another tensor, we put `weight` in the same `block`
if storage_id in storage_id_to_block and weight.device != torch.device("meta"):
block_id = storage_id_to_block[storage_id]
sharded_state_dicts[block_id][key] = weight
continue
weight_size = weight.numel() * dtype_byte_size(weight.dtype)
# If this weight would tip the current shard over the maximum size, we start a new shard, but only
# if the current shard already holds at least one weight.
if last_block_size + weight_size > max_shard_size and len(sharded_state_dicts[-1]) > 0:
sharded_state_dicts.append({})
last_block_size = 0
sharded_state_dicts[-1][key] = weight
last_block_size += weight_size
total_size += weight_size
storage_id_to_block[storage_id] = len(sharded_state_dicts) - 1
# If we only have one shard, we return it
if len(sharded_state_dicts) == 1:
return {weights_name: sharded_state_dicts[0]}, None
# Otherwise, let's build the index
weight_map = {}
shards = {}
for idx, shard in enumerate(sharded_state_dicts):
shard_file = weights_name.replace(".bin", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.bin")
shard_file = shard_file.replace(
".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors"
)
shards[shard_file] = shard
for key in shard.keys():
weight_map[key] = shard_file
# Add the metadata
metadata = {"total_size": total_size}
index = {"metadata": metadata, "weight_map": weight_map}
return shards, index
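A toy run of `shard_checkpoint`: two fp32 tensors of exactly 1 048 576 bytes each, with a "1MB" (10**6 bytes) limit, land in separate shards; the sizes are illustrative only.

import torch

state_dict = {
    "a.weight": torch.zeros(256, 1024),  # 256 * 1024 * 4 bytes
    "b.weight": torch.zeros(256, 1024),
}
shards, index = shard_checkpoint(state_dict, max_shard_size="1MB")
print(list(shards))  # ['pytorch_model-00001-of-00002.bin', 'pytorch_model-00002-of-00002.bin']
print(index["metadata"]["total_size"])  # 2097152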
def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True):
"""
This is the same as
@ -946,10 +1013,7 @@ def _load_state_dict_into_meta_model(
param_to = "cpu"
if is_fsdp_enabled() and not is_local_dist_rank_0():
param_to = "meta"
val_kwargs = {}
if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params":
val_kwargs["requires_grad"] = False
value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__)
value = type(value)(value.data.to(param_to), **value.__dict__)
setattr(module, tensor_name, value)
# TODO: consider removing used param_parts from state_dict before return
@ -1345,12 +1409,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Has support for a `QuantoQuantizedCache` instance as `past_key_values`
_supports_quantized_cache = False
# A tensor parallel plan to be applied to the model when TP is enabled. For
# top-level models, this attribute is currently defined in respective model
# code. For base models, this attribute comes from
# `config.base_model_tp_plan` during `post_init`.
_tp_plan = None
@property
def dummy_inputs(self) -> Dict[str, torch.Tensor]:
"""
@ -1395,9 +1453,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"""
self.init_weights()
self._backward_compatibility_gradient_checkpointing()
# If current model is a base model, attach `base_model_tp_plan` from config
if self.base_model is self:
self._tp_plan = self.config.base_model_tp_plan
def dequantize(self):
"""
@ -1487,14 +1542,13 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
torch_dtype=torch_dtype,
)
if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called:
if is_deepspeed_zero3_enabled() and not _is_quantized:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
# this immediately partitions the model across all gpus, to avoid the overhead in time
# and memory copying it on CPU or each GPU first
init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()]
with ContextManagers(init_contexts):
with deepspeed.zero.Init(config_dict_or_path=deepspeed_config()):
model = cls(config, **kwargs)
else:
@ -1537,7 +1591,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"eager",
"sdpa",
"flash_attention_2",
"flex_attention",
]:
message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
if cls._supports_flash_attn_2:
@ -3429,11 +3482,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Cache path to the GGUF file
gguf_path = None
tp_plan = kwargs.pop("tp_plan", None)
if tp_plan is not None and tp_plan != "auto":
# TODO: we can relax this check when we support taking tp_plan from a json file, for example.
raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.")
if is_fsdp_enabled():
low_cpu_mem_usage = True
@ -3617,11 +3665,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if hf_quantizer is not None:
hf_quantizer.validate_environment(
torch_dtype=torch_dtype,
from_tf=from_tf,
from_flax=from_flax,
device_map=device_map,
weights_only=weights_only,
torch_dtype=torch_dtype, from_tf=from_tf, from_flax=from_flax, device_map=device_map
)
torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype)
device_map = hf_quantizer.update_device_map(device_map)
@ -4039,32 +4083,18 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Instantiate model.
init_contexts = [no_init_weights(_enable=_fast_init)]
tp_device = None
if is_deepspeed_zero3_enabled() and not is_quantized and not _is_ds_init_called:
if is_deepspeed_zero3_enabled() and not is_quantized:
import deepspeed
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
init_contexts = [
deepspeed.zero.Init(config_dict_or_path=deepspeed_config()),
set_zero3_state(),
] + init_contexts
init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config())] + init_contexts
elif low_cpu_mem_usage:
if not is_accelerate_available():
raise ImportError(
f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
)
init_contexts.append(init_empty_weights())
elif tp_plan is not None:
if not torch.distributed.is_initialized():
raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")
# Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
device_type = torch._C._get_accelerator().type
device_module = torch.get_device_module(device_type)
# Get device with index assuming equal number of devices per host
tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
init_contexts.append(tp_device)
if is_deepspeed_zero3_enabled() and is_quantized:
init_contexts.append(set_quantized_state())
@ -4198,38 +4228,32 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
load_contexts = []
# Make sure we load onto targeted device
if tp_device is not None:
load_contexts.append(tp_device)
with ContextManagers(load_contexts):
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
# make sure token embedding weights are still tied if needed
model.tie_weights()
@ -4313,16 +4337,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
}
return model, loading_info
if tp_plan is not None:
assert tp_device is not None, "tp_device not set!"
if not model.supports_tp_plan:
raise NotImplementedError("This model does not have a tensor parallel plan.")
# Assuming sharding the model onto the world
world_size = torch.distributed.get_world_size()
device_mesh = torch.distributed.init_device_mesh(tp_device.type, (world_size,))
# Apply Tensor Parallelism
model.tensor_parallel(device_mesh)
return model
@classmethod
@ -5012,56 +5026,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return self.hf_quantizer.is_trainable
@property
def supports_tp_plan(self):
"""
Returns whether the model has a tensor parallelism plan.
"""
if self._tp_plan is not None:
return True
# Check if base model has a TP plan
if getattr(self.base_model, "_tp_plan", None) is not None:
return True
return False
def tensor_parallel(self, device_mesh):
"""
Tensor parallelize the model across the given device mesh.
Args:
device_mesh (`torch.distributed.DeviceMesh`):
The device mesh to use for tensor parallelism.
"""
if not is_torch_greater_or_equal_than_2_4:
raise EnvironmentError("tensor parallel is only supported for `torch>=2.4`.")
# Tensor parallelize a nn.Module based on the `_tp_plan` attribute of the module.
# No op if `_tp_plan` attribute does not exist under the module.
# This is a helper function to be used with `model.apply` to recursively
# parallelize a model.
def tplize(mod: torch.nn.Module) -> None:
tp_plan = getattr(mod, "_tp_plan", None)
if tp_plan is None:
return
logger.debug(f"Applying tensor parallel to {mod.__class__.__name__}: {tp_plan}")
# In model configs, we use a neutral type (string) to specify
# parallel styles, here we translate them into torch TP types.
# Using tree_map because `tp_plan` is a dict.
tp_plan = torch.utils._pytree.tree_map(
translate_to_torch_parallel_style,
tp_plan,
)
# Apply TP to current module.
torch.distributed.tensor.parallel.parallelize_module(
mod,
device_mesh=device_mesh,
parallelize_plan=tp_plan,
)
# `apply` is a native method of `nn.Module` that recursively applies a
# function to every submodule.
self.apply(tplize)
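A hedged end-to-end sketch of the TP path wired up above, assuming `torchrun` with one process per GPU and a checkpoint whose config defines `base_model_tp_plan`; the model id is a placeholder.

# torchrun --nproc-per-node 4 tp_demo.py
import os
import torch
from transformers import AutoModelForCausalLM

torch.distributed.init_process_group("nccl")
torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
# tp_plan="auto" routes through the code above: per-rank device init,
# then tensor_parallel(device_mesh) over the whole world.
model = AutoModelForCausalLM.from_pretrained("org/llama-style-model", tp_plan="auto")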
@property
def loss_function(self):
if getattr(self.config, "loss_type", None) is not None:

View File

@ -177,7 +177,6 @@ from . import (
nougat,
nystromformer,
olmo,
olmo_1124,
olmoe,
omdet_turbo,
oneformer,

View File

@ -195,7 +195,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("nougat", "VisionEncoderDecoderConfig"),
("nystromformer", "NystromformerConfig"),
("olmo", "OlmoConfig"),
("olmo_1124", "Olmo1124Config"),
("olmoe", "OlmoeConfig"),
("omdet-turbo", "OmDetTurboConfig"),
("oneformer", "OneFormerConfig"),
@ -511,7 +510,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("nougat", "Nougat"),
("nystromformer", "Nyströmformer"),
("olmo", "OLMo"),
("olmo_1124", "OLMo November 2024"),
("olmoe", "OLMoE"),
("omdet-turbo", "OmDet-Turbo"),
("oneformer", "OneFormer"),

View File

@ -68,7 +68,7 @@ else:
("convnextv2", ("ConvNextImageProcessor",)),
("cvt", ("ConvNextImageProcessor",)),
("data2vec-vision", ("BeitImageProcessor",)),
("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")),
("deformable_detr", ("DeformableDetrImageProcessor",)),
("deit", ("DeiTImageProcessor",)),
("depth_anything", ("DPTImageProcessor",)),
("deta", ("DetaImageProcessor",)),

View File

@ -184,7 +184,6 @@ MODEL_MAPPING_NAMES = OrderedDict(
("nllb-moe", "NllbMoeModel"),
("nystromformer", "NystromformerModel"),
("olmo", "OlmoModel"),
("olmo_1124", "Olmo1124Model"),
("olmoe", "OlmoeModel"),
("omdet-turbo", "OmDetTurboForObjectDetection"),
("oneformer", "OneFormerModel"),
@ -517,7 +516,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("mvp", "MvpForCausalLM"),
("nemotron", "NemotronForCausalLM"),
("olmo", "OlmoForCausalLM"),
("olmo_1124", "Olmo1124ForCausalLM"),
("olmoe", "OlmoeForCausalLM"),
("open-llama", "OpenLlamaForCausalLM"),
("openai-gpt", "OpenAIGPTLMHeadModel"),

View File

@ -348,7 +348,6 @@ else:
),
),
("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmo_1124", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)),
(
"omdet-turbo",

View File

@ -249,7 +249,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itm"
)
logits = hf_model(
pixel_values=pixel_values,
pixel_values=original_pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=True,
@ -274,7 +274,7 @@ def convert_blip2_checkpoint(
{"image": original_pixel_values, "text_input": [caption]}, match_head="itc"
)
logits = hf_model(
pixel_values=pixel_values,
pixel_values=original_pixel_values,
input_ids=input_ids,
attention_mask=attention_mask,
use_image_text_matching_head=False,

View File

@ -2203,7 +2203,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -2326,7 +2326,7 @@ class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(

View File

@ -153,7 +153,7 @@ class Blip2Processor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in BLIP-2 should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type

View File

@ -890,7 +890,7 @@ class CohereModel(CoherePreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -1068,7 +1068,7 @@ class CohereModel(CoherePreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with Llama->Cohere
class CohereForCausalLM(CoherePreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -29,7 +29,6 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["feature_extraction_deformable_detr"] = ["DeformableDetrFeatureExtractor"]
_import_structure["image_processing_deformable_detr"] = ["DeformableDetrImageProcessor"]
_import_structure["image_processing_deformable_detr_fast"] = ["DeformableDetrImageProcessorFast"]
try:
if not is_torch_available():
@ -55,7 +54,6 @@ if TYPE_CHECKING:
else:
from .feature_extraction_deformable_detr import DeformableDetrFeatureExtractor
from .image_processing_deformable_detr import DeformableDetrImageProcessor
from .image_processing_deformable_detr_fast import DeformableDetrImageProcessorFast
try:
if not is_torch_available():

View File

@ -224,16 +224,16 @@ class DepthAnythingFeatureFusionStage(nn.Module):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
fused_hidden_state = None
# first layer only uses the last hidden_state
size = hidden_states[1].shape[2:]
fused_hidden_state = self.layers[0](hidden_states[0], size=size)
fused_hidden_states.append(fused_hidden_state)
for idx, (hidden_state, layer) in enumerate(zip(hidden_states, self.layers)):
size = hidden_states[idx + 1].shape[2:] if idx != (len(hidden_states) - 1) else None
# looping from the last layer to the second
for idx, (hidden_state, layer) in enumerate(zip(hidden_states[1:], self.layers[1:])):
size = hidden_states[1:][idx + 1].shape[2:] if idx != (len(hidden_states[1:]) - 1) else None
if fused_hidden_state is None:
# first layer only uses the last hidden_state
fused_hidden_state = layer(hidden_state, size=size)
else:
fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_state = layer(fused_hidden_state, hidden_state, size=size)
fused_hidden_states.append(fused_hidden_state)

View File

@ -416,7 +416,7 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
created using from_dict and kwargs e.g. `DetrImageProcessorFast.from_pretrained(checkpoint, size=600,
created using from_dict and kwargs e.g. `DetrImageProcessor.from_pretrained(checkpoint, size=600,
max_size=800)`
"""
image_processor_dict = image_processor_dict.copy()
@ -863,7 +863,6 @@ class DetrImageProcessorFast(BaseImageProcessorFast):
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.LAST:
images = [image.permute(2, 0, 1).contiguous() for image in images]
input_data_format = ChannelDimension.FIRST
if do_rescale and do_normalize:
# fused rescale and normalize

View File

@ -689,13 +689,12 @@ class DPTFeatureFusionStage(nn.Module):
hidden_states = hidden_states[::-1]
fused_hidden_states = []
fused_hidden_state = None
for hidden_state, layer in zip(hidden_states, self.layers):
if fused_hidden_state is None:
# first layer only uses the last hidden_state
fused_hidden_state = layer(hidden_state)
else:
fused_hidden_state = layer(fused_hidden_state, hidden_state)
# first layer only uses the last hidden_state
fused_hidden_state = self.layers[0](hidden_states[0])
fused_hidden_states.append(fused_hidden_state)
# looping from the last layer to the second
for hidden_state, layer in zip(hidden_states[1:], self.layers[1:]):
fused_hidden_state = layer(fused_hidden_state, hidden_state)
fused_hidden_states.append(fused_hidden_state)
return fused_hidden_states
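The restructured loop makes the fusion order explicit: the deepest feature is refined first, then fused with progressively shallower ones. A minimal stand-in with strings instead of feature maps traces the same data flow:

feats = ["f3", "f2", "f1", "f0"]   # hidden states already reversed: deepest first
fused = f"layer0({feats[0]})"      # first layer sees only the deepest feature
outputs = [fused]
for i, feat in enumerate(feats[1:], start=1):
    fused = f"layer{i}({fused}, {feat})"   # fuse with the next shallower feature
    outputs.append(fused)
print(outputs[-1])  # layer3(layer2(layer1(layer0(f3), f2), f1), f0)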

View File

@ -1277,18 +1277,12 @@ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
num_logits_to_keep: int = 0,
) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
`labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
num_logits_to_keep (`int`, *optional*):
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
`input_ids` (special case). Only the last token's logits are needed for generation, and computing them
only for that token can save significant memory for long sequences or large vocabulary sizes.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1308,7 +1302,7 @@ class FalconForCausalLM(FalconPreTrainedModel, GenerationMixin):
)
hidden_states = transformer_outputs[0]
lm_logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
lm_logits = self.lm_head(hidden_states)
loss = None
if labels is not None:

View File

@ -346,7 +346,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
):
# Overwritten -- in specific circumstances we don't want to forward image inputs to the model
if past_key_values is not None:
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
@ -355,7 +355,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1:]
position_ids = position_ids[:, -1].unsqueeze(-1)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
@ -377,12 +377,3 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
}
)
return model_inputs
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
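For reference, what the removed `_reorder_cache` did on a toy cached state: rows are gathered according to the beam indices chosen at the current step (values are illustrative).

import torch

past_state = torch.arange(6).reshape(3, 2)   # 3 beams, toy per-layer state
beam_idx = torch.tensor([2, 0, 0])           # beam 2 picked once, beam 0 twice
print(past_state.index_select(0, beam_idx))  # rows reordered to [2, 0, 0]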

View File

@ -720,10 +720,7 @@ class GemmaModel(GemmaPreTrainedModel):
[GemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = GemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warning("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -808,7 +805,7 @@ class GemmaModel(GemmaPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -985,7 +982,6 @@ class GemmaModel(GemmaPreTrainedModel):
class GemmaForCausalLM(GemmaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)

View File

@ -886,7 +886,7 @@ class GemmaModel(LlamaModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -41,7 +41,7 @@ from ...utils import (
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
is_torch_greater_or_equal,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
@ -51,8 +51,6 @@ from .configuration_gemma2 import Gemma2Config
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
if is_torch_greater_or_equal("2.5"):
from torch.nn.attention.flex_attention import flex_attention
logger = logging.get_logger(__name__)
@ -170,127 +168,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
def eager_attention_forward(config, query, key, value, mask, **_kwargs):
key_states = repeat_kv(key, config.num_key_value_groups)
value_states = repeat_kv(value, config.num_key_value_groups)
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
if config.attn_logit_softcapping is not None:
attn_weights = attn_weights / config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * config.attn_logit_softcapping
if mask is not None: # no matter the length, we just slice it
causal_mask = mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
if mask is not None:
seq_len = mask.shape[1]
query = query[:, :, :seq_len]
value = value[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout
# [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
query_states = query.transpose(1, 2)
key_states = key.transpose(1, 2)
value_states = value.transpose(1, 2)
dropout_rate = config.attention_dropout if config.training else 0.0
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
mask,
seq_len,
dropout=dropout_rate,
softmax_scale=config.scaling,
is_causal=config.is_causal,
sliding_window=config.sliding_window,
use_top_left_mask=config._flash_attn_uses_top_left_mask,
softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
return attn_output, None
def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
def tanh_softcap(score, b, h, q_idx, kv_idx):
soft_cap = config.attn_logit_softcapping
score = soft_cap * torch.tanh(score / soft_cap)
if mask is not None:
return score + mask[b][0][q_idx][kv_idx]
return score
attn_output = flex_attention(
query,
key,
value,
score_mod=tanh_softcap,
enable_gqa=True,
scale=config.scaling,
return_lse=output_attentions,
)
if not output_attentions:
return attn_output, None
else:
return attn_output[0], attn_output[1]
def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
key = repeat_kv(key, config.num_key_value_groups)
value = repeat_kv(value, config.num_key_value_groups)
causal_mask = mask
if mask is not None:
causal_mask = causal_mask[:, :, :, : key.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query.device.type == "cuda" and causal_mask is not None:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and query.shape[1] > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query,
key,
value,
attn_mask=causal_mask,
dropout_p=config.attention_dropout if config.training else 0.0,
is_causal=is_causal,
scale=config.scaling,
)
return attn_output, None
GEMMA2_ATTENTION_FUNCTION = {
"flash_attention_2": flash_attention_forward,
"flex_attention": flex_attention_forward,
"eager": eager_attention_forward,
"sdpa": sdpa_attention_forward,
}
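The eager and flex paths above apply Gemma 2's logit softcapping explicitly (the flash path does it inside the kernel via `softcap`); a standalone check of the `cap * tanh(logits / cap)` squashing, with an assumed cap of 50:

import torch

cap = 50.0
logits = torch.tensor([-200.0, -10.0, 0.0, 10.0, 200.0])
print(cap * torch.tanh(logits / cap))
# tensor([-49.9665,  -9.8688,   0.0000,   9.8688,  49.9665])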
class Gemma2Attention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
@ -298,6 +175,12 @@ class Gemma2Attention(nn.Module):
super().__init__()
self.config = config
self.layer_idx = layer_idx
if layer_idx is None:
logger.warning_once(
f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
"lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
"when creating this class."
)
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
@ -309,8 +192,7 @@ class Gemma2Attention(nn.Module):
self.rope_theta = config.rope_theta
self.is_causal = True
self.scaling = config.query_pre_attn_scalar**-0.5
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
self.attn_logit_softcapping = config.attn_logit_softcapping
if self.hidden_size % self.num_heads != 0:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
@ -326,6 +208,7 @@ class Gemma2Attention(nn.Module):
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
def forward(
self,
@ -360,14 +243,145 @@ class Gemma2Attention(nn.Module):
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
attention_type = "eager"
else:
attention_type = self.config._attn_implementation
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
if self.config.attn_logit_softcapping is not None:
attn_weights = attn_weights / self.config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * self.config.attn_logit_softcapping
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
"""
Gemma2 flash attention module. This module inherits from `Gemma2Attention`, as the weights of the module stay
untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which was made the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if attention_mask is not None:
seq_len = attention_mask.shape[1]
key_states = key_states[:, :, :seq_len]
value_states = value_states[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons,
# therefore the input hidden states get silently cast to float32. Hence, we need to
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended to not cast the LayerNorms
# in fp32. (Gemma2RMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
softmax_scale=self.scaling,
is_causal=self.is_causal,
sliding_window=self.sliding_window,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
@ -379,37 +393,116 @@ class Gemma2Attention(nn.Module):
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "flash_attention_2"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
class Gemma2SdpaAttention(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "sdpa"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
"""
Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Gemma2Attention`, as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to
the SDPA API.
"""
# Adapted from Gemma2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
scale=self.scaling,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
GEMMA2_ATTENTION_CLASSES = {
"eager": Gemma2Attention,
"flash_attention_2": Gemma2FlashAttention2,
"sdpa": Gemma2SdpaAttention,
}
class Gemma2DecoderLayer(nn.Module):
def __init__(self, config: Gemma2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
self.self_attn = GEMMA2_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
self.mlp = Gemma2MLP(config)
self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
@ -424,6 +517,25 @@ class Gemma2DecoderLayer(nn.Module):
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`, *optional*):
attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
query_sequence_length, key_sequence_length)` if default attention is used.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
Indices depicting the position of the input sequence tokens in the sequence
kwargs (`dict`, *optional*):
Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code
into the model
"""
if self.is_sliding and attention_mask is not None: # efficient SDPA and no padding
# Flash-attn is a 2D tensor
if self.config._attn_implementation == "flash_attention_2":
@ -628,10 +740,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
[Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warning("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -711,7 +820,7 @@ class Gemma2Model(Gemma2PreTrainedModel):
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -852,7 +961,6 @@ class Gemma2Model(Gemma2PreTrainedModel):
class Gemma2ForCausalLM(Gemma2PreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)

View File

@ -29,17 +29,18 @@ from ...modeling_outputs import (
from ...utils import (
is_flash_attn_2_available,
is_flash_attn_greater_or_equal,
is_torch_greater_or_equal,
is_flash_attn_greater_or_equal_2_10,
logging,
)
from ..gemma.modeling_gemma import (
GemmaAttention,
GemmaDecoderLayer,
GemmaForCausalLM,
GemmaForSequenceClassification,
GemmaForTokenClassification,
GemmaModel,
GemmaPreTrainedModel,
GemmaRMSNorm,
GemmaRotaryEmbedding,
apply_rotary_pos_emb,
repeat_kv,
)
@ -48,9 +49,6 @@ from ..gemma.modeling_gemma import (
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
if is_torch_greater_or_equal("2.5"):
from torch.nn.attention.flex_attention import flex_attention
_CHECKPOINT_FOR_DOC = "google/gemma2-7b"
@ -209,166 +207,13 @@ class Gemma2MLP(nn.Module):
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
class Gemma2RotaryEmbedding(GemmaRotaryEmbedding):
pass
def eager_attention_forward(config, query, key, value, mask, **_kwargs):
key_states = repeat_kv(key, config.num_key_value_groups)
value_states = repeat_kv(value, config.num_key_value_groups)
attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * config.scaling
if config.attn_logit_softcapping is not None:
attn_weights = attn_weights / config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * config.attn_logit_softcapping
if mask is not None: # no matter the length, we just slice it
causal_mask = mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=config.attention_dropout, training=config.training)
attn_output = torch.matmul(attn_weights, value_states)
attn_output = attn_output.transpose(1, 2).contiguous()
return attn_output, attn_weights
def flash_attention_forward(config, query, key, value, mask, target_dtype=torch.float16, **_kwargs):
if mask is not None:
seq_len = mask.shape[1]
query = query[:, :, :seq_len]
value = value[:, :, :seq_len]
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout
# [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor rotary embedding
query_states = query.transpose(1, 2)
key_states = key.transpose(1, 2)
value_states = value.transpose(1, 2)
dropout_rate = config.attention_dropout if config.training else 0.0
input_dtype = query_states.dtype
if input_dtype == torch.float32:
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
mask,
seq_len,
dropout=dropout_rate,
softmax_scale=config.scaling,
is_causal=config.is_causal,
sliding_window=config.sliding_window,
use_top_left_mask=config._flash_attn_uses_top_left_mask,
softcap=config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
return attn_output, None
def flex_attention_forward(config, query, key, value, mask, output_attentions=False, **_kwargs):
def tanh_softcap(score, b, h, q_idx, kv_idx):
soft_cap = config.attn_logit_softcapping
score = soft_cap * torch.tanh(score / soft_cap)
if mask is not None:
return score + mask[b][0][q_idx][kv_idx]
return score
attn_output = flex_attention(
query,
key,
value,
score_mod=tanh_softcap,
enable_gqa=True,
scale=config.scaling,
return_lse=output_attentions,
)
if not output_attentions:
return attn_output, None
else:
return attn_output[0], attn_output[1]
def sdpa_attention_forward(config, query, key, value, mask, **_kwargs):
key = repeat_kv(key, config.num_key_value_groups)
value = repeat_kv(value, config.num_key_value_groups)
causal_mask = mask
if mask is not None:
causal_mask = causal_mask[:, :, :, : key.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query.device.type == "cuda" and causal_mask is not None:
query = query.contiguous()
key = key.contiguous()
value = value.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and query.shape[1] > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query,
key,
value,
attn_mask=causal_mask,
dropout_p=config.attention_dropout if config.training else 0.0,
is_causal=is_causal,
scale=config.scaling,
)
return attn_output, None
GEMMA2_ATTENTION_FUNCTION = {
"flash_attention_2": flash_attention_forward,
"flex_attention": flex_attention_forward,
"eager": eager_attention_forward,
"sdpa": sdpa_attention_forward,
}
class Gemma2Attention(nn.Module):
class Gemma2Attention(GemmaAttention):
"""Multi-headed attention from 'Attention Is All You Need' paper"""
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__()
self.config = config
self.layer_idx = layer_idx
self.attention_dropout = config.attention_dropout
self.hidden_size = config.hidden_size
self.num_heads = config.num_attention_heads
self.head_dim = config.head_dim
self.num_key_value_heads = config.num_key_value_heads
self.num_key_value_groups = self.num_heads // self.num_key_value_heads
self.max_position_embeddings = config.max_position_embeddings
self.rope_theta = config.rope_theta
self.is_causal = True
super().__init__(config, layer_idx)
self.scaling = config.query_pre_attn_scalar**-0.5
self.sliding_window = config.sliding_window if not bool(layer_idx % 2) else None
self.attn_logit_softcapping = config.attn_logit_softcapping
if self.hidden_size % self.num_heads != 0:
raise ValueError(
f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
f" and `num_heads`: {self.num_heads})."
)
self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
self.rotary_emb = Gemma2RotaryEmbedding(
self.head_dim,
max_position_embeddings=self.max_position_embeddings,
base=self.rope_theta,
)
def forward(
self,
@ -403,14 +248,145 @@ class Gemma2Attention(nn.Module):
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if output_attentions and self.config._attn_implementation in ["sdpa", "flash_attention_2"]:
logger.warning_once("Setting `attention_type` to `flex_attention` because `output_attentions=True`")
attention_type = "eager"
else:
attention_type = self.config._attn_implementation
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_output, attn_weights = GEMMA2_ATTENTION_FUNCTION[attention_type](
self, query_states, key_states, value_states, attention_mask, output_attentions=output_attentions
)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling
if self.config.attn_logit_softcapping is not None:
attn_weights = attn_weights / self.config.attn_logit_softcapping
attn_weights = torch.tanh(attn_weights)
attn_weights = attn_weights * self.config.attn_logit_softcapping
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
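The softcapping block in the eager path above is a tanh-based soft clamp: dividing by the cap, applying tanh, and rescaling bounds the logits to (-cap, cap) without hard truncation. A standalone sketch of the same transform (values are illustrative):

```python
import torch

def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # tanh squashes the scaled logits into (-1, 1); multiplying by cap
    # bounds them to (-cap, cap) while staying near-linear around zero
    return cap * torch.tanh(logits / cap)

scores = torch.tensor([-500.0, -5.0, 0.0, 5.0, 500.0])
print(softcap(scores, cap=50.0))  # extreme logits saturate near +/-50; small ones pass through almost unchanged
```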
class Gemma2FlashAttention2(Gemma2Attention):
"""
Gemma2 flash attention module. This module inherits from `Gemma2Attention` as the weights of the module stay
untouched. The only required change is in the forward pass, which needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
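A hypothetical standalone equivalent of the version gate behind `is_flash_attn_greater_or_equal_2_10` (the helper name and fallback behavior here are assumptions, not the library's code): flash-attn switched to bottom-right-aligned causal masks in 2.1, so only older installs need the top-left-mask flag.

```python
from importlib.metadata import PackageNotFoundError, version

from packaging.version import parse

def flash_attn_uses_top_left_mask() -> bool:
    # True only when the installed flash-attn predates the 2.1 mask change
    try:
        return parse(version("flash_attn")) < parse("2.1.0")
    except PackageNotFoundError:
        return False  # flash-attn absent: the flag is never consulted
```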
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x num_heads x head_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
if attention_mask is not None:
seq_len = attention_mask.shape[1]
key_states = key_states[:, :, :seq_len]
value_states = value_states[:, :, :seq_len]
# TODO: These transposes are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, we usually cast the layer norms to float32 for training stability,
# so the input hidden states get silently cast to float32. Hence, we need to
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended not to cast the
# LayerNorms to fp32. (Gemma2RMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
dropout=dropout_rate,
softmax_scale=self.scaling,
is_causal=self.is_causal,
sliding_window=self.sliding_window,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
softcap=self.config.attn_logit_softcapping if is_flash_attn_greater_or_equal("2.6.0") else None,
)
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
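A minimal, self-contained sketch of the dtype-restore logic above (hypothetical tensors and fallback): leave non-fp32 inputs alone, otherwise prefer the active autocast dtype before falling back to a known weight dtype.

```python
import torch

def restore_half_precision(x: torch.Tensor, fallback: torch.dtype) -> torch.Tensor:
    if x.dtype != torch.float32:
        return x  # already half precision, nothing to do
    if torch.is_autocast_enabled():
        return x.to(torch.get_autocast_gpu_dtype())
    return x.to(fallback)

q = torch.randn(2, 16, 8, 64)  # fp32, e.g. silently upcast by fp32 layer norms
q = restore_half_precision(q, fallback=torch.bfloat16)
print(q.dtype)  # torch.bfloat16 when no autocast context is active
```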
@ -422,37 +398,105 @@ class Gemma2Attention(nn.Module):
return attn_output, attn_weights, past_key_value
class Gemma2FlashAttention2(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "flash_attention_2"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
)
class Gemma2SdpaAttention(Gemma2Attention):
def __init__(self, config: Gemma2Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx)
self.config._attn_implementation = "sdpa"
logger.warning_once(
"The `Gemma2FlashAttention2` class is deprecated in favor of simply modifying the `config._attn_implementation`"
"attribute of the `GemmaAttention` class! It will be removed in v4.48"
"""
Gemma2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`Gemma2Attention` as the weights of the module stay untouched. The only changes are in the forward pass, to adapt to
the SDPA API.
"""
# Adapted from Gemma2Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Gemma2Model is using Gemma2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {
"sin": sin,
"cos": cos,
"sliding_window": self.sliding_window,
"cache_position": cache_position,
}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
scale=self.scaling,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, -1)
class Gemma2DecoderLayer(nn.Module):
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
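Both SDPA paths above rely on `repeat_kv` to expand grouped key/value heads before the matmul. A hedged sketch of that expansion (shapes are illustrative): each KV head is tiled `n_rep` times along the head axis so key/value match the number of query heads.

```python
import torch

def repeat_kv_sketch(hidden: torch.Tensor, n_rep: int) -> torch.Tensor:
    batch, num_kv_heads, seq_len, head_dim = hidden.shape
    if n_rep == 1:
        return hidden
    # insert a repeat axis, broadcast, then fold it into the head dimension
    hidden = hidden[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, seq_len, head_dim)
    return hidden.reshape(batch, num_kv_heads * n_rep, seq_len, head_dim)

kv = torch.randn(1, 4, 10, 64)        # 4 KV heads
print(repeat_kv_sketch(kv, 4).shape)  # torch.Size([1, 16, 10, 64]) for 16 query heads
```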
class Gemma2DecoderLayer(GemmaDecoderLayer):
def __init__(self, config: Gemma2Config, layer_idx: int):
super().__init__()
self.hidden_size = config.hidden_size
super().__init__(config, layer_idx)
self.config = config
self.is_sliding = not bool(layer_idx % 2)
self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
self.mlp = Gemma2MLP(config)
self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.sliding_window = config.sliding_window
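The `not bool(layer_idx % 2)` test above alternates attention types by depth. A worked illustration (the window size is hypothetical): even-indexed layers get a sliding window, odd-indexed layers attend globally.

```python
config_sliding_window = 4096
for layer_idx in range(4):
    is_sliding = not bool(layer_idx % 2)
    sliding_window = config_sliding_window if is_sliding else None
    print(layer_idx, sliding_window)  # 0 4096 / 1 None / 2 4096 / 3 None
```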
@ -609,7 +653,7 @@ class Gemma2Model(GemmaModel, Gemma2PreTrainedModel):
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -708,8 +708,6 @@ class GlmModel(GlmPreTrainedModel):
dim=config.head_dim // 2, max_position_embeddings=config.max_position_embeddings, base=config.rope_theta
)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -789,7 +787,7 @@ class GlmModel(GlmPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -969,7 +967,6 @@ class GlmModel(GlmPreTrainedModel):
class GlmForCausalLM(GlmPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config: GlmConfig):
super().__init__(config)

View File

@ -1471,7 +1471,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(
@ -1610,7 +1610,7 @@ class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, Generati
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1)
attention_mask = torch.cat(

View File

@ -148,7 +148,7 @@ class InstructBlipProcessor(ProcessorMixin):
logger.warning_once(
"Expanding inputs for image tokens in InstructBLIP should be done in processing. "
"Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your InstructBLIP model. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# cast to desired return tensors type after concatenating

View File

@ -141,16 +141,6 @@ class LlamaConfig(PretrainedConfig):
model_type = "llama"
keys_to_ignore_at_inference = ["past_key_values"]
# Default tensor parallel plan for base model `LlamaModel`
base_model_tp_plan = {
"layers.*.self_attn.q_proj": "colwise",
"layers.*.self_attn.k_proj": "colwise",
"layers.*.self_attn.v_proj": "colwise",
"layers.*.self_attn.o_proj": "rowwise",
"layers.*.mlp.gate_proj": "colwise",
"layers.*.mlp.up_proj": "colwise",
"layers.*.mlp.down_proj": "rowwise",
}
def __init__(
self,

View File

@ -21,6 +21,7 @@ import math
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
@ -239,7 +240,25 @@ class LlamaMLP(nn.Module):
self.act_fn = ACT2FN[config.hidden_act]
def forward(self, x):
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
if self.config.pretraining_tp > 1:
slice = self.intermediate_size // self.config.pretraining_tp
gate_proj_slices = self.gate_proj.weight.split(slice, dim=0)
up_proj_slices = self.up_proj.weight.split(slice, dim=0)
down_proj_slices = self.down_proj.weight.split(slice, dim=1)
gate_proj = torch.cat(
[F.linear(x, gate_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1
)
up_proj = torch.cat([F.linear(x, up_proj_slices[i]) for i in range(self.config.pretraining_tp)], dim=-1)
intermediate_states = (self.act_fn(gate_proj) * up_proj).split(slice, dim=2)
down_proj = [
F.linear(intermediate_states[i], down_proj_slices[i]) for i in range(self.config.pretraining_tp)
]
down_proj = sum(down_proj)
else:
down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
return down_proj
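The restored `pretraining_tp` branch above splits each projection into column shards and concatenates the partial outputs. A small sketch showing why that is safe (random weights; the tolerance accounts for fp accumulation order):

```python
import torch
import torch.nn.functional as F

tp = 2
x = torch.randn(1, 5, 16)
weight = torch.randn(32, 16)  # [out_features, in_features]

full = F.linear(x, weight)
shards = weight.split(weight.shape[0] // tp, dim=0)           # column-parallel split
sharded = torch.cat([F.linear(x, w) for w in shards], dim=-1)
assert torch.allclose(full, sharded, atol=1e-5)
```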
@ -301,14 +320,31 @@ class LlamaAttention(nn.Module):
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
if self.config.pretraining_tp > 1:
key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
query_slices = self.q_proj.weight.split(
(self.num_heads * self.head_dim) // self.config.pretraining_tp, dim=0
)
key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = [F.linear(hidden_states, query_slices[i]) for i in range(self.config.pretraining_tp)]
query_states = torch.cat(query_states, dim=-1)
key_states = [F.linear(hidden_states, key_slices[i]) for i in range(self.config.pretraining_tp)]
key_states = torch.cat(key_states, dim=-1)
value_states = [F.linear(hidden_states, value_slices[i]) for i in range(self.config.pretraining_tp)]
value_states = torch.cat(value_states, dim=-1)
else:
query_states = self.q_proj(hidden_states)
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -350,7 +386,12 @@ class LlamaAttention(nn.Module):
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.o_proj(attn_output)
if self.config.pretraining_tp > 1:
attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
o_proj_slices = self.o_proj.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
attn_output = sum([F.linear(attn_output[i], o_proj_slices[i]) for i in range(self.config.pretraining_tp)])
else:
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
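The `o_proj` branch above is the complementary row-parallel case: the input features and the weight's input dimension are split in lockstep, and the partial products are summed. A sketch under the same assumptions as the previous one:

```python
import torch
import torch.nn.functional as F

tp = 2
x = torch.randn(1, 5, 32)
weight = torch.randn(16, 32)  # [out_features, in_features]

full = F.linear(x, weight)
w_slices = weight.split(weight.shape[1] // tp, dim=1)  # split input features
x_slices = x.split(x.shape[-1] // tp, dim=-1)
rowwise = sum(F.linear(xs, ws) for xs, ws in zip(x_slices, w_slices))
assert torch.allclose(full, rowwise, atol=1e-5)
```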
@ -523,10 +564,9 @@ class LlamaSdpaAttention(LlamaAttention):
key_states = self.k_proj(hidden_states)
value_states = self.v_proj(hidden_states)
# use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used
query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
if position_embeddings is None:
logger.warning_once(
@ -810,10 +850,7 @@ class LlamaModel(LlamaPreTrainedModel):
)
self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.rotary_emb = LlamaRotaryEmbedding(config=config)
self.gradient_checkpointing = False
if getattr(config, "pretraining_tp", 1) != 1:
logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.")
# Initialize weights and apply final processing
self.post_init()
@ -893,7 +930,7 @@ class LlamaModel(LlamaPreTrainedModel):
all_self_attns = () if output_attentions else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)
@ -1076,7 +1113,6 @@ class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...
class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
_tp_plan = {"lm_head": "colwise_rep"}
def __init__(self, config):
super().__init__(config)
@ -1175,8 +1211,13 @@ class LlamaForCausalLM(LlamaPreTrainedModel, GenerationMixin):
)
hidden_states = outputs[0]
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
if self.config.pretraining_tp > 1:
lm_head_slices = self.lm_head.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
logits = [F.linear(hidden_states, lm_head_slices[i]) for i in range(self.config.pretraining_tp)]
logits = torch.cat(logits, dim=-1)
else:
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
loss = None
if labels is not None:

View File

@ -485,7 +485,7 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
# prefill stage vs decoding stage (legacy behavior copied)
if input_ids.shape[1] != 1:

View File

@ -58,19 +58,10 @@ class LlavaProcessor(ProcessorMixin):
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@ -82,11 +73,9 @@ class LlavaProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@ -158,11 +147,9 @@ class LlavaProcessor(ProcessorMixin):
# Replace the image token with the expanded image token sequence
pixel_values = image_inputs["pixel_values"]
height, width = get_image_size(to_numpy_array(pixel_values[0]))
num_image_tokens = (height // self.patch_size) * (
width // self.patch_size
) + self.num_additional_image_tokens
num_image_tokens = (height // self.patch_size) * (width // self.patch_size) + 1
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
prompt_strings = []
for sample in text:
@ -173,7 +160,7 @@ class LlavaProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
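A worked example of the token count this hunk reverts to (image size and patch size are hypothetical): the `+ 1` is the CLS token, removed again under the `"default"` feature-selection strategy.

```python
height = width = 336
patch_size = 14
num_image_tokens = (height // patch_size) * (width // patch_size) + 1  # 24*24 + 1 = 577
vision_feature_select_strategy = "default"
if vision_feature_select_strategy == "default":
    num_image_tokens -= 1  # CLS dropped -> 576 tokens per image
```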

View File

@ -868,7 +868,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel, GenerationMixi
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
if input_ids.shape[1] != 1:
inputs_embeds = inputs_embeds.to(image_features.dtype)

View File

@ -61,19 +61,10 @@ class LlavaNextProcessor(ProcessorMixin):
in a chat into a tokenizable string.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
attributes = ["image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token"]
image_processor_class = "AutoImageProcessor"
tokenizer_class = "AutoTokenizer"
@ -85,11 +76,9 @@ class LlavaNextProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
chat_template=None,
image_token="<image>", # set the default and let users change if they have peculiar special tokens in rare cases
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@ -154,7 +143,7 @@ class LlavaNextProcessor(ProcessorMixin):
"Expanding inputs for image tokens in LLaVa-NeXT should be done in processing. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.50."
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
else:
image_sizes = iter(image_inputs["image_sizes"])
@ -163,12 +152,10 @@ class LlavaNextProcessor(ProcessorMixin):
for sample in text:
while self.image_token in sample:
image_size = next(image_sizes)
if not isinstance(image_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = image_size.tolist()
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
prompt_strings.append(sample)
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
@ -191,7 +178,7 @@ class LlavaNextProcessor(ProcessorMixin):
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
)
# The base patch covers the entire image (+1 for the CLS)
base_features = patches_height * patches_width + self.num_additional_image_tokens
base_features = patches_height * patches_width + 1
num_image_tokens = unpadded_features + newline_features + base_features
return num_image_tokens

View File

@ -58,22 +58,12 @@ class LlavaNextVideoProcessor(ProcessorMixin):
Special token used to denote video location.
image_token (`str`, *optional*, defaults to `"<image>"`):
Special token used to denote image location.
num_additional_image_tokens (`int`, *optional*, defaults to 0):
Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
extra tokens appended, no need to set this arg.
"""
# video and image processor share same args, but have different processing logic
# only image processor config is saved in the hub
attributes = ["video_processor", "image_processor", "tokenizer"]
valid_kwargs = [
"chat_template",
"patch_size",
"vision_feature_select_strategy",
"image_token",
"video_token",
"num_additional_image_tokens",
]
valid_kwargs = ["chat_template", "patch_size", "vision_feature_select_strategy", "image_token", "video_token"]
image_processor_class = "LlavaNextImageProcessor"
video_processor_class = "LlavaNextVideoImageProcessor"
tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
@ -88,11 +78,9 @@ class LlavaNextVideoProcessor(ProcessorMixin):
vision_feature_select_strategy=None,
video_token="<video>",
image_token="<image>",
num_additional_image_tokens=0,
**kwargs,
):
self.patch_size = patch_size
self.num_additional_image_tokens = num_additional_image_tokens
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
@ -176,9 +164,8 @@ class LlavaNextVideoProcessor(ProcessorMixin):
if self.patch_size is None or self.vision_feature_select_strategy is None:
logger.warning_once(
"Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. "
"Please add `patch_size`, `num_additional_image_tokens` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` "
"and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly "
"with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. "
"Using processors without these attributes in the config is deprecated and will throw an error in v4.47."
)
else:
@ -190,12 +177,10 @@ class LlavaNextVideoProcessor(ProcessorMixin):
for sample in text:
while self.image_token in sample:
image_size = next(image_sizes)
if not isinstance(image_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = image_size.tolist()
orig_height, orig_width = image_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= self.num_additional_image_tokens
num_image_tokens -= 1
sample = sample.replace(self.image_token, "<placeholder>" * num_image_tokens, 1)
prompt_strings.append(sample)
text = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
@ -205,8 +190,6 @@ class LlavaNextVideoProcessor(ProcessorMixin):
one_video = to_numpy_array(videos_inputs.get("pixel_values_videos")[0])
height, width = get_image_size(one_video[0])
num_frames = one_video.shape[0] # frame dim is always after batch dim
# no `self.num_additional_image_tokens` added because video always has a default feature selection strategy
num_image_tokens = (height // self.patch_size) * (width // self.patch_size)
num_video_tokens = num_image_tokens // 4 * num_frames # divide by 4 needed for avg pooling layer
prompt_strings = []
@ -239,7 +222,7 @@ class LlavaNextVideoProcessor(ProcessorMixin):
orig_height, orig_width, patches_height, patches_width, scale_height, scale_width
)
# The base patch covers the entire image (+1 for the CLS)
base_features = patches_height * patches_width + self.num_additional_image_tokens
base_features = patches_height * patches_width + 1
num_image_tokens = unpadded_features + newline_features + base_features
return num_image_tokens
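A worked example of the video-token arithmetic above (frame size, patch size, and frame count are hypothetical): per-frame patch tokens are divided by 4 for the average-pooling layer, then multiplied by the number of frames.

```python
height = width = 336
patch_size = 14
num_frames = 8
num_image_tokens = (height // patch_size) * (width // patch_size)  # 24*24 = 576
num_video_tokens = num_image_tokens // 4 * num_frames              # 144 * 8 = 1152
```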

View File

@ -188,10 +188,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
for sample in text:
while special_token in sample:
image_size_list = next(image_sizes)
original_size = image_size_list[0] if num_frames != 1 else image_size_list
if not isinstance(original_size, (list, tuple)):
# cast to list to avoid numerical precision errors when calculating unpadding
orig_height, orig_width = original_size.tolist()
orig_height, orig_width = image_size_list[0] if num_frames != 1 else image_size_list
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1

View File

@ -1034,8 +1034,7 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
) -> List[Dict]:
"""
Converts the output of [`Mask2FormerForUniversalSegmentationOutput`] into instance segmentation predictions.
Only supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps
to `True` to get the correct segmentation result.
Only supports PyTorch.
Args:
outputs ([`Mask2FormerForUniversalSegmentation`]):
@ -1057,10 +1056,9 @@ class Mask2FormerImageProcessor(BaseImageProcessor):
(one per detected instance).
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
`List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
`True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`.
Set to `None` if no mask is found above `threshold`.
`True`. Set to `None` if no mask is found above `threshold`.
- **segments_info** -- A dictionary that contains additional information on each segment.
- **id** -- An integer representing the `segment_id`.
- **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.

View File

@ -926,7 +926,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
encoder_attention_mask=None,
position_embeddings: Optional[torch.Tensor] = None,
reference_points=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
@ -936,8 +936,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
batch_size, num_queries, _ = hidden_states.shape
batch_size, sequence_length, _ = encoder_hidden_states.shape
total_elements = sum(height * width for height, width in spatial_shapes_list)
if total_elements != sequence_length:
if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
raise ValueError(
"Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
)
@ -958,11 +957,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
)
# batch_size, num_queries, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.tensor(
[[shape[1], shape[0]] for shape in spatial_shapes_list],
dtype=torch.long,
device=reference_points.device,
)
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
sampling_locations = (
reference_points[:, :, None, :, None, :]
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
@ -975,7 +970,7 @@ class Mask2FormerPixelDecoderEncoderMultiscaleDeformableAttention(nn.Module):
else:
raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
output = multi_scale_deformable_attention(value, spatial_shapes_list, sampling_locations, attention_weights)
output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output, attention_weights
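The offset normalizer above converts per-level pixel offsets into the normalized coordinate space that `reference_points` use. A tiny sketch with made-up feature-map sizes: stacking `(width, height)` per level gives the divisor for each sampling offset.

```python
import torch

spatial_shapes = torch.tensor([[64, 96], [32, 48]])  # (height, width) per level
offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
print(offset_normalizer)  # tensor([[96, 64], [48, 32]]) -> (width, height) per level
```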
@ -1006,7 +1001,7 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
attention_mask: torch.Tensor,
position_embeddings: torch.Tensor = None,
reference_points=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
output_attentions: bool = False,
):
@ -1020,8 +1015,8 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
Position embeddings, to be added to `hidden_states`.
reference_points (`torch.FloatTensor`, *optional*):
Reference points.
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of the backbone feature maps as a list of tuples.
spatial_shapes (`torch.LongTensor`, *optional*):
Spatial shapes of the backbone feature maps.
level_start_index (`torch.LongTensor`, *optional*):
Level start index.
output_attentions (`bool`, *optional*):
@ -1038,7 +1033,7 @@ class Mask2FormerPixelDecoderEncoderLayer(nn.Module):
encoder_attention_mask=attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@ -1091,13 +1086,13 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
)
@staticmethod
def get_reference_points(spatial_shapes_list, valid_ratios, device):
def get_reference_points(spatial_shapes, valid_ratios, device):
"""
Get reference points for each feature map. Used in decoder.
Args:
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of the backbone feature maps as a list of tuples.
spatial_shapes (`torch.LongTensor`):
Spatial shapes of each feature map, has shape of `(num_feature_levels, 2)`.
valid_ratios (`torch.FloatTensor`):
Valid ratios of each feature map, has shape of `(batch_size, num_feature_levels, 2)`.
device (`torch.device`):
@ -1106,7 +1101,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
`torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
"""
reference_points_list = []
for lvl, (height, width) in enumerate(spatial_shapes_list):
for lvl, (height, width) in enumerate(spatial_shapes):
ref_y, ref_x = torch.meshgrid(
torch.linspace(0.5, height - 0.5, height, dtype=valid_ratios.dtype, device=device),
torch.linspace(0.5, width - 0.5, width, dtype=valid_ratios.dtype, device=device),
@ -1127,7 +1122,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
inputs_embeds=None,
attention_mask=None,
position_embeddings=None,
spatial_shapes_list=None,
spatial_shapes=None,
level_start_index=None,
valid_ratios=None,
output_attentions=None,
@ -1145,8 +1140,8 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
[What are attention masks?](../glossary#attention-mask)
position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
Position embeddings that are added to the queries and keys in each self-attention layer.
spatial_shapes_list (`list` of `tuple`):
Spatial shapes of each feature map as a list of tuples.
spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
Spatial shapes of each feature map.
level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
Starting index of each feature map.
valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
@ -1167,7 +1162,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = inputs_embeds
reference_points = self.get_reference_points(spatial_shapes_list, valid_ratios, device=inputs_embeds.device)
reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)
all_hidden_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
@ -1181,7 +1176,7 @@ class Mask2FormerPixelDecoderEncoderOnly(nn.Module):
attention_mask,
position_embeddings=position_embeddings,
reference_points=reference_points,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
output_attentions=output_attentions,
)
@ -1307,9 +1302,9 @@ class Mask2FormerPixelDecoder(nn.Module):
]
# Prepare encoder inputs (by flattening)
spatial_shapes_list = [(embed.shape[2], embed.shape[3]) for embed in input_embeds]
spatial_shapes = [(embed.shape[2], embed.shape[3]) for embed in input_embeds]
input_embeds_flat = torch.cat([embed.flatten(2).transpose(1, 2) for embed in input_embeds], 1)
spatial_shapes = torch.as_tensor(spatial_shapes_list, dtype=torch.long, device=input_embeds_flat.device)
spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=input_embeds_flat.device)
masks_flat = torch.cat([mask.flatten(1) for mask in masks], 1)
position_embeddings = [embed.flatten(2).transpose(1, 2) for embed in position_embeddings]
@ -1325,7 +1320,7 @@ class Mask2FormerPixelDecoder(nn.Module):
inputs_embeds=input_embeds_flat,
attention_mask=masks_flat,
position_embeddings=level_pos_embed_flat,
spatial_shapes_list=spatial_shapes_list,
spatial_shapes=spatial_shapes,
level_start_index=level_start_index,
valid_ratios=valid_ratios,
output_attentions=output_attentions,
@ -1336,23 +1331,18 @@ class Mask2FormerPixelDecoder(nn.Module):
last_hidden_state = encoder_outputs.last_hidden_state
batch_size = last_hidden_state.shape[0]
# We compute level_start_index_list separately from the tensor version level_start_index
# to avoid iterating over a tensor which breaks torch.compile/export.
level_start_index_list = [0]
for height, width in spatial_shapes_list[:-1]:
level_start_index_list.append(level_start_index_list[-1] + height * width)
split_sizes = [None] * self.num_feature_levels
for i in range(self.num_feature_levels):
if i < self.num_feature_levels - 1:
split_sizes[i] = level_start_index_list[i + 1] - level_start_index_list[i]
split_sizes[i] = level_start_index[i + 1] - level_start_index[i]
else:
split_sizes[i] = last_hidden_state.shape[1] - level_start_index_list[i]
split_sizes[i] = last_hidden_state.shape[1] - level_start_index[i]
encoder_output = torch.split(last_hidden_state, split_sizes, dim=1)
encoder_output = torch.split(last_hidden_state, [size.item() for size in split_sizes], dim=1)
# Compute final features
outputs = [
x.transpose(1, 2).view(batch_size, -1, spatial_shapes_list[i][0], spatial_shapes_list[i][1])
x.transpose(1, 2).view(batch_size, -1, spatial_shapes[i][0], spatial_shapes[i][1])
for i, x in enumerate(encoder_output)
]
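The split logic above derives per-level chunk sizes from cumulative height*width offsets. A standalone sketch with made-up shapes, mirroring the list-based variant that avoids iterating over a tensor:

```python
spatial_shapes_list = [(64, 96), (32, 48), (16, 24)]
seq_len = sum(h * w for h, w in spatial_shapes_list)  # 8064

level_start_index_list = [0]
for height, width in spatial_shapes_list[:-1]:
    level_start_index_list.append(level_start_index_list[-1] + height * width)

split_sizes = [b - a for a, b in zip(level_start_index_list, level_start_index_list[1:])]
split_sizes.append(seq_len - level_start_index_list[-1])
print(level_start_index_list, split_sizes)  # [0, 6144, 7680] [6144, 1536, 384]
```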
@ -1886,9 +1876,7 @@ class Mask2FormerMaskedAttentionDecoder(nn.Module):
else:
level_index = idx % self.num_feature_levels
where = (attention_mask.sum(-1) != attention_mask.shape[-1]).to(attention_mask.dtype)
# Multiply the attention mask instead of indexing to avoid issue in torch.export.
attention_mask = attention_mask * where.unsqueeze(-1)
attention_mask[torch.where(attention_mask.sum(-1) == attention_mask.shape[-1])] = False
layer_outputs = decoder_layer(
hidden_states,
@ -2428,8 +2416,8 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> masks_queries_logits = outputs.masks_queries_logits
>>> # Perform post-processing to get instance segmentation map
>>> pred_instance_map = image_processor.post_process_instance_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
>>> pred_instance_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> print(pred_instance_map.shape)
torch.Size([480, 640])
@ -2462,7 +2450,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> # Perform post-processing to get semantic segmentation map
>>> pred_semantic_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> print(pred_semantic_map.shape)
torch.Size([512, 683])
@ -2496,7 +2484,7 @@ class Mask2FormerForUniversalSegmentation(Mask2FormerPreTrainedModel):
>>> # Perform post-processing to get panoptic segmentation map
>>> pred_panoptic_map = image_processor.post_process_panoptic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> print(pred_panoptic_map.shape)
torch.Size([338, 676])

View File

@ -1080,8 +1080,7 @@ class MaskFormerImageProcessor(BaseImageProcessor):
) -> List[Dict]:
"""
Converts the output of [`MaskFormerForInstanceSegmentationOutput`] into instance segmentation predictions. Only
supports PyTorch. If instances could overlap, set either return_coco_annotation or return_binary_maps
to `True` to get the correct segmentation result.
supports PyTorch.
Args:
outputs ([`MaskFormerForInstanceSegmentation`]):
@ -1103,10 +1102,9 @@ class MaskFormerImageProcessor(BaseImageProcessor):
(one per detected instance).
Returns:
`List[Dict]`: A list of dictionaries, one per image, each dictionary containing two keys:
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id`, or
- **segmentation** -- A tensor of shape `(height, width)` where each pixel represents a `segment_id` or
`List[List]` run-length encoding (RLE) of the segmentation map if return_coco_annotation is set to
`True`, or a tensor of shape `(num_instances, height, width)` if return_binary_maps is set to `True`.
Set to `None` if no mask is found above `threshold`.
`True`. Set to `None` if no mask is found above `threshold`.
- **segments_info** -- A dictionary that contains additional information on each segment.
- **id** -- An integer representing the `segment_id`.
- **label_id** -- An integer representing the label / semantic class id corresponding to `segment_id`.

View File

@ -1780,7 +1780,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel):
>>> # you can pass them to image_processor for postprocessing
>>> predicted_semantic_map = image_processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs)
@ -1810,7 +1810,7 @@ class MaskFormerForInstanceSegmentation(MaskFormerPreTrainedModel):
>>> masks_queries_logits = outputs.masks_queries_logits
>>> # you can pass them to image_processor for postprocessing
>>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[(image.height, image.width)])[0]
>>> result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
>>> # we refer to the demo notebooks for visualization (see "Resources" section in the MaskFormer docs)
>>> predicted_panoptic_map = result["segmentation"]

View File

@ -21,11 +21,10 @@ import sys
import types
import torch
from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from transformers import AutoTokenizer, GPT2Config
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME
from transformers.modeling_utils import WEIGHTS_INDEX_NAME, WEIGHTS_NAME, shard_checkpoint
def add_checkpointing_args(parser):
@ -572,15 +571,7 @@ def convert_checkpoint_from_megatron_to_transformers(args):
# Store the state_dict to file.
max_shard_size = int(args.max_shard_size) if args.max_shard_size.isdigit() else args.max_shard_size
state_dict_split = split_torch_state_dict_into_shards(output_state_dict, max_shard_size=max_shard_size)
shards = {}
index = None
for filename, tensors in state_dict_split.filename_to_tensors.items():
shards[filename] = {tensor: output_state_dict[tensor] for tensor in tensors}
if state_dict_split.is_sharded:
index = {
"metadata": state_dict_split.metadata,
"weight_map": state_dict_split.tensor_to_filename,
}
shards, index = shard_checkpoint(output_state_dict, max_shard_size=max_shard_size)
# Save the model
for shard_file, shard in shards.items():
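For reference, a hedged sketch of the `huggingface_hub` sharding path this hunk reverts away from (tensor names and shard size are made up): `split_torch_state_dict_into_shards` plans the split, and the shard dict plus optional index are rebuilt from its output.

```python
import torch
from huggingface_hub import split_torch_state_dict_into_shards

state_dict = {f"layer{i}.weight": torch.randn(256, 256) for i in range(4)}
split = split_torch_state_dict_into_shards(state_dict, max_shard_size="500KB")

shards = {
    filename: {key: state_dict[key] for key in keys}
    for filename, keys in split.filename_to_tensors.items()
}
index = (
    {"metadata": split.metadata, "weight_map": split.tensor_to_filename}
    if split.is_sharded
    else None
)
```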

View File

@ -980,7 +980,7 @@ class NemotronModel(NemotronPreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron
class NemotronForCausalLM(NemotronPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -1020,7 +1020,7 @@ class OlmoModel(OlmoPreTrainedModel):
return causal_mask
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO,Llama->Olmo
# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->OLMO,Llama->Olmo
class OlmoForCausalLM(OlmoPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]

View File

@ -1,27 +0,0 @@
# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import _LazyModule
from ...utils.import_utils import define_import_structure
if TYPE_CHECKING:
from .configuration_olmo_1124 import *
from .modeling_olmo_1124 import *
else:
import sys
_file = globals()["__file__"]
sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)

View File

@ -1,166 +0,0 @@
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/olmo_1124/modular_olmo_1124.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_olmo_1124.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
from ...configuration_utils import PretrainedConfig
class Olmo1124Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Olmo1124Model`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group. For more details, check out [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 50279):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> # Initializing a Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=50304,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
use_cache=True,
pad_token_id=1,
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
**kwargs,
):
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self._rope_scaling_validation()
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.rms_norm_eps = rms_norm_eps
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
__all__ = ["Olmo1124Config"]

View File

@ -1,304 +0,0 @@
# Copyright 2024 EleutherAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gc
import json
import os
import shutil
from pathlib import Path
from typing import Any, Dict
import torch
import yaml
from tokenizers import Tokenizer
from transformers import Olmo1124Config, Olmo1124ForCausalLM
from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
"""
Sample usage:
```
python src/transformers/models/olmo_1124/convert_olmo_1124_weights_to_hf.py \
--input_dir /path/to/downloaded/olmo_1124/weights --model_size 7B --output_dir /output/path
```
Thereafter, models can be loaded via:
```py
from transformers import Olmo1124ForCausalLM, AutoTokenizer
model = Olmo1124ForCausalLM.from_pretrained("/output/path")
tokenizer = AutoTokenizer.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even though the biggest
versions come in several checkpoints, each checkpoint contains a part of every weight of the model, so all of them need to be loaded in RAM).
"""
def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
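# Worked example with the defaults and a hypothetical n=4096:
# int(1 * int(8 * 4096 / 3)) = 10922, rounded up to the next multiple of 256 -> 11008,
# which matches the 7B `intermediate_size` default.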
def read_json(path):
with open(path, "r") as f:
return json.load(f)
def write_json(text, path):
with open(path, "w") as f:
json.dump(text, f)
def write_model(
model_path,
input_base_path,
include_tokenizer=True,
tokenizer_path=None,
safe_serialization=True,
fix_eos_token_id=True,
tmp_cleanup=True,
):
os.makedirs(model_path, exist_ok=True)
tmp_model_path = os.path.join(model_path, "tmp")
os.makedirs(tmp_model_path, exist_ok=True)
config_path = Path(input_base_path) / "config.yaml"
olmo_1124_config = yaml.safe_load(config_path.read_text())["model"]
if not olmo_1124_config.get("attention_layer_norm", False):
raise RuntimeError("OLMo November 2024 checkpoints must have attention layer norm")
if not olmo_1124_config.get("norm_after", False):
raise RuntimeError("OLMo November 2024 checkpoints must set norm_after to True")
n_layers = olmo_1124_config["n_layers"]
n_heads = olmo_1124_config["n_heads"]
dim = olmo_1124_config["d_model"]
dims_per_head = dim // n_heads
base = olmo_1124_config["rope_theta"]
inv_freq = 1.0 / (base ** (torch.arange(0, dims_per_head, 2).float() / dims_per_head))
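# Standard RoPE inverse-frequency schedule: inv_freq[j] = base ** (-2j / head_dim) for each rotary pair j.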
max_position_embeddings = olmo_1124_config["max_sequence_length"]
vocab_size = olmo_1124_config.get("embedding_size", olmo_1124_config["vocab_size"])
if olmo_1124_config.get("n_kv_heads", None) is not None:
num_key_value_heads = olmo_1124_config["n_kv_heads"] # for GQA / MQA
elif olmo_1124_config["multi_query_attention"]: # compatibility with other checkpoints
num_key_value_heads = 1
else:
num_key_value_heads = n_heads
print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
# Not sharded
# (The sharded implementation would also work, but this is simpler.)
loaded = torch.load(os.path.join(input_base_path, "model.pt"), map_location="cpu")
param_count = 0
index_dict: Dict[str, Any] = {"weight_map": {}}
for layer_i in range(n_layers):
filename = f"pytorch_model-{layer_i + 1}-of-{n_layers + 1}.bin"
# Unsharded
# TODO: Layernorm stuff
# TODO: multi query attention
fused_dims = [dim, dims_per_head * num_key_value_heads, dims_per_head * num_key_value_heads]
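# The fused attention projection packs [Q | K | V] along dim 0: Q spans all `n_heads`,
# while K and V each span `num_key_value_heads` heads.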
q_proj_weight, k_proj_weight, v_proj_weight = torch.split(
loaded[f"transformer.blocks.{layer_i}.att_proj.weight"], fused_dims, dim=0
)
up_proj_weight, gate_proj_weight = torch.chunk(
loaded[f"transformer.blocks.{layer_i}.ff_proj.weight"], 2, dim=0
)
state_dict = {
f"model.layers.{layer_i}.self_attn.q_proj.weight": q_proj_weight,
f"model.layers.{layer_i}.self_attn.k_proj.weight": k_proj_weight,
f"model.layers.{layer_i}.self_attn.v_proj.weight": v_proj_weight,
f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"transformer.blocks.{layer_i}.attn_out.weight"],
f"model.layers.{layer_i}.self_attn.q_norm.weight": loaded[f"transformer.blocks.{layer_i}.q_norm.weight"],
f"model.layers.{layer_i}.self_attn.k_norm.weight": loaded[f"transformer.blocks.{layer_i}.k_norm.weight"],
f"model.layers.{layer_i}.mlp.gate_proj.weight": gate_proj_weight,
f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"transformer.blocks.{layer_i}.ff_out.weight"],
f"model.layers.{layer_i}.mlp.up_proj.weight": up_proj_weight,
f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[
f"transformer.blocks.{layer_i}.attn_norm.weight"
],
f"model.layers.{layer_i}.post_feedforward_layernorm.weight": loaded[
f"transformer.blocks.{layer_i}.ff_norm.weight"
],
}
state_dict[f"model.layers.{layer_i}.self_attn.rotary_emb.inv_freq"] = inv_freq
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
filename = f"pytorch_model-{n_layers + 1}-of-{n_layers + 1}.bin"
# Unsharded
# TODO: Deal with weight-tying
state_dict = {
"model.embed_tokens.weight": loaded["transformer.wte.weight"],
"model.norm.weight": loaded["transformer.ln_f.weight"],
"lm_head.weight": loaded["transformer.ff_out.weight"]
if "transformer.ff_out.weight" in loaded
else loaded["transformer.wte.weight"],
}
for k, v in state_dict.items():
index_dict["weight_map"][k] = filename
param_count += v.numel()
torch.save(state_dict, os.path.join(tmp_model_path, filename))
# Write configs
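# NOTE: total_size assumes 2 bytes per parameter, i.e. 16-bit weights.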
index_dict["metadata"] = {"total_size": param_count * 2}
write_json(index_dict, os.path.join(tmp_model_path, "pytorch_model.bin.index.json"))
if olmo_1124_config.get("mlp_hidden_size", None) is not None:
intermediate_size = olmo_1124_config["mlp_hidden_size"] // 2
else:
intermediate_size = (dim * olmo_1124_config["mlp_ratio"]) // 2
if fix_eos_token_id and olmo_1124_config["eos_token_id"] == 0:
# Fixing a bug in OLMo where eos token id was incorrectly set
print("Changing eos_token_id from 0 to 50279.")
olmo_1124_config["eos_token_id"] = 50279
config = Olmo1124Config(
vocab_size=vocab_size,
hidden_size=dim,
intermediate_size=intermediate_size,
num_hidden_layers=n_layers,
num_attention_heads=n_heads,
num_key_value_heads=num_key_value_heads,
max_position_embeddings=max_position_embeddings,
pad_token_id=olmo_1124_config["pad_token_id"],
bos_token_id=None,
eos_token_id=olmo_1124_config["eos_token_id"],
tie_word_embeddings=olmo_1124_config["weight_tying"],
rms_norm_eps=olmo_1124_config["layer_norm_eps"],
rope_theta=base,
)
config.save_pretrained(tmp_model_path)
# Make space so we can load the model properly now.
del state_dict
del loaded
gc.collect()
if include_tokenizer:
_write_tokenizer(model_path, config, input_base_path, tokenizer_path)
print("Loading the checkpoint in a OLMo November 2024 model.")
model = Olmo1124ForCausalLM.from_pretrained(tmp_model_path, torch_dtype=torch.float32, low_cpu_mem_usage=True)
# Avoid saving this as part of the config.
del model.config._name_or_path
print("Saving in the Transformers format.")
model.save_pretrained(model_path, safe_serialization=safe_serialization)
if tmp_cleanup:
# Make cleanup optional; attempting to `rmtree` the `tmp_model_path` causes
# errors if using NFS.
shutil.rmtree(tmp_model_path)
def _write_tokenizer(
output_path: Path,
config: Olmo1124Config,
checkpoint_dir: str,
input_tokenizer_path: Path | None,
) -> None:
print(f"Saving a {GPT2TokenizerFast.__name__} to {output_path}.")
if input_tokenizer_path is not None:
base_tokenizer = Tokenizer.from_file(str(input_tokenizer_path))
else:
config_path = Path(checkpoint_dir) / "config.yaml"
tokenizer_config = yaml.safe_load(config_path.read_text())["tokenizer"]
# Initialize tokenizer and validate vocab size.
if Path(tokenizer_config["identifier"]).is_file():
base_tokenizer = Tokenizer.from_file(tokenizer_config["identifier"])
else:
base_tokenizer = Tokenizer.from_pretrained(tokenizer_config["identifier"])
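# Fall back to the last vocab id for EOS when unset, and reuse EOS as the pad token.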
eos_token_id = config.eos_token_id if config.eos_token_id is not None else base_tokenizer.get_vocab_size() - 1
pad_token_id = config.pad_token_id if config.pad_token_id is not None else eos_token_id
tokenizer = GPT2TokenizerFast(
tokenizer_object=base_tokenizer,
eos_token=base_tokenizer.decode([eos_token_id], skip_special_tokens=False),
pad_token=base_tokenizer.decode([pad_token_id], skip_special_tokens=False),
)
tokenizer.save_pretrained(output_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
required=True,
help="Location of OLMo November 2024 weights, which contains config.yaml and model.pt.",
)
parser.add_argument(
"--no_tokenizer",
action="store_false",
dest="include_tokenizer",
help="If set, do not convert OLMo tokenizer to HF tokenizer.",
)
parser.add_argument(
"--tokenizer_json_path",
type=Path,
default=None,
help="Location of OLMo November 2024 tokenizer json file. Defaults to what is set in the config file.",
)
parser.add_argument(
"--output_dir",
required=True,
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--no_fix_eos_token_id",
action="store_false",
dest="fix_eos_token_id",
help="If set, does not change eos token id from 0 to 50279 if it is 0. Changing 0 to 50279 is a bug fix, so use this option with care.",
)
parser.add_argument(
"--no_tmp_cleanup",
action="store_false",
dest="tmp_cleanup",
help="If passed, don't remove temp dir at end of HF conversion.",
)
parser.add_argument(
"--no_safe_serialization",
action="store_false",
dest="safe_serialization",
help="Whether or not to save using `safetensors`.",
)
args = parser.parse_args()
write_model(
model_path=args.output_dir,
input_base_path=args.input_dir,
safe_serialization=args.safe_serialization,
include_tokenizer=args.include_tokenizer,
tokenizer_path=args.tokenizer_json_path,
fix_eos_token_id=args.fix_eos_token_id,
tmp_cleanup=args.tmp_cleanup,
)
if __name__ == "__main__":
main()

File diff suppressed because it is too large

View File

@ -1,489 +0,0 @@
import math
from typing import Optional, Tuple
import torch
from torch import nn
from ...cache_utils import Cache
from ...pytorch_utils import ALL_LAYERNORM_LAYERS
from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
from ..llama.modeling_llama import LlamaRMSNorm
from ..olmo.configuration_olmo import OlmoConfig
from ..olmo.modeling_olmo import (
OlmoAttention,
OlmoDecoderLayer,
OlmoFlashAttention2,
OlmoForCausalLM,
OlmoModel,
OlmoPreTrainedModel,
OlmoSdpaAttention,
apply_rotary_pos_emb,
repeat_kv,
)
if is_flash_attn_2_available():
from ...modeling_flash_attention_utils import _flash_attention_forward
logger = logging.get_logger(__name__)
class Olmo1124Config(OlmoConfig):
r"""
This is the configuration class to store the configuration of an [`Olmo1124Model`]. It is used to instantiate an OLMo November 2024
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the [allenai/Olmo1124-7B-hf](https://huggingface.co/allenai/Olmo1124-7B-hf).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50304):
Vocabulary size of the Olmo1124 model. Defines the number of different tokens that can be represented by the
`input_ids` passed when calling [`Olmo1124Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by mean-pooling all the original heads within that group (see the sketch after this file). For more details,
check out [this paper](https://arxiv.org/pdf/2305.13245.pdf). If not specified, it will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/value states (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*, defaults to 1):
Padding token id.
bos_token_id (`int`, *optional*):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 50279):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie the input and output word embeddings.
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
rms_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
```python
>>> from transformers import Olmo1124Model, Olmo1124Config
>>> # Initializing an Olmo November 2024 7B style configuration
>>> configuration = Olmo1124Config()
>>> # Initializing a model from the Olmo November 2024 7B style configuration
>>> model = Olmo1124Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
"""
model_type = "olmo_1124"
def __init__(
self,
vocab_size=50304,
hidden_size=4096,
intermediate_size=11008,
num_hidden_layers=32,
num_attention_heads=32,
num_key_value_heads=None,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
use_cache=True,
pad_token_id=1,
bos_token_id=None,
eos_token_id=50279,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
rms_norm_eps=1e-5,
**kwargs,
):
super().__init__(
vocab_size=vocab_size,
hidden_size=hidden_size,
intermediate_size=intermediate_size,
num_hidden_layers=num_hidden_layers,
num_attention_heads=num_attention_heads,
num_key_value_heads=num_key_value_heads,
hidden_act=hidden_act,
max_position_embeddings=max_position_embeddings,
initializer_range=initializer_range,
use_cache=use_cache,
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
rope_theta=rope_theta,
rope_scaling=rope_scaling,
attention_bias=attention_bias,
attention_dropout=attention_dropout,
**kwargs,
)
self.rms_norm_eps = rms_norm_eps
del self.clip_qkv
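# Unlike OLMo, the November 2024 model does not clip QKV activations, so the inherited
# `clip_qkv` attribute is dropped from the config.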
class Olmo1124RMSNorm(LlamaRMSNorm):
pass
ALL_LAYERNORM_LAYERS.append(Olmo1124RMSNorm)
# Olmo1124 attention is identical to OLMo attention except:
# - Norm is applied to attention queries and keys.
# - No qkv clipping.
class Olmo1124Attention(OlmoAttention):
def __init__(self, config: Olmo1124Config, layer_idx: Optional[int] = None):
super().__init__(config, layer_idx=layer_idx)
self.q_norm = Olmo1124RMSNorm(self.num_heads * self.head_dim, config.rms_norm_eps)
self.k_norm = Olmo1124RMSNorm(self.num_key_value_heads * self.head_dim, config.rms_norm_eps)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
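# GQA: tile each key/value head across its query group so shapes match the query heads for the matmul.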
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
if attention_mask is not None: # no matter the length, we just slice it
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
attn_weights = attn_weights + causal_mask
# upcast attention to fp32
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
attn_output = torch.matmul(attn_weights, value_states)
if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Olmo1124FlashAttention2(OlmoFlashAttention2, Olmo1124Attention):
"""
OLMo November 2024 flash attention module. This module inherits from `Olmo1124Attention` as the weights of the module stay
untouched. The only required change is in the forward pass, where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
def __init__(self, *args, **kwargs):
Olmo1124Attention.__init__(self, *args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates a top-left aligned causal mask, while what is needed here is bottom-right alignment, which became the default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.LongTensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
output_attentions = False
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
# Flash attention requires the input to have the shape
# batch_size x seq_length x head_dim x hidden_dim
# therefore we just need to keep the original shape
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
# TODO: These transposes are quite inefficient, but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
# to be able to avoid many of these transpose/reshape/view calls.
query_states = query_states.transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
dropout_rate = self.attention_dropout if self.training else 0.0
# In PEFT, the layer norms are usually cast to float32 for training stability,
# so the input hidden states may get silently cast to float32. Hence, we
# cast them back to the correct dtype just to be sure everything works as expected.
# This might slow down training & inference, so it is recommended not to cast the LayerNorms
# to fp32. (OlmoRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = _flash_attention_forward(
query_states,
key_states,
value_states,
attention_mask,
q_len,
position_ids=position_ids,
dropout=dropout_rate,
use_top_left_mask=self._flash_attn_uses_top_left_mask,
is_causal=self.is_causal,
)
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
attn_output = self.o_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
class Olmo1124SdpaAttention(OlmoSdpaAttention, Olmo1124Attention):
# Adapted from Olmo1124Attention.forward
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: bool = False,
use_cache: bool = False,
cache_position: Optional[torch.LongTensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
if output_attentions:
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
logger.warning_once(
"Olmo1124Model is using Olmo1124SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)
bsz, q_len, _ = hidden_states.size()
query_states = self.q_norm(self.q_proj(hidden_states))
key_states = self.k_norm(self.k_proj(hidden_states))
value_states = self.v_proj(hidden_states)
query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
cos, sin = self.rotary_emb(value_states, position_ids)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
if past_key_value is not None:
# sin and cos are specific to RoPE models; cache_position needed for the static cache
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
key_states = repeat_kv(key_states, self.num_key_value_groups)
value_states = repeat_kv(value_states, self.num_key_value_groups)
causal_mask = attention_mask
if attention_mask is not None:
causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
# SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
# Reference: https://github.com/pytorch/pytorch/issues/112577.
if query_states.device.type == "cuda" and causal_mask is not None:
query_states = query_states.contiguous()
key_states = key_states.contiguous()
value_states = value_states.contiguous()
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
is_causal = True if causal_mask is None and q_len > 1 else False
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=causal_mask,
dropout_p=self.attention_dropout if self.training else 0.0,
is_causal=is_causal,
)
attn_output = attn_output.transpose(1, 2).contiguous()
attn_output = attn_output.view(bsz, q_len, self.hidden_size)
attn_output = self.o_proj(attn_output)
return attn_output, None, past_key_value
# The OLMo November 2024 layers are identical to those of the OLMo model except:
# - RMSNorm is used instead of standard layer norm.
# - Norm is applied after attention/feedforward rather than before.
class Olmo1124DecoderLayer(OlmoDecoderLayer):
def __init__(self, config: Olmo1124Config, layer_idx: int):
super().__init__(config, layer_idx=layer_idx)
self.post_attention_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
self.post_feedforward_layernorm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
del self.input_layernorm
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_value: Optional[Cache] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = False,
cache_position: Optional[torch.LongTensor] = None,
**kwargs,
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
residual = hidden_states
# Self Attention
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
**kwargs,
)
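# Post-norm ordering: normalize the attention output *before* the residual add.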
hidden_states = self.post_attention_layernorm(hidden_states)
hidden_states = residual + hidden_states
# Fully Connected
residual = hidden_states
hidden_states = self.mlp(hidden_states)
hidden_states = self.post_feedforward_layernorm(hidden_states)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights,)
if use_cache:
outputs += (present_key_value,)
return outputs
class Olmo1124PreTrainedModel(OlmoPreTrainedModel):
pass
# The OLMo November 2024 model is identical to the OLMo model, except RMSNorm is used instead of
# standard layer norm for the output norm.
class Olmo1124Model(OlmoModel):
def __init__(self, config: Olmo1124Config):
super().__init__(config)
self.layers = nn.ModuleList(
[Olmo1124DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
)
self.norm = Olmo1124RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
# The heads now only need to redefine the inner model to the correct `Olmo1124Model`
class Olmo1124ForCausalLM(OlmoForCausalLM):
def __init__(self, config: Olmo1124Config):
super().__init__(config)
self.model = Olmo1124Model(config)
__all__ = [
"Olmo1124Config",
"Olmo1124ForCausalLM",
"Olmo1124Model",
"Olmo1124PreTrainedModel",
]
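The `num_key_value_heads` docstring above describes converting a multi-head checkpoint to GQA by mean-pooling each group of key/value heads. Below is a minimal sketch of that operation under the `[num_heads * head_dim, hidden_size]` weight layout used in the conversion script above; the sizes and tensor are hypothetical, not taken from any shipped checkpoint:

```python
import torch


def meanpool_kv_heads(weight: torch.Tensor, num_heads: int, num_kv_heads: int) -> torch.Tensor:
    """Mean-pool groups of key/value heads to turn an MHA projection into a GQA one.

    `weight` has shape [num_heads * head_dim, hidden_size]; the result has shape
    [num_kv_heads * head_dim, hidden_size].
    """
    out_dim, hidden_size = weight.shape
    head_dim = out_dim // num_heads
    group_size = num_heads // num_kv_heads
    # Group the heads, then average each group into a single key/value head.
    grouped = weight.view(num_kv_heads, group_size, head_dim, hidden_size)
    return grouped.mean(dim=1).reshape(num_kv_heads * head_dim, hidden_size)


# Hypothetical sizes: pool 32 heads of dim 128 down to 8 KV heads.
k_proj = torch.randn(32 * 128, 4096)
print(meanpool_kv_heads(k_proj, num_heads=32, num_kv_heads=8).shape)  # torch.Size([1024, 4096])
```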

View File

@ -888,7 +888,7 @@ OLMOE_INPUTS_DOCSTRING = r"""
"The bare Olmoe Model outputting raw hidden-states without any specific head on top.",
OLMOE_START_DOCSTRING,
)
# TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Olmoe
# Copied from transformers.models.llama.modeling_llama.LlamaModel with Llama->Olmoe
class OlmoeModel(OlmoePreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OlmoeDecoderLayer`]
@ -995,7 +995,7 @@ class OlmoeModel(OlmoePreTrainedModel):
all_router_logits = () if output_router_logits else None
next_decoder_cache = None
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

View File

@ -3161,7 +3161,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for semantic postprocessing
>>> predicted_semantic_map = processor.post_process_semantic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]
>>> f"👉 Semantic Predictions Shape: {list(predicted_semantic_map.shape)}"
'👉 Semantic Predictions Shape: [512, 683]'
@ -3178,7 +3178,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for instance postprocessing
>>> predicted_instance_map = processor.post_process_instance_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> f"👉 Instance Predictions Shape: {list(predicted_instance_map.shape)}"
'👉 Instance Predictions Shape: [512, 683]'
@ -3195,7 +3195,7 @@ class OneFormerForUniversalSegmentation(OneFormerPreTrainedModel):
>>> # you can pass them to processor for panoptic postprocessing
>>> predicted_panoptic_map = processor.post_process_panoptic_segmentation(
... outputs, target_sizes=[(image.height, image.width)]
... outputs, target_sizes=[image.size[::-1]]
... )[0]["segmentation"]
>>> f"👉 Panoptic Predictions Shape: {list(predicted_panoptic_map.shape)}"
'👉 Panoptic Predictions Shape: [512, 683]'

View File

@ -91,10 +91,6 @@ class Pix2StructTextConfig(PretrainedConfig):
"hidden_size": "hidden_size",
"num_attention_heads": "num_heads",
"num_hidden_layers": "num_layers",
"decoder_attention_heads": "num_heads",
"encoder_attention_heads": "num_heads",
"encoder_layers": "num_layers",
"decoder_layers": "num_layers",
}
def __init__(
@ -358,8 +354,6 @@ class Pix2StructConfig(PretrainedConfig):
vision_config = {}
logger.info("vision_config is None. Initializing the Pix2StructVisionConfig with default values.")
text_config["is_encoder_decoder"] = is_encoder_decoder
text_config["tie_word_embeddings"] = tie_word_embeddings
self.text_config = Pix2StructTextConfig(**text_config)
self.vision_config = Pix2StructVisionConfig(**vision_config)

Some files were not shown because too many files have changed in this diff