Compare commits

..

89 Commits

Author SHA1 Message Date
89066d43fe allow disabling compile 2025-02-10 16:53:48 +01:00
924f1c717a Remove Multi-threaded image conversion for fast image processors (#36105)
remove multithreaded image conversion

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-02-10 07:59:34 -05:00
3897f2caf8 Enable pytest live log and show warning logs on GitHub Actions CI runs (#35912)
* fix

* remove

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-10 13:36:20 +01:00
48a309d0d2 Support constant lr with cooldown (#35453)
* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add support for constant learning rate with cooldown

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and cooldown methods to 'get_wsc_schedule'

* Add more warmup and decay methods to 'get_wsd_schedule'

* support num_training_steps and num_stable_steps for get_wsd_schedule

* support num_training_steps and num_stable_steps for get_wsd_schedule

* get wsd scheduler before the `num_training_steps` decision

* fix code_quality

* Update stable branch logic

* fix code_quality

* Move stable stage decide to `get_wsd_schedule`

* Update docstring of `get_wsd_schedule`

* Update `num_train_steps` to optional

* Update `num_train_steps` to optional

* Update docstring of `get_wsd_schedule`

* Update src/transformers/optimization.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-02-10 13:21:55 +01:00
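For context on what the new scheduler does: a warmup-stable-decay (WSD) schedule with cooldown ramps the learning rate up, holds it constant for a long stable phase, then cools it down at the end. The sketch below reimplements that shape with a plain `LambdaLR`; the phase lengths, the linear cooldown, and the function names are illustrative assumptions, not the exact `get_wsd_schedule` API added in this commit.

```python
import torch

def wsd_lambda(num_warmup_steps: int, num_stable_steps: int, num_decay_steps: int, min_lr_ratio: float = 0.0):
    """LambdaLR multiplier implementing warmup -> stable (constant) -> cooldown."""
    def lr_lambda(step: int) -> float:
        if step < num_warmup_steps:                       # linear warmup
            return step / max(1, num_warmup_steps)
        if step < num_warmup_steps + num_stable_steps:    # constant "stable" phase
            return 1.0
        decay_step = step - num_warmup_steps - num_stable_steps
        progress = min(1.0, decay_step / max(1, num_decay_steps))
        return max(min_lr_ratio, 1.0 - progress)          # linear cooldown to min_lr_ratio
    return lr_lambda

model = torch.nn.Linear(8, 2)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, wsd_lambda(num_warmup_steps=100, num_stable_steps=800, num_decay_steps=100)
)
```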
9a6be63fdb Add Apple's Depth-Pro for depth estimation (#34583)
* implement config and model building blocks

* refactor model architecture

* update model outputs

* update init param to include use_fov_model

* update param name in config

* fix hidden_states and attentions outputs for fov

* sort config

* complete minor todos

* update patching

* update config for encoder

* fix config

* use correct defaults in config

* update merge for compatibility with different image size

* restructure encoder for custom configuration

* make fov model compatible with custom config

* replace word "decoder" with "fusion"

* weight conversion script

* fix fov squeeze

* update conversion script (without test)

* upload ruff image processing

* create fast image processing

* use torch interpolation for image processing

* complete post_process_depth_estimation

* config: fix imports and sort args

* apply inference in weight conversion

* use mllama script instead for weight conversion

* clean weight conversion script

* add depth-pro status in other files

* fill docstring in config

* formatting

* more formatting

* formatting with ruff

* formatting with style

* fix copied classes

* add examples; update weight convert script

* fix using check_table.py and isort

* fix config docstring

* add depth pro to sdpa docs

* undo unintentional changes in configuration_gemma.py

* minor fixes

* test image processing

* fixes and tests

* more fixes

* use output states from image_encoder instead

* Revert "use output states from image_encoder instead"

This reverts commit 2408ec54e4f27d2abbecdb8374e58f34d91d8e96.

* make embeddings dynamic

* reshape output hidden states and attentions as part of computation graph

* fix ruff formatting

* fix docstring failure

* use num_fov_head_layers in tests

* update doc

* check consistency with config

* ruff formatting

* update test case

* fix ruff formatting

* add tests for fov

* use interpolation in postprocess

* run and fix slow tests locally

* use scaled_images_features for image and fov encoder

* return fused_hidden_states in fusion stage

* fix example

* fix ruff

* fix copyright license for all files

* add __all__ for each file

* minor fixes
- fix "download" spelling
- add push_to_hub option
- fix Optional type hinting
- apply single loop for DepthProImageProcessor.preprocess

* return list in post_process_depth_estimation

* minor fixes
- capitalize start of docstring
- use ignore copy
- fix examples
- move docstring templates and custom output classes to top
- remove "-> None" typehinting from __init__
- type hinting for forward passes
- fix docstrings for custom output classes

* fix "ruff check"

* update upsample and projection

* major changes: (image size and merge optimization)
- add support for images of any size
- optimize merge operation
- remove image_size from config
- use full names instead of B, C, H, W
- remove interpolation from fusion stage
- add interpolation after merge
- move validations to config
- update integration test
- add type hints for functions

* fix push_to_hub option in weights conversion

* remove image_size in weights conversion

* major changes in the architecture
- remove all DepthProViT modules and support different backbones using the AutoModel API
- set default use_fov_model to False
- validate parameters in configuration
- update interpolate function: use "nearest" for faster computation
- update reshape_feature function: remove all special tokens, possible from different backbones
- update merge function: use padding from config instead of merge_out_size
- remove patch_to_batch and batch_to_patch conversions for now
- calculate out_size dynamically in the encoder
- leave head_mask calculation to the backbone
- fix bugs with merge
- add more comments
- update tests

* placeholder for unused config attributes

* improve docs amid review

* minor change in docs

* further optimize merge

* fix formatting

* remove unused patch/batch conversion functions

* use original F.interpolate

* improve function naming

* minor changes
- use torch_int instead of int
- use proper initialization for newly initialized tensors
- use user-provided return_dict for patch_encoder
- use an if-else block for self.use_fov_model

* rearchitect upsample block for improved modularity

* update upsample keys in weight conversion

* improve padding in merge_patches

* use double-loop for merge

* update comments

* create feature_extractor, reduce some forward code

* introduce config.use_mask_token in dinov2

* minor fixes

* minor fixes for onnx

* update __init__ to latest format

* remove DepthProConfig.to_dict()

* major changes in backbone

* update config in weight conversion

* formatting

* converted model is fp32

* improve naming and docs for feature_extractor->reconstruct_feature_maps

* minor fixes; amid review

* create intermediate vars in func call

* use torch.testing.assert_close

* use ModuleList instead of Sequential and ModuleDict

* update docs

* include fov in integration tests

* update docs

* improve initialization of convolution layers

* fix unused fov keys

* update tests

* ruff format

* fix test, amid Kaiming initialization

* add depthpro to toctree

* add residual layer to _no_split_modules

* architecture rework

* Update src/transformers/models/depth_pro/image_processing_depth_pro.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* Update src/transformers/models/depth_pro/image_processing_depth_pro_fast.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* update docs

* improve merge_patches

* use flatten with fov_output

* ruff formatting

* update resources section in docs

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* fix typo "final_kernal_size"

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* fix output typehint for DepthProDepthEstimator

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* residual operation in 2 steps

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* use image_size instead of global patch_size in interpolation

* replace all Sequential with ModuleList

* update fov

* update heads

* fix and update conversion script for heads

* ruff formatting

* remove float32 conversion

* use "Fov" instead of "FOV" in class names

* use "Fov" instead of "FOV" in config docs

* remove prune_heads

* update fusion stage

* use device in examples

* update processor

* ruff fixes

* add do_rescale in image_processor_dict

* skip test: test_fast_is_faster_than_slow

* ruff formatting

* DepthProImageProcessorFast in other files

* revert antialias removal

* add antialias in BaseImageProcessorFast

* Revert "revert antialias removal"

This reverts commit 5caa0bd8f9f7463b98410c04e6cfe8fef3adee18.

* Revert "add antialias in BaseImageProcessorFast"

This reverts commit 3ae1134780ae236872985523d9c0a444eabcc179.

* update processor for grouping and antialias

* try test_fast_is_faster_than_slow without "skip" or "flaky"

* update checkpoint

* update checkpoint

* use @is_flaky for processor test

* update checkpoint to "apple/DepthPro-hf"

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-02-10 11:32:45 +00:00
c399921965 Paligemma: revert #36084 (#36113)
* revert

* type check
2025-02-10 12:04:24 +01:00
eebd2c972c Chat template: update for processor (#35953)
* update

* we need batched nested input to always process correctly

* update a bit

* fix copies
2025-02-10 09:52:19 +01:00
5bd7694781 Processors: allow tuples of images when checking (#36084)
allow tuples of images
2025-02-10 09:35:13 +01:00
3a3b06ace4 fix MllamaVisionAttention typehint (#35975)
* fix MllamaVisionAttention typehint

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

* Update src/transformers/models/mllama/modeling_mllama.py

Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>

* fix suggestion

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

---------

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
Co-authored-by: Raushan Turganbay <raushan.turganbay@alumni.nu.edu.kz>
2025-02-10 09:17:10 +01:00
6b55046213 [docs] fix not-working example code in perf_infer_gpu_one.md (#36087)
* bug fix

* update memory limit
2025-02-07 12:42:22 -08:00
14ca7f1452 [docs] fix typo (#36080)
typo fix
2025-02-07 12:42:09 -08:00
c361b1e3d9 [docs] fix model checkpoint name (#36075)
update model name
2025-02-07 12:41:52 -08:00
ba29a439ad Fix OS err (#36094)
* Try via local_main_process first

* try 2
2025-02-07 09:57:43 -05:00
a18b7fdd9e Move audio top_k tests to the right file and add slow decorator (#36072)
* Move audio top_k tests to the right file and add slow decorator because we load a real model

* empty commit to trigger tests
2025-02-07 14:32:30 +00:00
014047e1c8 Fix bug in apply_rotary_pos_emb_flashatt: in Qwen2-5-VL (#36065) 2025-02-07 10:43:45 +01:00
006d9249ec Adding RT-DETRv2 for object detection (#34773)
* cookiecutter add rtdetrv2

* make modular working

* working model

* working model

* finalize modular inheritance

* finalize modular inheritance

* Update src/transformers/models/rtdetrv2/modular_rtdetrv2.py

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* update modular and add rename

* remove output ckpt

* define loss_kwargs

* fix CamelCase naming

* fix naming + files

* fix modular and convert file

* additional changes

* fix modular

* fix import error (switch to lazy)

* fix autobackbone

* make style

* add

* update testing

* fix loss

* remove old folder

* fix testing for v2

* update docstring

* fix docstring

* add resnetv2 (with modular bug to fix)

* remove resnetv2 backbone

* fix changes

* small fixes

* remove rtdetrv2resnetconfig

* add rtdetrv2 name to convert

* make style

* Update docs/source/en/model_doc/rt_detr_v2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update src/transformers/models/rt_detr_v2/modular_rt_detr_v2.py

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* fix modular typo after review

* add reviewed changes

* add final review changes

* Update docs/source/en/model_doc/rt_detr_v2.md

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* Update src/transformers/models/rt_detr_v2/__init__.py

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* Update src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py

Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>

* add review changes

* remove rtdetrv2 resnet

* removing this weird project change

* change ckpt name from jadechoghari to author

* implement review and update testing

* update naming and remove wrong ckpt

* name

* make fix-copies

* Fix RT-DETR loss

* Add resources, fix name

* Fix repo in docs

* Fix table name

---------

Co-authored-by: jadechoghari <jadechoghari@users.noreply.huggingface.co>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: qubvel <qubvel@gmail.com>
2025-02-06 19:28:45 +00:00
6246c03260 [docs] fix outdated example code in trainer.md (#36066)
fix bugs
2025-02-06 10:54:22 -08:00
4563ba2c6f Fix StopStringCriteria to handle tokens above len(tokenizer) (#35797)
* Fix StopStringCriteria to handle tokens above len(tokenizer)

This fixes #35244 by clipping token IDs to be within the tokenizer's vocabulary size before performing the embedding lookup. This prevents index errors when model.config.vocab_size > len(tokenizer).

The fix:
1. Adds a clamp operation to ensure token IDs are within bounds
2. Adds a test case to verify the behavior

* Use self.stop_strings instead of stop_strings

* Handle clipping correctly

* make fixup

* Update test to the new embedding vecs

* Use much bigger values in the mismatch test

* Typo fix

* Slight simplification

---------

Co-authored-by: openhands <openhands@all-hands.dev>
2025-02-06 16:53:28 +00:00
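The fix described above boils down to clamping token IDs into the embedding table's range before the lookup. A self-contained illustration of that idea follows; the helper name and tensor sizes are invented for the example and are not the actual `StopStringCriteria` code.

```python
import torch

def safe_embedding_lookup(embeddings: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
    """Clamp IDs to the table size so model-only tokens (vocab_size > len(tokenizer)) cannot index out of range."""
    vocab_size = embeddings.shape[0]
    clipped_ids = token_ids.clamp(0, vocab_size - 1)
    return embeddings[clipped_ids]

table = torch.randn(32_000, 4)            # e.g. len(tokenizer) == 32_000
ids = torch.tensor([5, 31_999, 32_005])   # 32_005 exceeds the tokenizer vocabulary
print(safe_embedding_lookup(table, ids).shape)  # torch.Size([3, 4]) -- no index error
```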
28f73bc307 Fix model kwargs (#35875)
* Save state

* Make a failing test

* Better test

* mpt -> done, many more to go

* Rm extranious

* Bamba

* Bert

* big_bird

* biogpt

* bloom

* codegen

* ctrl

* data2vec

* dbrx

* Through up to Dbrx

* electra

* ernie

* falcon

* Fuyu/persimmon

* Include noop kwargs to base models

* Rebase

* Skip musicgen

* Refactor/skip mllama

* Revert makefile

* Rm file

* Fix PT failing, need to modify rest of loss funcs to not resize

* Propagate some

* Continue

* More

* More options

* Mostly fixed

* Proved that it's the same

* Bloom is good

* Make ability to override loss func possible

* Fixup

* Clean

* Fix xglm

* Quality tests

* Skip OCR2

* Make specific loss for xglm

* Make order the same/line up 1:1

* xglm

* Skip fx output loss bloom model

* Didn't pass in pad_token_id

* Fix quality
2025-02-06 11:35:25 -05:00
1590c66430 Fix words typos in ggml test. (#36060)
Signed-off-by: zhanluxianshen <zhanluxianshen@163.com>
2025-02-06 15:32:40 +00:00
1ce0e2992e Nail in edge case of torch dtype being overridden permanently in the case of an error (#35845)
* Nail in edge case of torch dtype

* Rm unused func

* Apply suggestions from code review

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>

* Refactor tests to only mock what we need, don't introduce injection functions

* SetUp/TearDown

* Do super

---------

Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>
2025-02-06 09:05:23 -05:00
e3458af726 Save checkpoint to temporary directory to handle partial saves during failures (#35580)
Save checkpoint to temporary folder first

Since partial/missing files due to failures throw error during load
2025-02-06 08:48:05 -05:00
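The pattern behind the commit above is write-then-rename: serialize into a temporary sibling directory and only move it to the final path once the write has finished, so a crash never leaves a half-written checkpoint where the loader expects a complete one. A minimal sketch of that pattern (not the Trainer's actual implementation; the file name is an assumption):

```python
import os
import shutil
import tempfile

import torch

def save_checkpoint_atomically(state: dict, output_dir: str) -> None:
    parent = os.path.dirname(output_dir) or "."
    os.makedirs(parent, exist_ok=True)
    tmp_dir = tempfile.mkdtemp(dir=parent)            # temporary sibling directory
    try:
        torch.save(state, os.path.join(tmp_dir, "training_state.pt"))
        if os.path.isdir(output_dir):                 # drop any stale partial checkpoint
            shutil.rmtree(output_dir)
        os.replace(tmp_dir, output_dir)               # rename into place only after a full write
    except Exception:
        shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

save_checkpoint_atomically({"global_step": 100}, "checkpoints/checkpoint-100")
```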
3dd1de39bb Paligemma: fix generation with Gemma2 (#36044)
* fix paligemma

* nit

* use `kwargs` in models that can load any LM
2025-02-06 14:31:32 +01:00
dce9970884 Update test_flash_attn_2_can_dispatch_composite_models (#36050)
* update

* update

* update

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-06 12:09:49 +01:00
37faa97d9b Fix repo consistency (#36063)
* fix 1

* fix 2

* fix modular

* simplify at the same time

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Cyril Vallez <cyril.vallez@gmail.com>
2025-02-06 11:53:15 +01:00
ed98ad35e6 Fix usage of unpad_input function (#35925)
Fix usage of the unpad_input function

See https://github.com/huggingface/transformers/issues/35899

In the [commit](cdbbe844b1) the return type of `unpad_input` was changed.
The code now supports both older and newer versions.

Co-authored-by: Pavel Gein <pavel.gein@gmail.com>
2025-02-06 11:33:42 +01:00
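A hedged sketch of the compatibility shim described above: accept whichever tuple length `unpad_input` returns and use only the first four values. That newer flash-attn releases append an extra trailing element is an assumption stated in the comments, not verified here.

```python
def call_unpad_input(unpad_input, hidden_states, attention_mask):
    """Call flash-attn's unpad_input while tolerating both return conventions."""
    outputs = unpad_input(hidden_states, attention_mask)
    # Older releases: (x_unpad, indices, cu_seqlens, max_seqlen).
    # Newer releases (assumption): the same four values plus an extra trailing element.
    # Either way, only the first four are needed downstream.
    x_unpad, indices, cu_seqlens, max_seqlen = outputs[:4]
    return x_unpad, indices, cu_seqlens, max_seqlen
```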
7aee036e54 Iterative generation using Input embeds and past_key_values (#35890)
* Iterative generation using input embeds

* ruff fix

* Added Testcase

* Updated comment

* ♻️ Refactored testcase

* Skip test for these models

* Continue generation using input embeds and cache

* Skip generate_continue_from_embeds test

* Refactor `prepare_input_for_generation` func

* Continue generation using input embeds and cache

* Modular changes fix

* Overwrite 'prepare_inputs_for_generation' function
2025-02-06 11:06:05 +01:00
b5f327f350 Add Qwen2VLImageProcessorFast into Qwen2VLProcessor (#35987)
* Add `Qwen2VLImageProcessorFast` into `Qwen2VLProcessor`

* Use `AutoImageProcessor` instead

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-02-06 10:03:09 +01:00
0de15c988b Fix Audio Classification Pipeline top_k Documentation Mismatch and Bug #35736 (#35771)
* added condition for top_k Doc mismatch fix

* initialization of test file for top_k changes

* added test for returning all labels

* added test for few labels

* tests/test_audio_classification_top_k.py

* final fix

* ruff fix

---------

Co-authored-by: sambhavnoobcoder <indosambahv@gmail.com>
2025-02-05 16:25:08 +00:00
694aaa7fbc Fix how we compute the final non-padding token for ForSequenceClassification models (#35911)
* Fix how we compute the final non-padding token for Gemma (and probably other models)

* .size() -> .shape[]

* Propagating changes to other models

* Propagating changes to other models

* Change it for all ForSequenceClassification models

* Fix batch dim

* More TF fixes

* Copy the TF fix around as well

* Correct layer name for TFCTRL

* Cleaner .to()

* Clean up the nested if-else

* Use argmax() instead of .max().values
2025-02-05 16:23:33 +00:00
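The core of the fix above is computing the index of each sequence's last non-padding token with an `argmax` over masked position indices instead of `.max().values`. A small sketch of that computation (toy tensors, not the modeling code itself):

```python
import torch

input_ids = torch.tensor([[5, 6, 7, 0, 0],
                          [8, 9, 0, 0, 0]])
pad_token_id = 0

non_pad_mask = (input_ids != pad_token_id).int()
token_indices = torch.arange(input_ids.shape[-1])
# Masked positions contribute 0, so argmax lands on the last non-padding index per row.
last_non_pad = (token_indices * non_pad_mask).argmax(dim=-1)
print(last_non_pad)  # tensor([2, 1]) -> the logits at these positions feed the classification head
```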
531d1511f5 [docs] no hard-coding cuda (#36043)
make device-agnostic
2025-02-05 08:22:33 -08:00
7399f8021e [docs] fix bugs in the bitsandbytes documentation (#35868)
* fix doc

* update model
2025-02-05 08:21:20 -08:00
0a1a8e3c7e [docs] no hard coding cuda as bnb has multi-backend support (#35867)
* change cuda to DEVICE

* Update docs/source/en/llm_tutorial.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-02-05 08:20:02 -08:00
9dc1efa5d4 DeepSpeed github repo move sync (#36021)
deepspeed github repo move
2025-02-05 08:19:31 -08:00
c772bff31a add support for empty list as input to create_model_card (#36042)
handle cases where it is a list
2025-02-05 13:29:17 +01:00
315a9f494e Add XPU type for work-around -inf mask causing sdpa NaN issue in modeling files (#35647)
* add xpu for unmask

* change modular for generated matching

* add lastest modeling for helium
2025-02-05 13:28:31 +01:00
d8080d55c7 Fix synced multi-GPU generation with LLMs and VLMs (#35893)
* Fix synced multi-GPU generation

* fix copies

---------

Co-authored-by: Davit Manukyan <ManukyanD>
Co-authored-by: Raushan Turganbay <raushan@huggingface.co>
2025-02-05 11:15:11 +01:00
4831a94ee7 Fix Gemma2 synced multi-GPU generation (#35232)
* Fix Gemma2 synced multi-GPU generation

* Fix import ordering in modular_gemma2.py
2025-02-05 10:07:50 +01:00
fa56dcc2ab Refactoring of ImageProcessorFast (#35069)
* add init and base image processing functions

* add add_fast_image_processor to transformers-cli

* add working fast image processor clip

* add fast image processor to doc, working tests

* remove "to be implemented" SigLip

* fix unprotected import

* fix unprotected vision import

* update ViTImageProcessorFast

* increase threshold for slow/fast equivalence

* add fast img blip

* add fast class in tests with cli

* improve cli

* add fast image processor convnext

* add LlavaPatchingMixin and fast image processor for llava_next and llava_onevision

* add device kwarg to ImagesKwargs for fast processing on cuda

* cleanup

* fix unprotected import

* group images by sizes and add batch processing

* Add batch equivalence tests, skip when center_crop is used

* cleanup

* update init and cli

* fix-copies

* refactor convnext, cleanup base

* fix

* remove patching mixins, add piped torchvision transforms for ViT

* fix unbatched processing

* fix f strings

* protect imports

* change llava onevision to class transforms (test)

* fix convnext

* improve formatting (following Pavel review)

* fix handling device arg

* improve cli

* fix

* fix inits

* Add distinction between preprocess and _preprocess, and support for arbitrary kwargs through valid_extra_kwargs

* uniformize qwen2_vl fast

* fix docstrings

* add add fast image processor llava

* remove min_pixels max_pixels from accepted size

* nit

* nit

* refactor fast image processors docstrings

* cleanup and remove fast class transforms

* update add fast image processor transformers cli

* cleanup docstring

* uniformize pixtral fast and  make _process_image explicit

* fix prepare image structure llava next/onevision

* Use typed kwargs instead of explicit args

* nit fix import Unpack

* clearly separate pops and gets in base preprocess. Use explicit typed kwargs

* make qwen2_vl preprocess arguments hashable
2025-02-04 17:52:31 -05:00
8d73a38606 Add DAB-DETR for object detection (#30803)
* initial commit

* encoder+decoder layer changes WIP

* architecture checks

* working version of detection + segmentation

* fix modeling outputs

* fix return dict + output att/hs

* found the position embedding masking bug

* pre-training version

* added image processors

* typo in init.py

* iterupdate set to false

* fixed num_labels in class_output linear layer bias init

* multihead attention shape fixes

* test improvements

* test update

* dab-detr model_doc update

* dab-detr model_doc update2

* test fix:test_retain_grad_hidden_states_attentions

* config file clean and renaming variables

* config file clean and renaming variables fix

* updated convert_to_hf file

* small fixes

* style and quality checks

* return_dict fix

* Merge branch main into add_dab_detr

* small comment fix

* skip test_inputs_embeds test

* image processor updates + image processor test updates

* check copies test fix update

* updates for check_copies.py test

* updates for check_copies.py test2

* tied weights fix

* fixed image processing tests and fixed shared weights issues

* added numpy nd array option to get_Expected_values method in test_image_processing_dab_detr.py

* delete prints from test file

* SafeTensor modification to solve HF Trainer issue

* removing the safetensor modifications

* make fix-copies and HF upload have been added.

* fixed index.md

* fixed repo consistency

* style fix and DabDetrImageProcessor docstring update

* requested modifications after the first review

* Update src/transformers/models/dab_detr/image_processing_dab_detr.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* repo consistency has been fixed

* update copied NestedTensor function after main merge

* Update src/transformers/models/dab_detr/modeling_dab_detr.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* temp commit

* temp commit2

* temp commit 3

* unit tests are fixed

* fixed repo consistency

* updated expected_boxes variable values based on related notebook results in DABDETRIntegrationTests file.

* temporary config modifications and repo consistency fixes

* Put dilation parameter back to config

* pattern embeddings have been added to the rename_keys method

* add dilation comment to config + add as an exception in check_config_attributes SPECIAL CASES

* delete FeatureExtractor part from docs.md

* requested modifications in modeling_dab_detr.py

* [run_slow] dab_detr

* deleted last segmentation code part, updated conversion script and changed the hf path in test files

* temp commit of requested modifications

* temp commit of requested modifications 2

* updated config file, resolved codepaths and refactored conversion script

* updated decoder layer block types and refactored conversion script

* style and quality update

* small modifications based on the request

* attentions are refactored

* removed loss functions from modeling file, added loss function to lossutils, tried to move the MLP layer generation to config but it failed

* deleted imageprocessor

* fixed conversion script + quality and style

* fixed config_att

* [run_slow] dab_detr

* changing model path in conversion file and in test file

* fix Decoder variable naming

* testing the old loss function

* switched back to the new loss function and testing with the odl attention functions

* switched back to the new last good result modeling file

* moved back to the version when I asked the review

* missing new line at the end of the file

* old version test

* turn back to newest model version but change image processor

* style fix

* style fix after merge main

* [run_slow] dab_detr

* [run_slow] dab_detr

* added device and type for head bias data part

* [run_slow] dab_detr

* fixed model head bias data fill

* changed test_inference_object_detection_head assertTrues to torch test assert_close

* fixes part 1

* quality update

* self.bbox_embed in decoder has been restored

* changed Assert true torch closeall methods to torch testing assertclose

* modelcard markdown file has been updated

* deleted intermediate list from decoder module

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-02-04 17:28:27 +00:00
fe52679e74 Update tests regarding attention types after #35235 (#36024)
* update

* update

* update

* dev-ci

* more changes

* fix

* fix

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-04 18:04:47 +01:00
014a1fa2c8 CircleCI with python 3.9 (#36027)
update docker files

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-04 17:40:20 +01:00
c98b467905 feat(ci): ignore trufflehog unverified results (#36031) 2025-02-04 16:39:36 +01:00
9855acb9c5 Hotfix for self-comment-ci.yml (#36030)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-04 16:28:05 +01:00
9f486badd5 Display warning for unknown quants config instead of an error (#35963)
* add supports_quant_method check

* fix

* add test and fix suggestions

* change logic slightly

---------

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
2025-02-04 15:17:01 +01:00
f19bfa50e7 Comment bot CI for other jobs (generation / quantization) (#35341)
* quantization CI on PRs

* fix

* fix

* add 2 members

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-02-04 14:42:51 +01:00
a93b80588b Fix RMSNormGated in Zamba2 (#35943)
* First commit

* Finish model implementation

* First commit

* Finish model implementation

* Register zamba2

* generated modeling and configuration

* generated modeling and configuration

* added hybrid cache

* fix attention_mask in mamba

* dropped unused loras

* fix flash2

* config docstrings

* fix config and fwd pass

* make fixup fixes

* text_modeling_zamba2

* small fixes

* make fixup fixes

* Fix modular model converter

* added inheritances in modular, renamed zamba cache

* modular rebase

* new modular conversion

* fix generated modeling file

* fixed import for Zamba2RMSNormGated

* modular file cleanup

* make fixup and model tests

* dropped inheritance for Zamba2PreTrainedModel

* make fixup and unit tests

* Add inheritance of rope from GemmaRotaryEmbedding

* moved rope to model init

* drop del self.self_attn and del self.feed_forward

* fix tests

* renamed lora -> adapter

* rewrote adapter implementation

* fixed tests

* Fix torch_forward in mamba2 layer

* Fix torch_forward in mamba2 layer

* Fix torch_forward in mamba2 layer

* Dropped adapter in-place sum

* removed rope from attention init

* updated rope

* created get_layers method

* make fixup fix

* make fixup fixes

* make fixup fixes

* update to new attention standard

* update to new attention standard

* make fixup fixes

* minor fixes

* cache_position

* removed cache_position, position_ids, use_cache

* remove config from modular

* removed config from modular (2)

* import apply_rotary_pos_emb from llama

* fixed rope_kwargs

* Instantiate cache in Zamba2Model

* fix cache

* fix @slow decorator

* small fix in modular file

* Update docs/source/en/model_doc/zamba2.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* several minor fixes

* inherit mamba2decoder fwd and drop position_ids in mamba

* removed docstrings from modular

* reinstate zamba2 attention decoder fwd

* use regex for tied keys

* Revert "use regex for tied keys"

This reverts commit 9007a522b1f831df6d516a281c0d3fdd20a118f5.

* use regex for tied keys

* add cpu to slow forward tests

* dropped config.use_shared_mlp_adapter

* Update docs/source/en/model_doc/zamba2.md

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* re-convert from modular

* extended Zamba2RMSNormGated to n_groups>1

* removed einops import

* set _supports_sdpa = True

* add use_mem_eff_path flag for fused mamba2 fwd

* added docstring for use_mem_eff_path flag

---------

Co-authored-by: root <root@node-2.us-southcentral1-a.compute.internal>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2025-02-04 14:28:04 +01:00
bc9a6d8302 Fix device mismatch error in Whisper model during feature extraction (#35866)
* Fix device mismatch error in whisper feature extraction

* Set default device

* Address code review feedback

---------

Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com>
2025-02-04 12:23:08 +01:00
9afb904b15 Refactor (and fix) gpt_neox (#35610)
* start a nice modular

* Update modular_gpt_neox.py

* Update modular_gpt_neox.py

* Update modular_gpt_neox.py

* Update modular_gpt_neox.py

* update

* Update modular_gpt_neox.py

* convert

* fix attribute

* fix attrs

* oups

* fix

* fix

* fix

* fix

* fix

* fix order to pass test (see with accelerate team)

* trigger CIs

* modular

* update

* up

* Update test_modeling_gpt_neox.py

* Update test_modeling_gpt_neox.py

* trigger CIs

* correctly pass arg

* simplify

* remove key warning

* update tp -> it's compatible since the view is before

* trigger CIs
2025-02-04 11:18:43 +01:00
ad30598923 Update Mistral converter (#35967)
* Update convert_mistral_weights_to_hf.py

* Update convert_mistral_weights_to_hf.py

* update

* style

* move it to integrations

* style

* trigger CIs

* trigger CIs
2025-02-04 11:13:12 +01:00
b1954fd64a layernorm_decay_fix (#35927)
* layernorm_decay_fix

* W293 fix

* ruff format fix

* black format

* ruff format

* erase last layer

* add test_get_parameter_names_rmsnorm

* rmsnorm fix
2025-02-04 11:01:49 +01:00
2ba040a71f apply_chat_template: consistent behaviour for return_assistant_tokens_mask=True return_tensors=True (#35582)
* apply_chat_template: consistent return_tensors behaviour with return_assistant_tokens_mask flag

* test_chat_template_return_assistant_tokens_mask: support tokenizers with no attention mask

* test_chat_template_return_assistant_tokens_mask: skip tokenizers with no padding token

* test_chat_template_return_assistant_tokens_mask: force tokenizer padding_side=right

---------

Co-authored-by: Eduard Allakhverdov <goncharova@airi.net>
Co-authored-by: d.tarasov <d.tarasov@airi.net>
2025-02-04 10:27:52 +01:00
9c02cb6233 Fix custom kernel for DeformableDetr, RT-Detr, GroundingDINO, OmDet-Turbo in PyTorch 2.6.0 (#35979)
Updates type().is_cuda() -> .is_cuda(); .data<> -> .data_ptr<>
2025-02-04 09:07:25 +00:00
5d75a25b03 Qwen2-VL: fix rope delta calculation (#36013)
* fix rope deltas calculation

* add test

* style
2025-02-04 09:48:29 +01:00
e284c7e954 Update Granite Vision Model Path / Tests (#35998)
* Update granite vision model path

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>

* Enable granite vision test

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>

---------

Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
2025-02-03 20:06:03 +01:00
9d2056f12b Add mean_resizing for every VLMs' resize_token_embeddings() (#35717)
* refine all resize_token_embedding()

* ruff format

* hotfix
2025-02-03 15:03:49 +01:00
7eecdf2a86 Update-tp test (#35844)
* update test for now

* up

* cleanup

* update todo
2025-02-03 09:37:02 +01:00
62db3e6ed6 use torch 2.6 for daily CI (#35985)
use torch 2.6 for CI

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-31 18:58:23 +01:00
2b46943195 Add GOT-OCR 2.0 to Transformers (#34721)
* init modular got_ocr2

* Get correct got_ocr architecture

* add processing

* run modular with processing

* add working inference

* apply modular

* Refactor and fix style

* Refactor, cleanup, fix style

* fix init order

* Fix docs

* add base modeling tests

* fix style and consistency

* rename doc file

* fix repo consistency

* fix inference with box

* add image processing and support for crop_to_multi_page

* Fix batch inference

* add tests

* fixup

* fix slow test

* fix docstrings

* Add model doc

* update to new init

* fix input autocast pixel_values dtype

* update doc

* move doc to multimodal

* Reformat crop_image_to_patches and add docstrings

* Fix example in forward docstring

* Address Pablo review

* [run slow] got_ocr2

* remove defaults defined twice

* apply modular

* add torch_device to integration tests

* update modular

* follow-up Pavel review

* add device variable in doc

* fix doc multi-page

* Force eager attention for vision encoder to avoid attn implementation conflict

* revert qwen2vl doc changes

* use Qwen2ForCausalLM instead of Qwen2Model

* make fixup

* refactor gotocr2 to llava style

* uniformize function names and reduce checks

* final nits

* fix pixel_values dtype error

* change checkpoint names

* fix modular
2025-01-31 11:28:13 -05:00
5bbee12ac9 [Moshi] disable automatic compilation if the model can't compile (#35992)
moshi cant compile
2025-01-31 15:53:06 +00:00
e6f4a4ebbf [Moonshine] compute head_dim_padding at init (#35984)
compute head_dim_padding at init
2025-01-31 14:26:52 +01:00
d7188ba600 Add support for nested images to LLava and VipLLava (#35558)
* move make_flat_list_of_images and make_batched_videos to image_utils

* remove unnecessary is_vision_available

* move make_nested_list_of_images to image_utils

* fix fast pixtral image processor

* fix import mllama

* fix make_nested_list_of_images

* add tests

* convert 4d arrays/tensors to list

* add test_make_batched_videos

* add support for nested batches of videos

* fix image processing qwen2vl
2025-01-30 16:49:20 -05:00
e4227eb4d4 Handle empty change indices in SAM's mask to rle conversion (#35665)
* Handle empty change indices in RLE conversion for masks

* [test] Add unit tests for RLE encoding of masks in SamProcessor

* [test] Update RLE conversion tests to use TensorFlow implementation

* [test] Fix formatting in SamProcessorTest according to check_code_quality action

* [test] Fix formatting in SamProcessorTest according to check_code_quality

* [test] Refactored rle test cases into one test and used tf tensors in tf test cases

* [test] Fix: removed self parameter from refactored methods

* [test] Removed nested methods in run-length encoding tests for PyTorch and TensorFlow

* [test] Added description to individual to run-length encoding tests for PyTorch and TensorFlow.
2025-01-30 19:08:38 +00:00
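The edge case handled above is a mask that never changes value, so the list of change indices is empty. Below is a simplified, hedged sketch of uncompressed run-length encoding with that case covered; SAM's real implementation differs in details such as scan order and output format.

```python
import torch

def mask_to_rle(mask: torch.Tensor) -> list:
    """Uncompressed RLE of a binary mask, starting the count with zeros by convention."""
    flat = mask.flatten()
    change_indices = torch.where(flat[1:] != flat[:-1])[0] + 1   # positions where the value flips
    if change_indices.numel() == 0:                              # uniform mask: one single run
        return [flat.numel()] if flat[0] == 0 else [0, flat.numel()]
    points = torch.cat([torch.tensor([0]), change_indices, torch.tensor([flat.numel()])])
    runs = (points[1:] - points[:-1]).tolist()
    return runs if flat[0] == 0 else [0] + runs

print(mask_to_rle(torch.zeros(4, 4, dtype=torch.int64)))  # [16]    -- previously the empty-change case
print(mask_to_rle(torch.ones(4, 4, dtype=torch.int64)))   # [0, 16]
```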
47bd4296d6 not to use A100 for benchmark.yml (#35974)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-30 18:55:36 +01:00
693328f2bc Support batching for UsefulSensors Moonshine (#35922)
* Add support for attention masking in moonshine.

Tested against Open ASR Leaderboard with batch size 256.

* Update comments and ensure attention masks are passed everywhere.

Perform attention mask downsampling inside of moonshine forward call.

* Hide padding behind conditional. Fix encoder/decoder masking.

- Correctly pipe encoder attention mask into decoder
- Add correct scaling factor if one is not already provided.
- Fix formatting with ruff

* Add auto generated modeling_moonshine file.

* Update formatting in generated model file.

* Address review comments.

* Fix typo.

* Add `pad_head_dim_to_multiple_of` to moonshine config.

* Correct args order for MoonshineConfig.

* Update configuration moonshine too.

* Update src/transformers/models/moonshine/modular_moonshine.py

* Update src/transformers/models/moonshine/configuration_moonshine.py

---------

Co-authored-by: eustlb <94853470+eustlb@users.noreply.github.com>
2025-01-30 17:08:07 +01:00
5757681837 Less flaky for TimmBackboneModelTest::test_batching_equivalence (#35971)
* fix

* remove is_flaky

* fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-30 16:56:26 +01:00
e320d5542e Revert p_mask to a list in DQA pipeline (#35964)
* p_mask back to being a list

* Remove breakpoint
2025-01-30 15:37:59 +00:00
365fecb4d0 Whisper: fix static cache CI (#35852)
* fix

* remove overriden method

* small change
2025-01-30 12:43:00 +01:00
9725e5be2f Pixtral: vectorize patch embeddings and enable tests (#35122)
* initial POC

* - batch mix feature

* fix tests

* fix tests

* make style

* do not skip and instead fix tests

* update

* return back the test

* correct text with the correct ckpt
2025-01-30 12:40:18 +01:00
8bc4c89ee9 [bart] minor test fixes (#35965)
fix tests
2025-01-30 10:00:11 +00:00
19f2ec80cf Fix is_causal being a tensor (#35791)
* fix is_causal being a tensor

* convert in sdpa attention only when  jit tracing
2025-01-30 09:22:33 +01:00
7547f55e5d fix iterator overflow when gradient accumulation is 1 (#35960) 2025-01-29 14:45:09 -05:00
4d3b1076a1 [generate] move max time tests (#35962)
* move max time tests to their right place

* move test to the right place
2025-01-29 17:56:46 +00:00
4d1d489617 Update README.md (#35958)
There should be a dot after pip install .
2025-01-29 15:46:26 +00:00
f0ae65c198 [tests] further fix Tester object has no attribute '_testMethodName' (#35781)
* bug fix

* update with more cases

* more entries

* Fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-29 16:05:33 +01:00
ec7790f0d3 update docker file transformers-pytorch-deepspeed-latest-gpu (#35940)
update docker file for deepspeed

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-29 16:01:27 +01:00
5d257111c1 Trainer Refactor: Part 1 (#35567)
* start

* So far: 30%

* Small fix

* Continuing update

* Continuing

* Forgot to check if not None

* Continuing refactor

* Fix if else

* Fix ref

* Should make tests pass

* Keep grad norm same

* Document

* Apply suggestions from code review

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Err instead of info for logging RNG state error

* Separate out to func

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-01-29 09:50:54 -05:00
23d782ead2 Output dicts support in text generation pipeline (#35092)
* Support for generate_argument: return_dict_in_generate=True, instead of returning an error

* fix: call test with return_dict_in_generate=True

* fix: Only import torch if it is present

* update: Encapsulate output_dict changes

* fix: added back original comments

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-01-29 14:44:46 +00:00
cf90404807 Fix flaky test_assisted_decoding_matches_greedy_search (#35951)
fix

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-29 14:50:07 +01:00
692afa102d Update squad_convert_example_to_features to work with numpy v2 (#35955)
* Fix

* Fix

* Fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-29 14:33:06 +01:00
c600e89f5c Update unwrap_and_save_reload_schedule to use weights_only=False (#35952)
* fix

* Fix

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-01-29 14:30:57 +01:00
42c8ccfd4c fix test_generated_length_assisted_generation (#34935)
fix test_generated_length_assisted_generation
2025-01-29 12:03:45 +00:00
ec7afad609 use torch constraints to check if covariance is positive definite during mean resizing. (#35693)
* use torch constraints to check for psd

* small nit

* Small change

* Small change for the ci

* nit
2025-01-28 17:33:42 +01:00
61cbb723fc Remove INC notebook reference in documentation (#35936)
remove INC notebook in documentation
2025-01-28 17:10:02 +01:00
478c4f2d0d fix(FA): QKV not being casted to target_dtype for FA with dpo lora (#35834)
fix(FA): QKV not being casted to target_dtype due to dtype check
2025-01-28 17:06:56 +01:00
ece8c42488 Test: generate with torch.compile(model.forward) as a fast test (#34544) 2025-01-28 14:10:38 +00:00
f48ecd7608 Fix TP initialization (#35860)
* fix tp

* Update modeling_utils.py

* style

* style

* Update test_tp.py

* Update test_tp.py

* style

* Update test_tp.py

* Update test_tp.py

* Update test_tp.py

* Update test_tp.py
2025-01-28 15:07:37 +01:00
f85ba20449 Qwen-2-5-VL: fix CI (#35935)
fix
2025-01-28 14:51:57 +01:00
3f860dba55 Fix mask slicing for models with HybridCache (#35681)
* correctly slice

* check mask

* Update modular_gemma2.py

* fix

* add tests

* fix typo

* finally fix mask slicing

* Finally correctly slice in all cases!!

* add test for all attention functions

* small fix in tests

* trick around dynamo tracing issue

* last update

* more robust

* kwargs propagation

* make it explicit for checkpointing

* apply modular
2025-01-28 14:35:00 +01:00
466 changed files with 25924 additions and 5951 deletions


@ -18,7 +18,8 @@ jobs:
name: Benchmark
strategy:
matrix:
group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus]
# group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus] (A100 runner is not enabled)
group: [aws-g5-4xlarge-cache]
runs-on:
group: ${{ matrix.group }}
if: |


@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:
@ -98,6 +98,7 @@ jobs:
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
outputs:
models: ${{ steps.models_to_run.outputs.models }}
quantizations: ${{ steps.models_to_run.outputs.quantizations }}
steps:
- uses: actions/checkout@v4
with:
@ -121,6 +122,8 @@ jobs:
python -m pip install GitPython
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt
echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt
echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV
- name: Show models to test
id: models_to_run
@ -128,10 +131,12 @@ jobs:
echo "${{ env.models }}"
echo "models=${{ env.models }}" >> $GITHUB_ENV
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
echo "${{ env.quantizations }}"
echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT
reply_to_comment:
name: Reply to the comment
if: ${{ needs.get-tests.outputs.models != '[]' }}
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-pr-number, get-tests]
permissions:
pull-requests: write
@ -141,17 +146,18 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.MODELS }} ..."
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
create_run:
name: Create run
if: ${{ needs.get-tests.outputs.models != '[]' }}
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-sha, get-tests, reply_to_comment]
permissions:
statuses: write
@ -173,20 +179,20 @@ jobs:
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"
run_models_gpu:
name: Run all tests for the model
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
name: Run all tests for the model
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |
@ -206,20 +212,20 @@ jobs:
- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H
- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@ -279,9 +285,106 @@ jobs:
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
run_quantization_torch_gpu:
name: Run all tests for a quantization
if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H
- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
- name: NVIDIA-SMI
run: |
nvidia-smi
- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV
- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py
- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze
- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
update_run_status:
name: Update Check Run Status
needs: [get-sha, create_run, run_models_gpu]
needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
permissions:
statuses: write
if: ${{ always() && needs.create_run.result == 'success' }}
@ -289,16 +392,17 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
steps:
- name: Get `run_models_gpu` job status
run: |
echo "${{ needs.run_models_gpu.result }}"
if [ "${{ needs.run_models_gpu.result }}" = "cancelled" ]; then
echo "STATUS=failure" >> $GITHUB_ENV
elif [ "${{ needs.run_models_gpu.result }}" = "skipped" ]; then
echo "${{ needs.run_quantization_torch_gpu.result }}"
echo $STATUS_OK
if [ "$STATUS_OK" = "true" ]; then
echo "STATUS=success" >> $GITHUB_ENV
else
echo "STATUS=${{ needs.run_models_gpu.result }}" >> $GITHUB_ENV
echo "STATUS=failure" >> $GITHUB_ENV
fi
- name: Update PR commit statuses


@ -366,7 +366,7 @@ jobs:
run: |
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
- name: NVIDIA-SMI


@ -16,3 +16,5 @@ jobs:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main
with:
extra_args: --results=verified,unknown


@ -283,7 +283,7 @@ If you'd like to play with the examples or need the bleeding edge of the code an
```
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install
pip install .
```
### With conda


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
ARG REF=main


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root


@ -1,4 +1,4 @@
FROM python:3.10-slim
FROM python:3.9-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
RUN echo ${REF}


@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.5.1'
ARG PYTORCH='2.6.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.


@ -48,8 +48,8 @@ RUN python3 -m pip uninstall -y torch-tensorrt apex
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
RUN python3 -m pip install -U "itsdangerous<2.1.0"

View File

@ -1,5 +1,5 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
FROM nvcr.io/nvidia/pytorch:23.04-py3
FROM nvcr.io/nvidia/pytorch:23.11-py3
LABEL maintainer="Hugging Face"
ARG DEBIAN_FRONTEND=noninteractive
@ -15,10 +15,6 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip
ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# Install Rust for Tokenizers
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y
ENV PATH="$HOME/.cargo/bin:${PATH}"
RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
# Install latest release PyTorch

View File

@ -34,8 +34,8 @@ RUN python3 -m pip uninstall -y torch-tensorrt apex
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
## For `torchdynamo` tests

View File

@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# If set to nothing, will install the latest version
ARG PYTORCH='2.5.1'
ARG PYTORCH='2.6.0'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.

View File

@ -130,7 +130,6 @@
| Notebook | Description | | |
|:----------|:-------------|:-------------|------:|
| [How to quantize a model with ONNX Runtime for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| Shows how to apply static and dynamic quantization to a model with [ONNX Runtime](https://github.com/microsoft/onnxruntime) for any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)|
| [How to quantize a model with Intel Neural Compressor for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| Shows how to apply static, dynamic, and quantization-aware training quantization to a model with [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) for any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)|
| [How to fine-tune a model on text classification with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| Shows how to preprocess the data and fine-tune a model on any GLUE task with [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)|
| [How to fine-tune a model on summarization with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| Shows how to preprocess the data and fine-tune a model on XSUM with [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)|

View File

@ -643,6 +643,8 @@
title: ConvNeXTV2
- local: model_doc/cvt
title: CvT
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit
@ -651,6 +653,8 @@
title: Depth Anything
- local: model_doc/depth_anything_v2
title: Depth Anything V2
- local: model_doc/depth_pro
title: DepthPro
- local: model_doc/deta
title: DETA
- local: model_doc/detr
@ -707,6 +711,8 @@
title: ResNet
- local: model_doc/rt_detr
title: RT-DETR
- local: model_doc/rt_detr_v2
title: RT-DETRv2
- local: model_doc/segformer
title: SegFormer
- local: model_doc/seggpt
@ -872,6 +878,8 @@
title: FLAVA
- local: model_doc/git
title: GIT
- local: model_doc/got_ocr2
title: GOT-OCR2
- local: model_doc/grounding-dino
title: Grounding DINO
- local: model_doc/groupvit

View File

@ -30,7 +30,7 @@ DeepSpeed compiles CUDA C++ code and it can be a potential source of errors when
<Tip>
For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team.
For any other installation issues, please [open an issue](https://github.com/deepspeedai/DeepSpeed/issues) with the DeepSpeed team.
</Tip>
@ -89,7 +89,7 @@ sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++
If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, you can try to prebuild the DeepSpeed modules before installing them. To make a local build for DeepSpeed:
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
@ -141,7 +141,7 @@ It is also possible to not specify `TORCH_CUDA_ARCH_LIST` and the build program
For training on multiple machines with the same setup, you'll need to make a binary wheel:
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \

View File

@ -28,7 +28,7 @@ This guide will walk you through how to deploy DeepSpeed training, the features
## Installation
DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed [installation details](https://www.deepspeed.ai/tutorials/advanced-install/) or the GitHub [README](https://github.com/microsoft/deepspeed#installation)).
DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed [installation details](https://www.deepspeed.ai/tutorials/advanced-install/) or the GitHub [README](https://github.com/deepspeedai/DeepSpeed#installation)).
<Tip>
@ -114,10 +114,10 @@ DeepSpeed works with the [`Trainer`] class by way of a config file containing al
<Tip>
Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. You can also find more practical examples of various DeepSpeed configuration examples on the [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) repository or the main [DeepSpeed](https://github.com/microsoft/DeepSpeed) repository. To quickly find specific examples, you can:
Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. You can also find more practical examples of various DeepSpeed configuration examples on the [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) repository or the main [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) repository. To quickly find specific examples, you can:
```bash
git clone https://github.com/microsoft/DeepSpeedExamples
git clone https://github.com/deepspeedai/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
# find examples with the Lamb optimizer
@ -303,7 +303,7 @@ For more information about initializing large models with ZeRO-3 and accessing t
[ZeRO-Infinity](https://hf.co/papers/2104.07857) allows offloading model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.
Depending on the CPU and/or NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none. You should also make sure the `nvme_path` is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, [run a benchmark](https://github.com/microsoft/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.
Depending on the CPU and/or NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none. You should also make sure the `nvme_path` is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, [run a benchmark](https://github.com/deepspeedai/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.
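As a minimal sketch of what the offloading knobs look like (the paths and `aio` values below are illustrative assumptions, not tuned recommendations), a ZeRO-3 config with NVMe offload can be built as a Python dict and passed directly to [`TrainingArguments`]:
```py
from transformers import TrainingArguments

# Illustrative ZeRO-3 + NVMe offload sketch; tune nvme_path and the aio values for your hardware.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "nvme", "nvme_path": "/local_nvme"},
        "offload_param": {"device": "nvme", "nvme_path": "/local_nvme"},
    },
    "aio": {"block_size": 262144, "queue_depth": 32, "thread_count": 1},
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
}

training_args = TrainingArguments(output_dir="output", deepspeed=ds_config)
```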
The example ZeRO-3/Infinity configuration file below sets most of the parameter values to `auto`, but you could also manually add these values.
@ -1157,7 +1157,7 @@ For Transformers>=4.28, if `synced_gpus` is automatically set to `True` if multi
## Troubleshoot
When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obvious and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the [DeepSpeed repository](https://github.com/microsoft/DeepSpeed).
When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obvious and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the [DeepSpeed repository](https://github.com/deepspeedai/DeepSpeed).
For issues related to the Transformers integration, please provide the following information:
@ -1227,7 +1227,7 @@ This means the DeepSpeed loss scaler is unable to find a scaling coefficient to
## Resources
DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it more accessible to everyone. To learn more about DeepSpeed, feel free to read the [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [documentation](https://www.deepspeed.ai/getting-started/), and [GitHub repository](https://github.com/microsoft/deepspeed).
DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it more accessible to everyone. To learn more about DeepSpeed, feel free to read the [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [documentation](https://www.deepspeed.ai/getting-started/), and [GitHub repository](https://github.com/deepspeedai/DeepSpeed).
The following papers are also a great resource for learning more about ZeRO:

View File

@ -110,6 +110,7 @@ Flax), PyTorch, and/or TensorFlow.
| [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ |
| [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ |
| [CvT](model_doc/cvt) | ✅ | ✅ | ❌ |
| [DAB-DETR](model_doc/dab-detr) | ✅ | ❌ | ❌ |
| [DAC](model_doc/dac) | ✅ | ❌ | ❌ |
| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ |
| [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ |
@ -122,6 +123,7 @@ Flax), PyTorch, and/or TensorFlow.
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ |
| [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ |
| [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ |
| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ |
| [DETA](model_doc/deta) | ✅ | ❌ | ❌ |
| [DETR](model_doc/detr) | ✅ | ❌ | ❌ |
| [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ |
@ -161,6 +163,7 @@ Flax), PyTorch, and/or TensorFlow.
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLM](model_doc/glm) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GOT-OCR2](model_doc/got_ocr2) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |
| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ |
| [GPT NeoX Japanese](model_doc/gpt_neox_japanese) | ✅ | ❌ | ❌ |
@ -303,6 +306,7 @@ Flax), PyTorch, and/or TensorFlow.
| [RoFormer](model_doc/roformer) | ✅ | ✅ | ✅ |
| [RT-DETR](model_doc/rt_detr) | ✅ | ❌ | ❌ |
| [RT-DETR-ResNet](model_doc/rt_detr_resnet) | ✅ | ❌ | ❌ |
| [RT-DETRv2](model_doc/rt_detr_v2) | ✅ | ❌ | ❌ |
| [RWKV](model_doc/rwkv) | ✅ | ❌ | ❌ |
| [SAM](model_doc/sam) | ✅ | ✅ | ❌ |
| [SeamlessM4T](model_doc/seamless_m4t) | ✅ | ❌ | ❌ |

View File

@ -57,15 +57,15 @@ More concretely, key-value cache acts as a memory bank for these generative mode
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
>>> past_key_values = DynamicCache()
>>> messages = [{"role": "user", "content": "Hello, what's your name."}]
>>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
>>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
>>> generated_ids = inputs.input_ids
>>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
>>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device)
>>> max_new_tokens = 10
>>> for _ in range(max_new_tokens):
@ -139,7 +139,7 @@ Cache quantization can be detrimental in terms of latency if the context length
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
@ -168,7 +168,7 @@ Use `cache_implementation="offloaded_static"` for an offloaded static cache (see
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
@ -278,7 +278,7 @@ Note that you can use this cache only for models that support sliding window, e.
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
>>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16).to("cuda:0")
>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
>>> # can be used by passing in cache implementation
@ -298,7 +298,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16).to("cuda:0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
>>> # get our cache, specify number of sink tokens and window size
@ -349,7 +349,7 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
>>> user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
>>> past_key_values = DynamicCache()
>>> max_cache_length = past_key_values.get_max_length()
>>> max_cache_length = past_key_values.get_max_cache_shape()
>>> messages = []
>>> for prompt in user_prompts:
@ -377,17 +377,19 @@ Sometimes you would want to first fill-in cache object with key/values for certa
>>> import copy
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
>>> from accelerate.test_utils.testing import get_backend
>>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=DEVICE)
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
>>> # Init StaticCache with big enough max-length (1024 tokens for the below example)
>>> # You can also init a DynamicCache, if that suits you better
>>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)
>>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=DEVICE, dtype=torch.bfloat16)
>>> INITIAL_PROMPT = "You are a helpful assistant. "
>>> inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
>>> inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(DEVICE)
>>> # This is the common prompt cached, we need to run forward without grad to be able to copy
>>> with torch.no_grad():
... prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
@ -395,7 +397,7 @@ Sometimes you would want to first fill-in cache object with key/values for certa
>>> prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
>>> responses = []
>>> for prompt in prompts:
... new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
... new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(DEVICE)
... past_key_values = copy.deepcopy(prompt_cache)
... outputs = model.generate(**new_inputs, past_key_values=past_key_values,max_new_tokens=20)
... response = tokenizer.batch_decode(outputs)[0]

View File

@ -40,6 +40,7 @@ Before you begin, make sure you have all the necessary libraries installed:
```bash
pip install transformers bitsandbytes>=0.39.0 -q
```
Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more.
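If you want to try quantized loading right away, here is a minimal sketch (the checkpoint matches the one used later in this guide; the 4-bit settings are illustrative, not prescriptive):
```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Load the model in 4-bit to fit it on a smaller GPU; adjust the settings to your needs.
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    device_map="auto",
    quantization_config=quantization_config,
)
```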
## Generate text
@ -101,9 +102,11 @@ Next, you need to preprocess your text input with a [tokenizer](tokenizer_summar
```py
>>> from transformers import AutoTokenizer
>>> from accelerate.test_utils.testing import get_backend
>>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(DEVICE)
```
The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While [`~generation.GenerationMixin.generate`] makes a best-effort attempt to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.
@ -122,7 +125,7 @@ Finally, you don't need to do it one sequence at a time! You can batch your inpu
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> model_inputs = tokenizer(
... ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
... ).to("cuda")
... ).to(DEVICE)
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
['A list of colors: red, blue, green, yellow, orange, purple, pink,',
@ -152,7 +155,7 @@ If not specified in the [`~generation.GenerationConfig`] file, `generate` return
```py
>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(DEVICE)
>>> # By default, the output will contain up to 20 tokens
>>> generated_ids = model.generate(**model_inputs)
@ -174,7 +177,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
>>> from transformers import set_seed
>>> set_seed(42)
>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(DEVICE)
>>> # LLM + greedy decoding = repetitive, boring output
>>> generated_ids = model.generate(**model_inputs)
@ -196,7 +199,7 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
>>> # which is shorter, has padding on the right side. Generation fails to capture the logic.
>>> model_inputs = tokenizer(
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
... ).to(DEVICE)
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'
@ -206,7 +209,7 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> model_inputs = tokenizer(
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
... ).to(DEVICE)
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'
@ -223,7 +226,7 @@ Some models and tasks expect a certain input prompt format to work properly. Whe
... )
>>> set_seed(0)
>>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
>>> input_length = model_inputs.input_ids.shape[1]
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
@ -239,7 +242,7 @@ Some models and tasks expect a certain input prompt format to work properly. Whe
... },
... {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
... ]
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
>>> input_length = model_inputs.shape[1]
>>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# DeepSpeed
[DeepSpeed](https://github.com/microsoft/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.
However, if you want to use DeepSpeed without the [`Trainer`], Transformers provides a [`HfDeepSpeedConfig`] class.
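As a rough sketch of that path (the config values here are illustrative assumptions, not defaults), the key point is to create the [`HfDeepSpeedConfig`] *before* calling `from_pretrained` so ZeRO-3 can partition the weights while they are loaded:
```py
import deepspeed
from transformers import AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

# Illustrative ZeRO-3 config; in practice you would fill in the full DeepSpeed config.
ds_config = {
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {"enabled": True},
}

dschf = HfDeepSpeedConfig(ds_config)  # must stay alive while the model is in use
model = AutoModelForCausalLM.from_pretrained("gpt2")
ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
```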

View File

@ -61,6 +61,11 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
[[autodoc]] BlipImageProcessor
- preprocess
## BlipImageProcessorFast
[[autodoc]] BlipImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -251,6 +251,11 @@ The resource should ideally demonstrate something new instead of duplicating an
[[autodoc]] CLIPImageProcessor
- preprocess
## CLIPImageProcessorFast
[[autodoc]] CLIPImageProcessorFast
- preprocess
## CLIPFeatureExtractor
[[autodoc]] CLIPFeatureExtractor

View File

@ -64,6 +64,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] ConvNextImageProcessor
- preprocess
## ConvNextImageProcessorFast
[[autodoc]] ConvNextImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -0,0 +1,119 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# DAB-DETR
## Overview
The DAB-DETR model was proposed in [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329) by Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, Lei Zhang.
DAB-DETR is an enhanced variant of Conditional DETR. It utilizes dynamically updated anchor boxes to provide both a reference query point (x, y) and a reference anchor size (w, h), improving cross-attention computation. This new approach achieves 45.7% AP when trained for 50 epochs with a single ResNet-50 model as the backbone.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dab_detr_convergence_plot.png"
alt="drawing" width="600"/>
The abstract from the paper is the following:
*We present in this paper a novel query formulation using dynamic anchor boxes
for DETR (DEtection TRansformer) and offer a deeper understanding of the role
of queries in DETR. This new formulation directly uses box coordinates as queries
in Transformer decoders and dynamically updates them layer-by-layer. Using box
coordinates not only helps using explicit positional priors to improve the query-to-feature similarity and eliminate the slow training convergence issue in DETR,
but also allows us to modulate the positional attention map using the box width
and height information. Such a design makes it clear that queries in DETR can be
implemented as performing soft ROI pooling layer-by-layer in a cascade manner.
As a result, it leads to the best performance on MS-COCO benchmark among
the DETR-like detection models under the same setting, e.g., AP 45.7% using
ResNet50-DC5 as backbone trained in 50 epochs. We also conducted extensive
experiments to confirm our analysis and verify the effectiveness of our methods.*
This model was contributed by [davidhajdu](https://huggingface.co/davidhajdu).
The original code can be found [here](https://github.com/IDEA-Research/DAB-DETR).
## How to Get Started with the Model
Use the code below to get started with the model.
```python
import torch
import requests
from PIL import Image
from transformers import AutoModelForObjectDetection, AutoImageProcessor
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50")
model = AutoModelForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
outputs = model(**inputs)
results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
for result in results:
for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
score, label = score.item(), label_id.item()
box = [round(i, 2) for i in box.tolist()]
print(f"{model.config.id2label[label]}: {score:.2f} {box}")
```
This should output
```
cat: 0.87 [14.7, 49.39, 320.52, 469.28]
remote: 0.86 [41.08, 72.37, 173.39, 117.2]
cat: 0.86 [344.45, 19.43, 639.85, 367.86]
remote: 0.61 [334.27, 75.93, 367.92, 188.81]
couch: 0.59 [-0.04, 1.34, 639.9, 477.09]
```
There are three other ways to instantiate a DAB-DETR model (depending on what you prefer):
Option 1: Instantiate DAB-DETR with pre-trained weights for entire model
```py
>>> from transformers import DabDetrForObjectDetection
>>> model = DabDetrForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")
```
Option 2: Instantiate DAB-DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
```py
>>> from transformers import DabDetrConfig, DabDetrForObjectDetection
>>> config = DabDetrConfig()
>>> model = DabDetrForObjectDetection(config)
```
Option 3: Instantiate DAB-DETR with randomly initialized weights for backbone + Transformer
```py
>>> config = DabDetrConfig(use_pretrained_backbone=False)
>>> model = DabDetrForObjectDetection(config)
```
## DabDetrConfig
[[autodoc]] DabDetrConfig
## DabDetrModel
[[autodoc]] DabDetrModel
- forward
## DabDetrForObjectDetection
[[autodoc]] DabDetrForObjectDetection
- forward

View File

@ -125,6 +125,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] DeiTImageProcessor
- preprocess
## DeiTImageProcessorFast
[[autodoc]] DeiTImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -0,0 +1,183 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# DepthPro
## Overview
The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun.
DepthPro is a foundation model for zero-shot metric monocular depth estimation, designed to generate high-resolution depth maps with remarkable sharpness and fine-grained details. It employs a multi-scale Vision Transformer (ViT)-based architecture, where images are downsampled, divided into patches, and processed using a shared Dinov2 encoder. The extracted patch-level features are merged, upsampled, and refined using a DPT-like fusion stage, enabling precise depth estimation.
The abstract from the paper is the following:
*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/depth_pro_teaser.png"
alt="drawing" width="600"/>
<small> DepthPro Outputs. Taken from the <a href="https://github.com/apple/ml-depth-pro" target="_blank">official code</a>. </small>
This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro).
## Usage Tips
The DepthPro model processes an input image by first downsampling it at multiple scales and splitting each scaled version into patches. These patches are then encoded using a shared Vision Transformer (ViT)-based Dinov2 patch encoder, while the full image is processed by a separate image encoder. The extracted patch features are merged into feature maps, upsampled, and fused using a DPT-like decoder to generate the final depth estimation. If enabled, an additional Field of View (FOV) encoder processes the image for estimating the camera's field of view, aiding in depth accuracy.
```py
>>> import requests
>>> from PIL import Image
>>> import torch
>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)
>>> inputs = image_processor(images=image, return_tensors="pt").to(device)
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> post_processed_output = image_processor.post_process_depth_estimation(
... outputs, target_sizes=[(image.height, image.width)],
... )
>>> field_of_view = post_processed_output[0]["field_of_view"]
>>> focal_length = post_processed_output[0]["focal_length"]
>>> depth = post_processed_output[0]["predicted_depth"]
>>> depth = (depth - depth.min()) / depth.max()
>>> depth = depth * 255.
>>> depth = depth.detach().cpu().numpy()
>>> depth = Image.fromarray(depth.astype("uint8"))
```
### Architecture and Configuration
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/depth_pro_architecture.png"
alt="drawing" width="600"/>
<small> DepthPro architecture. Taken from the <a href="https://arxiv.org/abs/2410.02073" target="_blank">original paper</a>. </small>
The `DepthProForDepthEstimation` model uses a `DepthProEncoder` for encoding the input image and a `FeatureFusionStage` for fusing the output features from the encoder.
The `DepthProEncoder` further uses two encoders:
- `patch_encoder`
- Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration.
- Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`.
- These patches are processed by the **`patch_encoder`**
- `image_encoder`
- Input image is also rescaled to `patch_size` and processed by the **`image_encoder`**
Both these encoders can be configured via `patch_model_config` and `image_model_config` respectively, both of which are separate `Dinov2Model` instances by default.
Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation.
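As a small sketch of such a custom configuration (the values below are illustrative and close to the defaults, not a recommendation), the scales, overlaps and patch size can be set directly on `DepthProConfig`:
```py
>>> from transformers import DepthProConfig, DepthProForDepthEstimation

>>> # Illustrative values; see DepthProConfig for the actual defaults and constraints
>>> config = DepthProConfig(
...     patch_size=384,
...     scaled_images_ratios=[0.25, 0.5, 1.0],
...     scaled_images_overlap_ratios=[0.0, 0.5, 0.25],
... )
>>> model = DepthProForDepthEstimation(config)
```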
### Field-of-View (FOV) Prediction
The network is supplemented with a focal length estimation head. A small convolutional head ingests frozen features from the depth estimation network and task-specific features from a separate ViT image encoder to predict the horizontal angular field-of-view.
The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model.
The pretrained model at checkpoint `apple/DepthPro-hf` uses the FOV encoder. To use the pretrained model without the FOV encoder, set `use_fov_model=False` when loading the model, which saves computation.
```py
>>> from transformers import DepthProForDepthEstimation
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", use_fov_model=False)
```
To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config.
```py
>>> from transformers import DepthProConfig, DepthProForDepthEstimation
>>> config = DepthProConfig(use_fov_model=True)
>>> model = DepthProForDepthEstimation(config)
```
Or set `use_fov_model=True` when initializing the model, which overrides the value in config.
```py
>>> from transformers import DepthProConfig, DepthProForDepthEstimation
>>> config = DepthProConfig()
>>> model = DepthProForDepthEstimation(config, use_fov_model=True)
```
### Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
```py
import torch
from transformers import DepthProForDepthEstimation
model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", attn_implementation="sdpa", torch_dtype=torch.float16)
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference.
| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) |
|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
| 1 | 7 | 6 | 1.17 |
| 2 | 8 | 6 | 1.33 |
| 4 | 8 | 6 | 1.33 |
| 8 | 8 | 6 | 1.33 |
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro:
- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073)
- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro)
- DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb)
- DepthPro for Super Resolution and Image Segmentation
- Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba)
- Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth)
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## DepthProConfig
[[autodoc]] DepthProConfig
## DepthProImageProcessor
[[autodoc]] DepthProImageProcessor
- preprocess
- post_process_depth_estimation
## DepthProImageProcessorFast
[[autodoc]] DepthProImageProcessorFast
- preprocess
- post_process_depth_estimation
## DepthProModel
[[autodoc]] DepthProModel
- forward
## DepthProForDepthEstimation
[[autodoc]] DepthProForDepthEstimation
- forward

View File

@ -0,0 +1,269 @@
<!--Copyright 2024 StepFun and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# GOT-OCR2
## Overview
The GOT-OCR2 model was proposed in [General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model](https://arxiv.org/abs/2409.01704) by Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang.
The abstract from the paper is the following:
*Traditional OCR systems (OCR-1.0) are increasingly unable to meet people's usage due to the growing demand for intelligent processing of man-made optical characters. In this paper, we collectively refer to all artificial optical signals (e.g., plain texts, math/molecular formulas, tables, charts, sheet music, and even geometric shapes) as "characters" and propose the General OCR Theory along with an excellent model, namely GOT, to promote the arrival of OCR-2.0. The GOT, with 580M parameters, is a unified, elegant, and end-to-end model, consisting of a high-compression encoder and a long-contexts decoder. As an OCR-2.0 model, GOT can handle all the above "characters" under various OCR tasks. On the input side, the model supports commonly used scene- and document-style images in slice and whole-page styles. On the output side, GOT can generate plain or formatted results (markdown/tikz/smiles/kern) via an easy prompt. Besides, the model enjoys interactive OCR features, i.e., region-level recognition guided by coordinates or colors. Furthermore, we also adapt dynamic resolution and multipage OCR technologies to GOT for better practicality. In experiments, we provide sufficient results to prove the superiority of our model.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/got_ocr_overview.png"
alt="drawing" width="600"/>
<small> GOT-OCR2 training stages. Taken from the <a href="https://arxiv.org/abs/2409.01704">original paper.</a> </small>
Tips:
GOT-OCR2 works on a wide range of tasks, including plain document OCR, scene text OCR, formatted document OCR, and even OCR for tables, charts, mathematical formulas, geometric shapes, molecular formulas and sheet music. While this implementation of the model will only output plain text, the outputs can be further processed to render the desired format, with packages like `pdftex`, `mathpix`, `matplotlib`, `tikz`, `verovio` or `pyecharts`.
The model can also be used for interactive OCR, where the user can specify the region to be recognized by providing the coordinates or the color of the region's bounding box.
This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan).
The original code can be found [here](https://github.com/Ucas-HaoranWei/GOT-OCR2.0).
## Usage example
### Plain text inference
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
>>> inputs = processor(image, return_tensors="pt").to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
"R&D QUALITY IMPROVEMENT\nSUGGESTION/SOLUTION FORM\nName/Phone Ext. : (...)"
```
### Plain text inference batched
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
>>> inputs = processor([image1, image2], return_tensors="pt").to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4,
... )
>>> processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
["Reducing the number", "R&D QUALITY"]
```
### Formatted text inference
GOT-OCR2 can also generate formatted text, such as markdown or LaTeX. Here is an example of how to generate formatted text:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/latex.png"
>>> inputs = processor(image, return_tensors="pt", format=True).to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
"\\author{\nHanwen Jiang* \\(\\quad\\) Arjun Karpur \\({ }^{\\dagger} \\quad\\) Bingyi Cao \\({ }^{\\dagger} \\quad\\) (...)"
```
### Inference on multiple pages
Although it might be reasonable in most cases to use a “for loop” for multi-page processing, some text data with formatting across several pages makes it necessary to process all pages at once. GOT introduces a multi-page OCR (without “for loop”) feature, where multiple pages can be processed by the model at once, with the output being one continuous text.
Here is an example of how to process multiple pages at once:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page1.png"
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page2.png"
>>> inputs = processor([image1, image2], return_tensors="pt", multi_page=True, format=True).to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
"\\title{\nGeneral OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model\n}\n\\author{\nHaoran Wei (...)"
```
### Inference on cropped patches
GOT supports a 1024×1024 input resolution, which is sufficient for most OCR tasks, such as scene OCR or processing A4-sized PDF pages. However, certain scenarios, like horizontally stitched two-page PDFs commonly found in academic papers or images with unusual aspect ratios, can lead to accuracy issues when processed as a single image. To address this, GOT can dynamically crop an image into patches, process them all at once, and merge the results for better accuracy with such inputs.
Here is an example of how to process cropped patches:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", torch_dtype=torch.bfloat16, device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
>>> inputs = processor(image, return_tensors="pt", format=True, crop_to_patches=True, max_patches=3).to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
"on developing architectural improvements to make learnable matching methods generalize.\nMotivated by the above observations, (...)"
```
### Inference on a specific region
GOT supports interactive OCR, where the user can specify the region to be recognized by providing the coordinates or the color of the region's bounding box. Here is an example of how to process a specific region:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
>>> inputs = processor(image, return_tensors="pt", color="green").to(device) # or box=[x1, y1, x2, y2] for coordinates (image pixels)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
"You should keep in mind what features from the module should be used, especially \nwhen youre planning to sell a template."
```
### Inference on general OCR data example: sheet music
Although this implementation of the model will only output plain text, the outputs can be further processed to render the desired format, with packages like `pdftex`, `mathpix`, `matplotlib`, `tikz`, `verovio` or `pyecharts`.
Here is an example of how to process sheet music:
```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> import verovio
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/sheet_music.png"
>>> inputs = processor(image, return_tensors="pt", format=True).to(device)
>>> generate_ids = model.generate(
... **inputs,
... do_sample=False,
... tokenizer=processor.tokenizer,
... stop_strings="<|im_end|>",
... max_new_tokens=4096,
... )
>>> outputs = processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
>>> tk = verovio.toolkit()
>>> tk.loadData(outputs)
>>> tk.setOptions(
... {
... "pageWidth": 2100,
... "pageHeight": 800,
... "footer": "none",
... "barLineWidth": 0.5,
... "beamMaxSlope": 15,
... "staffLineWidth": 0.2,
... "spacingStaff": 6,
... }
... )
>>> tk.getPageCount()
>>> svg = tk.renderToSVG()
>>> svg = svg.replace('overflow="inherit"', 'overflow="visible"')
>>> with open("output.svg", "w") as f:
...     f.write(svg)
```
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sheet_music.svg"
alt="drawing" width="600"/>
## GotOcr2Config
[[autodoc]] GotOcr2Config
## GotOcr2VisionConfig
[[autodoc]] GotOcr2VisionConfig
## GotOcr2ImageProcessor
[[autodoc]] GotOcr2ImageProcessor
## GotOcr2Processor
[[autodoc]] GotOcr2Processor
## GotOcr2ForConditionalGeneration
[[autodoc]] GotOcr2ForConditionalGeneration
- forward

View File

@ -31,13 +31,8 @@ Tips:
Sample inference:
```python
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
from PIL import Image
import requests
# Note: These docs were written prior to the public model release,
# and this path is subject to change.
# Please see https://huggingface.co/ibm-granite for the current model list.
model_path = "ibm-granite/granite-3.1-2b-instruct-vision"
model_path = "ibm-granite/granite-vision-3.1-2b-preview"
processor = LlavaNextProcessor.from_pretrained(model_path)
model = LlavaNextForConditionalGeneration.from_pretrained(model_path).to("cuda")

View File

@ -195,6 +195,11 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
[[autodoc]] LlavaImageProcessor
- preprocess
## LlavaImageProcessorFast
[[autodoc]] LlavaImageProcessorFast
- preprocess
## LlavaProcessor
[[autodoc]] LlavaProcessor
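The fast classes documented above are drop-in replacements for the existing image processors. A minimal sketch of opting in through `AutoImageProcessor`; the LLaVA checkpoint and the COCO image URL are illustrative, and `use_fast=True` only resolves to the torchvision-backed class when torchvision is installed:
```python
# A minimal sketch; checkpoint and image URL are illustrative.
import requests
from PIL import Image
from transformers import AutoImageProcessor

# use_fast=True selects LlavaImageProcessorFast when torchvision is available
processor = AutoImageProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf", use_fast=True)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(images=image, return_tensors="pt")
print(inputs["pixel_values"].shape)
```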

View File

@ -288,6 +288,11 @@ model = AutoModelForImageTextToText.from_pretrained(
[[autodoc]] LlavaNextImageProcessor
- preprocess
## LlavaNextImageProcessorFast
[[autodoc]] LlavaNextImageProcessorFast
- preprocess
## LlavaNextProcessor
[[autodoc]] LlavaNextProcessor

View File

@ -100,8 +100,8 @@ import torch
from PIL import Image
import requests
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
model.to("cuda:0")
# prepare image and text prompt, using the appropriate prompt template
@ -298,8 +298,8 @@ First make sure to install flash-attn. Refer to the [original repository of Flas
from transformers import LlavaOnevisionForConditionalGeneration
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.float16,
model_id,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
use_flash_attention_2=True
).to(0)
@ -318,6 +318,11 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
[[autodoc]] LlavaOnevisionImageProcessor
## LlavaOnevisionImageProcessorFast
[[autodoc]] LlavaOnevisionImageProcessorFast
- preprocess
## LlavaOnevisionVideoProcessor
[[autodoc]] LlavaOnevisionVideoProcessor

View File

@ -0,0 +1,97 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# RT-DETRv2
## Overview
The RT-DETRv2 model was proposed in [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140) by Wenyu Lv, Yian Zhao, Qinyao Chang, Kui Huang, Guanzhong Wang, Yi Liu.
RT-DETRv2 refines RT-DETR by introducing selective multi-scale feature extraction, a discrete sampling operator for broader deployment compatibility, and improved training strategies like dynamic data augmentation and scale-adaptive hyperparameters. These changes enhance flexibility and practicality while maintaining real-time performance.
The abstract from the paper is the following:
*In this report, we present RT-DETRv2, an improved Real-Time DEtection TRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art real-time detector, RT-DETR, and opens up a set of bag-of-freebies for flexibility and practicality, as well as optimizing the training strategy to achieve enhanced performance. To improve the flexibility, we suggest setting a distinct number of sampling points for features at different scales in the deformable attention to achieve selective multi-scale feature extraction by the decoder. To enhance practicality, we propose an optional discrete sampling operator to replace the grid_sample operator that is specific to RT-DETR compared to YOLOs. This removes the deployment constraints typically associated with DETRs. For the training strategy, we propose dynamic data augmentation and scale-adaptive hyperparameters customization to improve performance without loss of speed.*
This model was contributed by [jadechoghari](https://huggingface.co/jadechoghari).
The original code can be found [here](https://github.com/lyuwenyu/RT-DETR).
## Usage tips
This second version of RT-DETR improves how the decoder finds objects in an image.
- **better sampling** adjusts the deformable-attention offsets so the decoder samples the most relevant areas
- **flexible attention** can use either smooth (bilinear) or fixed (discrete) sampling
- **optimized processing** improves how the attention weights combine the sampled information
```py
>>> import torch
>>> import requests
>>> from PIL import Image
>>> from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_v2_r18vd")
>>> model = RTDetrV2ForObjectDetection.from_pretrained("PekingU/rtdetr_v2_r18vd")
>>> inputs = image_processor(images=image, return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.5)
>>> for result in results:
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
... score, label = score.item(), label_id.item()
... box = [round(i, 2) for i in box.tolist()]
... print(f"{model.config.id2label[label]}: {score:.2f} {box}")
cat: 0.97 [341.14, 25.11, 639.98, 372.89]
cat: 0.96 [12.78, 56.35, 317.67, 471.34]
remote: 0.95 [39.96, 73.12, 175.65, 117.44]
sofa: 0.86 [-0.11, 2.97, 639.89, 473.62]
sofa: 0.82 [-0.12, 1.78, 639.87, 473.52]
remote: 0.79 [333.65, 76.38, 370.69, 187.48]
```
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RT-DETRv2.
<PipelineTag pipeline="object-detection"/>
- Scripts for finetuning [`RTDetrV2ForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection).
- Notebooks for [inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_inference.ipynb) and [fine-tuning](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_finetune_on_a_custom_dataset.ipynb) RT-DETRv2 on a custom dataset (🌎).
## RTDetrV2Config
[[autodoc]] RTDetrV2Config
## RTDetrV2Model
[[autodoc]] RTDetrV2Model
- forward
## RTDetrV2ForObjectDetection
[[autodoc]] RTDetrV2ForObjectDetection
- forward

View File

@ -214,6 +214,11 @@ Below is an expected speedup diagram that compares inference time between the na
[[autodoc]] SiglipImageProcessor
- preprocess
## SiglipImageProcessorFast
[[autodoc]] SiglipImageProcessorFast
- preprocess
## SiglipProcessor
[[autodoc]] SiglipProcessor

View File

@ -52,6 +52,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
* [GotOcr2](https://huggingface.co/docs/transformers/model_doc/got_ocr2#transformers.GotOcr2ForConditionalGeneration)
* [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
* [GPTNeo](https://huggingface.co/docs/transformers/model_doc/gpt_neo#transformers.GPTNeoModel)
@ -243,6 +244,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel)
* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel)
* [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
* [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
@ -253,6 +255,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
* [GotOcr2](https://huggingface.co/docs/transformers/model_doc/got_ocr2#transformers.GotOcr2ForConditionalGeneration)
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
* [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
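Since GotOcr2 is now listed for both backends, the attention implementation can be picked at load time with the standard `attn_implementation` argument. A minimal sketch (assumes a CUDA device; `"flash_attention_2"` additionally requires the `flash-attn` package):
```py
# A minimal sketch; assumes a CUDA device and, for "flash_attention_2", that flash-attn is installed.
import torch
from transformers import AutoModelForImageTextToText

model = AutoModelForImageTextToText.from_pretrained(
    "stepfun-ai/GOT-OCR-2.0-hf",
    torch_dtype=torch.float16,
    attn_implementation="flash_attention_2",  # or "sdpa" for PyTorch scaled_dot_product_attention
    device_map="cuda",
)
```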
@ -355,7 +358,7 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
outputs = model.generate(**inputs)
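For context, the hunk above belongs to a larger snippet; a self-contained sketch of the same pattern (assumes a CUDA device and PyTorch >= 2.3, which provides `torch.nn.attention.sdpa_kernel`):
```py
# A minimal sketch; assumes a CUDA device and PyTorch >= 2.3.
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")

inputs = tokenizer("Hello my dog is cute and", return_tensors="pt").to(model.device)

# Restrict scaled_dot_product_attention to the FlashAttention backend for this block only.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```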
@ -429,14 +432,14 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d
```py
from transformers import AutoModelForCausalLM
model_name = "bigscience/bloom-2b5"
model_name = "bigscience/bloom-1b7"
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True)
```
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:
```py
max_memory_mapping = {0: "600MB", 1: "1GB"}
max_memory_mapping = {0: "2GB", 1: "5GB"}
model_name = "bigscience/bloom-3b"
model_4bit = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
@ -456,7 +459,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d
```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
model_name = "bigscience/bloom-2b5"
model_name = "bigscience/bloom-1b7"
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
```
@ -465,20 +468,20 @@ If you're loading a model in 8-bit for text generation, you should use the [`~tr
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
model_name = "bigscience/bloom-2b5"
model_name = "bigscience/bloom-1b7"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
prompt = "Hello, my llama is cute"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**inputs)
inputs = tokenizer(prompt, return_tensors="pt").to(model_8bit.device)
generated_ids = model_8bit.generate(**inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```
To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:
```py
max_memory_mapping = {0: "1GB", 1: "2GB"}
max_memory_mapping = {0: "2GB", 1: "5GB"}
model_name = "bigscience/bloom-3b"
model_8bit = AutoModelForCausalLM.from_pretrained(
model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
@ -543,11 +546,8 @@ quantization_config = BitsAndBytesConfig(
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config)
# enable BetterTransformer
model = model.to_bettertransformer()
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
# enable FlashAttention
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):

View File

@ -476,7 +476,7 @@ And GPU1 does the same by enlisting GPU3 to its aid.
Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
Implementations:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)
@ -497,7 +497,7 @@ This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter
Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
Implementations:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)

View File

@ -298,8 +298,7 @@ from transformers.trainer_pt_utils import get_parameter_names
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if n in decay_parameters],

View File

@ -208,7 +208,8 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
model_id = "bigscience/bloom-1b7"
quantization_config = BitsAndBytesConfig(
llm_int8_threshold=10,
llm_int8_threshold=10.0,
llm_int8_enable_fp32_cpu_offload=True
)
model_8bit = AutoModelForCausalLM.from_pretrained(
@ -285,7 +286,7 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa
### Nested quantization
Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
```py
from transformers import BitsAndBytesConfig
@ -295,7 +296,7 @@ double_quant_config = BitsAndBytesConfig(
bnb_4bit_use_double_quant=True,
)
model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", torch_dtype="auto", quantization_config=double_quant_config)
model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", torch_dtype="auto", quantization_config=double_quant_config)
```
## Dequantizing `bitsandbytes` models
@ -307,7 +308,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
model_id = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_id, BitsAndBytesConfig(load_in_4bit=True))
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_4bit=True))
tokenizer = AutoTokenizer.from_pretrained(model_id)
model.dequantize()

View File

@ -61,7 +61,7 @@ ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-In
# Measure memory usage
mem_params = sum([param.nelement()*param.element_size() for param in ct_model.parameters()])
print(f"{mem/2**30:.4f} GB")
print(f"{mem_params/2**30:.4f} GB")
# 8.4575 GB
```

View File

@ -130,7 +130,7 @@ Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmati
>>> from optimum.onnxruntime import ORTModelForSequenceClassification
>>> from transformers import AutoTokenizer
>>> model_checkpoint = "distilbert_base_uncased_squad"
>>> model_checkpoint = "distilbert/distilbert-base-uncased-distilled-squad"
>>> save_directory = "onnx/"
>>> # Load a model from transformers and export it to ONNX
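The snippet is cut off by the hunk; a hedged sketch of how the export typically finishes with Optimum's `export=True` flag (verify against the current Optimum documentation):
```python
# A minimal sketch; assumes the optimum[onnxruntime] extra is installed.
from optimum.onnxruntime import ORTModelForSequenceClassification
from transformers import AutoTokenizer

model_checkpoint = "distilbert/distilbert-base-uncased-distilled-squad"
save_directory = "onnx/"

# export=True converts the checkpoint to ONNX on the fly
ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

ort_model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)
```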

View File

@ -130,7 +130,7 @@ from torch import nn
from transformers import Trainer
class CustomTrainer(Trainer):
def compute_loss(self, model, inputs, return_outputs=False):
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
labels = inputs.pop("labels")
# forward pass
outputs = model(**inputs)
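Only the start of the override is visible in the hunk; a minimal sketch of a complete version with the updated signature (the 3-class weighting is an illustrative assumption, not part of the diff):
```py
# A minimal sketch; the class weights and the 3-label setup are illustrative assumptions.
import torch
from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute a custom, class-weighted loss
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
```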
@ -156,9 +156,7 @@ class EarlyStoppingCallback(TrainerCallback):
def on_step_end(self, args, state, control, **kwargs):
if state.global_step >= self.num_steps:
return {"should_training_stop": True}
else:
return {}
control.should_training_stop = True
```
Then pass it to the [`Trainer`]'s `callbacks` parameter.
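For reference, a minimal version of the callback from the hunk above, wired in through `callbacks` (the model, arguments, and dataset are placeholders):
```py
# A minimal sketch; model, training_args, and train_dataset are placeholders.
from transformers import Trainer, TrainerCallback

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, num_steps=10):
        self.num_steps = num_steps

    def on_step_end(self, args, state, control, **kwargs):
        # Setting the flag on the shared `control` object is how a callback tells the Trainer to stop.
        if state.global_step >= self.num_steps:
            control.should_training_stop = True

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[EarlyStoppingCallback(num_steps=100)],
)
```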
@ -737,7 +735,7 @@ accelerate launch --num_processes=2 \
--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
--fsdp_sharding_strategy=1 \
--fsdp_state_dict_type=FULL_STATE_DICT \
./examples/pytorch/text-classification/run_glue.py
./examples/pytorch/text-classification/run_glue.py \
--model_name_or_path google-bert/bert-base-cased \
--task_name $TASK_NAME \
--do_train \

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# DeepSpeed Integration
[DeepSpeed](https://github.com/microsoft/DeepSpeed) は、[ZeRO 論文](https://arxiv.org/abs/1910.02054) で説明されているすべてを実装します。現在、次のものを完全にサポートしています。
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed) は、[ZeRO 論文](https://arxiv.org/abs/1910.02054) で説明されているすべてを実装します。現在、次のものを完全にサポートしています。
1. オプティマイザーの状態分割 (ZeRO ステージ 1)
2. 勾配分割 (ZeRO ステージ 2)
@ -32,7 +32,7 @@ DeepSpeed ZeRO-2 は、その機能が推論には役に立たないため、主
DeepSpeed ZeRO-3 は、巨大なモデルを複数の GPU にロードできるため、推論にも使用できます。
単一の GPU では不可能です。
🤗 Transformers は、2 つのオプションを介して [DeepSpeed](https://github.com/microsoft/DeepSpeed) を統合します。
🤗 Transformers は、2 つのオプションを介して [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) を統合します。
1. [`Trainer`] によるコア DeepSpeed 機能の統合。何でもやってくれるタイプです
統合の場合 - カスタム構成ファイルを指定するか、テンプレートを使用するだけで、他に何もする必要はありません。たいていの
@ -78,7 +78,7 @@ pip install deepspeed
pip install transformers[deepspeed]
```
または、[DeepSpeed の GitHub ページ](https://github.com/microsoft/deepspeed#installation) で詳細を確認してください。
または、[DeepSpeed の GitHub ページ](https://github.com/deepspeedai/DeepSpeed#installation) で詳細を確認してください。
[高度なインストール](https://www.deepspeed.ai/tutorials/advanced-install/)。
それでもビルドに苦労する場合は、まず [CUDA 拡張機能のインストール ノート](trainer#cuda-extension-installation-notes) を必ず読んでください。
@ -89,7 +89,7 @@ pip install transformers[deepspeed]
DeepSpeed のローカル ビルドを作成するには:
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
@ -113,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa
複数のマシンで同じセットアップを使用する必要がある場合は、バイナリ ホイールを作成します。
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
@ -154,7 +154,7 @@ _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24
目的のアーチを明示的に指定することをお勧めします。
提案されたことをすべて試してもまだビルドの問題が発生する場合は、GitHub の問題に進んでください。
[ディープスピード](https://github.com/microsoft/DeepSpeed/issues)、
[ディープスピード](https://github.com/deepspeedai/DeepSpeed/issues)、
<a id='deepspeed-multi-gpu'></a>
@ -481,11 +481,11 @@ deepspeed examples/pytorch/translation/run_translation.py ...
設定ファイルで使用できる DeepSpeed 設定オプションの完全なガイドについては、次を参照してください。
[次のドキュメント](https://www.deepspeed.ai/docs/config-json/) にアクセスしてください。
さまざまな実際のニーズに対応する数十の DeepSpeed 構成例を [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)で見つけることができます。
さまざまな実際のニーズに対応する数十の DeepSpeed 構成例を [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)で見つけることができます。
リポジトリ:
```bash
git clone https://github.com/microsoft/DeepSpeedExamples
git clone https://github.com/deepspeedai/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
```
@ -497,7 +497,7 @@ find . -name '*json'
grep -i Lamb $(find . -name '*json')
```
さらにいくつかの例が [メイン リポジトリ](https://github.com/microsoft/DeepSpeed) にもあります。
さらにいくつかの例が [メイン リポジトリ](https://github.com/deepspeedai/DeepSpeed) にもあります。
DeepSpeed を使用する場合は、常に DeepSpeed 構成ファイルを指定する必要がありますが、一部の構成パラメータには
コマンドライン経由で設定します。微妙な違いについては、このガイドの残りの部分で説明します。
@ -868,7 +868,7 @@ ZeRO-Infinity は、GPU と CPU メモリを NVMe メモリで拡張すること
書き込みでは、読み取り最大 3.5 GB/秒、書き込み最大 3 GB/秒のピーク速度が得られます)。
最適な`aio`構成ブロックを見つけるには、ターゲット設定でベンチマークを実行する必要があります。
[ここで説明](https://github.com/microsoft/DeepSpeed/issues/998)。
[ここで説明](https://github.com/deepspeedai/DeepSpeed/issues/998)。
<a id='deepspeed-zero2-zero3-performance'></a>
@ -1934,7 +1934,7 @@ SW: Model with 2783M total params, 65M largest layer params.
問題が解決しない場合にのみ、Deepspeed について言及し、必要な詳細をすべて提供してください。
- 問題が統合部分ではなく DeepSpeed コアにあることが明らかな場合は、問題を提出してください。
[Deepspeed](https://github.com/microsoft/DeepSpeed/) を直接使用します。よくわからない場合でも、ご安心ください。
[Deepspeed](https://github.com/deepspeedai/DeepSpeed/) を直接使用します。よくわからない場合でも、ご安心ください。
どちらの問題トラッカーでも問題ありません。投稿されたらそれを判断し、次の場合は別の問題トラッカーにリダイレクトします。
そうである必要がある。
@ -1994,7 +1994,7 @@ SW: Model with 2783M total params, 65M largest layer params.
### Notes
- DeepSpeed には pip でインストール可能な PyPI パッケージがありますが、ハードウェアに最も適合するように、また有効にする必要がある場合は、[ソース](https://github.com/microsoft/deepspeed#installation) からインストールすることを強くお勧めします。
- DeepSpeed には pip でインストール可能な PyPI パッケージがありますが、ハードウェアに最も適合するように、また有効にする必要がある場合は、[ソース](https://github.com/deepspeedai/DeepSpeed#installation) からインストールすることを強くお勧めします。
1 ビット Adam などの特定の機能は、pypi ディストリビューションでは利用できません。
- 🤗 Transformers で DeepSpeed を使用するために [`Trainer`] を使用する必要はありません - 任意のモデルを使用できます
後者は [DeepSpeed 統合手順](https://www.deepspeed.ai/getting-started/#writing-deepspeed-models) に従って調整する必要があります。
@ -2239,7 +2239,7 @@ RUN_SLOW=1 pytest tests/deepspeed
## Main DeepSpeed Resources
- [プロジェクトの github](https://github.com/microsoft/deepspeed)
- [プロジェクトの github](https://github.com/deepspeedai/DeepSpeed)
- [使用方法ドキュメント](https://www.deepspeed.ai/getting-started/)
- [API ドキュメント](https://deepspeed.readthedocs.io/en/latest/index.html)
- [ブログ投稿](https://www.microsoft.com/en-us/research/search/?q=deepspeed)
@ -2251,4 +2251,4 @@ RUN_SLOW=1 pytest tests/deepspeed
- [ZeRO-Infinity: 極限スケールの深層学習のための GPU メモリの壁を打ち破る](https://arxiv.org/abs/2104.07857)
最後に、HuggingFace [`Trainer`] は DeepSpeed のみを統合していることを覚えておいてください。
DeepSpeed の使用に関して問題や質問がある場合は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues) に問題を提出してください。
DeepSpeed の使用に関して問題や質問がある場合は、[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/issues) に問題を提出してください。

View File

@ -199,7 +199,7 @@ _python_、_numpy_、および _pytorch_ の RNG 状態は、そのチェック
torchrun --nproc_per_node=2 trainer-program.py ...
```
[`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/microsoft/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ:
[`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/deepspeedai/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ:
```bash
accelerate launch --num_processes 2 trainer-program.py ...
@ -291,7 +291,7 @@ export CUDA_VISIBLE_DEVICES=1,0
[`Trainer`] は、トレーニングを劇的に改善する可能性のあるライブラリをサポートするように拡張されました。
時間とはるかに大きなモデルに適合します。
現在、サードパーティのソリューション [DeepSpeed](https://github.com/microsoft/DeepSpeed) および [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) をサポートしています。論文 [ZeRO: メモリの最適化兆パラメータ モデルのトレーニングに向けて、Samyam Rajbhandari、Jeff Rasley、Olatunji Ruwase、Yuxiong He 著](https://arxiv.org/abs/1910.02054)。
現在、サードパーティのソリューション [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) および [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) をサポートしています。論文 [ZeRO: メモリの最適化兆パラメータ モデルのトレーニングに向けて、Samyam Rajbhandari、Jeff Rasley、Olatunji Ruwase、Yuxiong He 著](https://arxiv.org/abs/1910.02054)。
この提供されるサポートは、この記事の執筆時点では新しくて実験的なものです。 DeepSpeed と PyTorch FSDP のサポートはアクティブであり、それに関する問題は歓迎しますが、FairScale 統合は PyTorch メインに統合されているため、もうサポートしていません ([PyTorch FSDP 統合](#pytorch-fully-sharded-data-parallel))
@ -301,7 +301,7 @@ export CUDA_VISIBLE_DEVICES=1,0
この記事の執筆時点では、Deepspeed を使用するには、CUDA C++ コードをコンパイルする必要があります。
すべてのインストールの問題は、[Deepspeed](https://github.com/microsoft/DeepSpeed/issues) の対応する GitHub の問題を通じて対処する必要がありますが、ビルド中に発生する可能性のある一般的な問題がいくつかあります。
すべてのインストールの問題は、[Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues) の対応する GitHub の問題を通じて対処する必要がありますが、ビルド中に発生する可能性のある一般的な問題がいくつかあります。
CUDA 拡張機能を構築する必要がある PyTorch 拡張機能。
したがって、次の操作を実行中に CUDA 関連のビルドの問題が発生した場合は、次のとおりです。

View File

@ -61,6 +61,11 @@ BLIP は、次のようなさまざまなマルチモーダル タスクを実
[[autodoc]] BlipImageProcessor
- preprocess
## BlipImageProcessorFast
[[autodoc]] BlipImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -133,6 +133,11 @@ CLIP を使い始めるのに役立つ公式 Hugging Face およびコミュニ
[[autodoc]] CLIPImageProcessor
- preprocess
## CLIPImageProcessorFast
[[autodoc]] CLIPImageProcessorFast
- preprocess
## CLIPFeatureExtractor
[[autodoc]] CLIPFeatureExtractor

View File

@ -64,6 +64,11 @@ ConvNeXT の使用を開始するのに役立つ公式 Hugging Face およびコ
[[autodoc]] ConvNextImageProcessor
- preprocess
## ConvNextImageProcessorFast
[[autodoc]] ConvNextImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -98,6 +98,11 @@ DeiT を始めるのに役立つ公式 Hugging Face およびコミュニティ
[[autodoc]] DeiTImageProcessor
- preprocess
## DeiTImageProcessorFast
[[autodoc]] DeiTImageProcessorFast
- preprocess
<frameworkcontent>
<pt>

View File

@ -360,7 +360,7 @@ by [@anton-l](https://github.com/anton-l)。
SageMakerは、より効率的な処理のためにTPとDPを組み合わせて使用します。
代替名:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)はこれを「テンソルスライシング」と呼びます。詳細は[DeepSpeedの特徴](https://www.deepspeed.ai/training/#model-parallelism)をご覧ください。
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)はこれを「テンソルスライシング」と呼びます。詳細は[DeepSpeedの特徴](https://www.deepspeed.ai/training/#model-parallelism)をご覧ください。
実装例:
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)には、モデル固有の内部実装があります。
@ -384,7 +384,7 @@ DeepSpeedの[パイプラインチュートリアル](https://www.deepspeed.ai/t
各次元には少なくとも2つのGPUが必要ですので、ここでは少なくとも4つのGPUが必要です。
実装例:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)
@ -403,7 +403,7 @@ DeepSpeedの[パイプラインチュートリアル](https://www.deepspeed.ai/t
各次元には少なくとも2つのGPUが必要ですので、ここでは少なくとも8つのGPUが必要です。
実装例:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeedには、さらに効率的なDPであるZeRO-DPと呼ばれるものも含まれています。
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeedには、さらに効率的なDPであるZeRO-DPと呼ばれるものも含まれています。
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)

View File

@ -237,8 +237,7 @@ from transformers.trainer_pt_utils import get_parameter_names
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if n in decay_parameters],

View File

@ -28,7 +28,7 @@ GPU가 제한된 환경에서 ZeRO는 최적화 메모리와 계산을 GPU에서
## 설치[[installation]]
DeepSpeed는 PyPI 또는 Transformers에서 설치할 수 있습니다(자세한 설치 옵션은 DeepSpeed [설치 상세사항](https://www.deepspeed.ai/tutorials/advanced-install/) 또는 GitHub [README](https://github.com/microsoft/deepspeed#installation)를 참조하세요).
DeepSpeed는 PyPI 또는 Transformers에서 설치할 수 있습니다(자세한 설치 옵션은 DeepSpeed [설치 상세사항](https://www.deepspeed.ai/tutorials/advanced-install/) 또는 GitHub [README](https://github.com/deepspeedai/DeepSpeed#installation)를 참조하세요).
<Tip>
@ -114,10 +114,10 @@ DeepSpeed는 트레이닝 실행 방법을 구성하는 모든 매개변수가
<Tip>
DeepSpeed 구성 옵션의 전체 목록은 [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/)에서 확인할 수 있습니다. 또한 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) 리포지토리 또는 기본 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 리포지토리에서 다양한 DeepSpeed 구성 예제에 대한 보다 실용적인 예제를 찾을 수 있습니다. 구체적인 예제를 빠르게 찾으려면 다음과 같이 하세요:
DeepSpeed 구성 옵션의 전체 목록은 [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/)에서 확인할 수 있습니다. 또한 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) 리포지토리 또는 기본 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 리포지토리에서 다양한 DeepSpeed 구성 예제에 대한 보다 실용적인 예제를 찾을 수 있습니다. 구체적인 예제를 빠르게 찾으려면 다음과 같이 하세요:
```bash
git clone https://github.com/microsoft/DeepSpeedExamples
git clone https://github.com/deepspeedai/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
# Lamb 옵티마이저 샘플 찾기
@ -303,7 +303,7 @@ ZeRO-3로 대규모 모델을 초기화하고 매개변수에 액세스하는
[ZeRO-Infinity](https://hf.co/papers/2104.07857)를 사용하면 모델 상태를 CPU 및/또는 NVMe로 오프로드하여 더 많은 메모리를 절약할 수 있습니다. 스마트 파티셔닝 및 타일링 알고리즘을 통해 각 GPU는 오프로딩 중에 매우 적은 양의 데이터를 주고받을 수 있으므로 최신 NVMe는 훈련 프로세스에 사용할 수 있는 것보다 훨씬 더 큰 총 메모리 풀에 맞출 수 있습니다. ZeRO-Infinity에는 ZeRO-3가 필요합니다.
사용 가능한 CPU 및/또는 NVMe 메모리에 따라 [옵티마이저](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading)와 [매개변수](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) 중 하나만 오프로드하거나 아무것도 오프로드하지 않을 수 있습니다. 또한 일반 하드 드라이브나 솔리드 스테이트 드라이브에서도 작동하지만 속도가 현저히 느려지므로 `nvme_path`가 NVMe 장치를 가리키고 있는지 확인해야 합니다. 최신 NVMe를 사용하면 읽기 작업의 경우 최대 3.5GB/s, 쓰기 작업의 경우 최대 3GB/s의 전송 속도를 기대할 수 있습니다. 마지막으로, 트레이닝 설정에서 [벤치마크 실행하기](https://github.com/microsoft/DeepSpeed/issues/998)을 통해 최적의 'aio' 구성을 결정합니다.
사용 가능한 CPU 및/또는 NVMe 메모리에 따라 [옵티마이저](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading)와 [매개변수](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) 중 하나만 오프로드하거나 아무것도 오프로드하지 않을 수 있습니다. 또한 일반 하드 드라이브나 솔리드 스테이트 드라이브에서도 작동하지만 속도가 현저히 느려지므로 `nvme_path`가 NVMe 장치를 가리키고 있는지 확인해야 합니다. 최신 NVMe를 사용하면 읽기 작업의 경우 최대 3.5GB/s, 쓰기 작업의 경우 최대 3GB/s의 전송 속도를 기대할 수 있습니다. 마지막으로, 트레이닝 설정에서 [벤치마크 실행하기](https://github.com/deepspeedai/DeepSpeed/issues/998)을 통해 최적의 'aio' 구성을 결정합니다.
아래 예제 ZeRO-3/Infinity 구성 파일은 대부분의 매개변수 값을 `auto`으로 설정하고 있지만, 수동으로 값을 추가할 수도 있습니다.
@ -1141,7 +1141,7 @@ rank1:
## 트러블슈팅[[troubleshoot]]
문제가 발생하면 DeepSpeed가 문제의 원인이 아닌 경우가 많으므로(아주 명백하고 예외적으로 DeepSpeed 모듈을 볼 수 있는 경우가 아니라면) DeepSpeed가 문제의 원인인지 고려해야 합니다! 첫 번째 단계는 DeepSpeed 없이 설정을 다시 시도하고 문제가 지속되면 문제를 신고하는 것입니다. 문제가 핵심적인 DeepSpeed 문제이고 transformers와 관련이 없는 경우, [DeepSpeed 리포지토리](https://github.com/microsoft/DeepSpeed)에서 이슈를 개설하세요.
문제가 발생하면 DeepSpeed가 문제의 원인이 아닌 경우가 많으므로(아주 명백하고 예외적으로 DeepSpeed 모듈을 볼 수 있는 경우가 아니라면) DeepSpeed가 문제의 원인인지 고려해야 합니다! 첫 번째 단계는 DeepSpeed 없이 설정을 다시 시도하고 문제가 지속되면 문제를 신고하는 것입니다. 문제가 핵심적인 DeepSpeed 문제이고 transformers와 관련이 없는 경우, [DeepSpeed 리포지토리](https://github.com/deepspeedai/DeepSpeed)에서 이슈를 개설하세요.
transformers와 관련된 이슈를 개설할 때에는 다음 정보를 제공해 주세요:
@ -1211,7 +1211,7 @@ NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(
## 리소스[[resources]]
DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/microsoft/deepspeed)를 참조하세요.
DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/deepspeedai/DeepSpeed)를 참조하세요.
다음 문서도 ZeRO에 대해 자세히 알아볼 수 있는 훌륭한 자료입니다:

View File

@ -386,7 +386,7 @@ DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/)에
각 차원마다 적어도 2개의 GPU가 필요하므로 최소한 4개의 GPU가 필요합니다.
구현:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)
@ -405,7 +405,7 @@ DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/)에
각 차원마다 적어도 2개의 GPU가 필요하므로 최소한 8개의 GPU가 필요합니다.
구현:
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed는 더욱 효율적인 DP인 ZeRO-DP라고도 부릅니다.
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeed는 더욱 효율적인 DP인 ZeRO-DP라고도 부릅니다.
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
- [Varuna](https://github.com/microsoft/varuna)
- [SageMaker](https://arxiv.org/abs/2111.05972)

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# DeepSpeed集成
[DeepSpeed](https://github.com/microsoft/DeepSpeed)实现了[ZeRO论文](https://arxiv.org/abs/1910.02054)中描述的所有内容。目前,它提供对以下功能的全面支持:
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed)实现了[ZeRO论文](https://arxiv.org/abs/1910.02054)中描述的所有内容。目前,它提供对以下功能的全面支持:
1. 优化器状态分区ZeRO stage 1
2. 梯度分区ZeRO stage 2
@ -31,7 +31,7 @@ DeepSpeed ZeRO-2主要用于训练因为它的特性对推理没有用处。
DeepSpeed ZeRO-3也可以用于推理因为它允许将单个GPU无法加载的大模型加载到多个GPU上。
🤗 Transformers通过以下两种方式集成了[DeepSpeed](https://github.com/microsoft/DeepSpeed)
🤗 Transformers通过以下两种方式集成了[DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
1. 通过[`Trainer`]集成核心的DeepSpeed功能。这是一种“为您完成一切”式的集成 - 您只需提供自定义配置文件或使用我们的模板配置文件。本文档的大部分内容都集中在这个功能上。
2. 如果您不使用[`Trainer`]并希望在自己的Trainer中集成DeepSpeed那么像`from_pretrained``from_config`这样的核心功能函数将包括ZeRO stage 3及以上的DeepSpeed的基础部分`zero.Init`。要利用此功能,请阅读有关[非Trainer DeepSpeed集成](#nontrainer-deepspeed-integration)的文档。
@ -72,7 +72,7 @@ pip install deepspeed
pip install transformers[deepspeed]
```
或在 [DeepSpeed 的 GitHub 页面](https://github.com/microsoft/deepspeed#installation) 和
或在 [DeepSpeed 的 GitHub 页面](https://github.com/deepspeedai/DeepSpeed#installation) 和
[高级安装](https://www.deepspeed.ai/tutorials/advanced-install/) 中查找更多详细信息。
如果构建过程中仍然遇到问题,请首先确保阅读 [CUDA 扩展安装注意事项](trainer#cuda-extension-installation-notes)。
@ -83,7 +83,7 @@ pip install transformers[deepspeed]
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
@ -105,7 +105,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa
```bash
git clone https://github.com/microsoft/DeepSpeed/
git clone https://github.com/deepspeedai/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
@ -142,7 +142,7 @@ _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24
您也可以完全省略 `TORCH_CUDA_ARCH_LIST`,然后构建程序将自动查询构建所在的 GPU 的架构。这可能与目标机器上的 GPU 不匹配,因此最好明确指定所需的架构。
如果尝试了所有建议的方法仍然遇到构建问题,请继续在 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues)的 GitHub Issue 上提交问题。
如果尝试了所有建议的方法仍然遇到构建问题,请继续在 [Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues)的 GitHub Issue 上提交问题。
<a id='deepspeed-multi-gpu'></a>
@ -471,10 +471,10 @@ deepspeed examples/pytorch/translation/run_translation.py ...
有关可以在 DeepSpeed 配置文件中使用的完整配置选项的详细指南,请参阅[以下文档](https://www.deepspeed.ai/docs/config-json/)。
您可以在 [DeepSpeedExamples 仓库](https://github.com/microsoft/DeepSpeedExamples)中找到解决各种实际需求的数十个 DeepSpeed 配置示例。
您可以在 [DeepSpeedExamples 仓库](https://github.com/deepspeedai/DeepSpeedExamples)中找到解决各种实际需求的数十个 DeepSpeed 配置示例。
```bash
git clone https://github.com/microsoft/DeepSpeedExamples
git clone https://github.com/deepspeedai/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
```
@ -485,7 +485,7 @@ find . -name '*json'
grep -i Lamb $(find . -name '*json')
```
还可以在[主仓](https://github.com/microsoft/DeepSpeed)中找到更多示例。
还可以在[主仓](https://github.com/deepspeedai/DeepSpeed)中找到更多示例。
在使用 DeepSpeed 时,您总是需要提供一个 DeepSpeed 配置文件,但是一些配置参数必须通过命令行进行配置。您将在本指南的剩余章节找到这些细微差别。
@ -797,7 +797,7 @@ ZeRO-Infinity 通过使用 NVMe 内存扩展 GPU 和 CPU 内存,从而允许
确保您的 `nvme_path` 实际上是一个 NVMe因为它与普通硬盘或 SSD 一起工作,但速度会慢得多。快速可扩展的训练是根据现代 NVMe 传输速度设计的(截至本文撰写时,可以达到 ~3.5GB/s 读取,~3GB/s 写入的峰值速度)。
为了找出最佳的 `aio` 配置块,您必须在目标设置上运行一个基准测试,具体操作请参见[说明](https://github.com/microsoft/DeepSpeed/issues/998)。
为了找出最佳的 `aio` 配置块,您必须在目标设置上运行一个基准测试,具体操作请参见[说明](https://github.com/deepspeedai/DeepSpeed/issues/998)。
@ -1789,7 +1789,7 @@ SW: Model with 2783M total params, 65M largest layer params.
因此如果问题明显与DeepSpeed相关例如您可以看到有一个异常并且可以看到DeepSpeed模块涉及其中请先重新测试没有DeepSpeed的设置。只有当问题仍然存在时才向Deepspeed提供所有必需的细节。
- 如果您明确问题是在Deepspeed核心中而不是集成部分请直接向[Deepspeed](https://github.com/microsoft/DeepSpeed/)提交问题。如果您不确定请不要担心无论使用哪个issue跟踪问题都可以一旦您发布问题我们会弄清楚并将其重定向到另一个issue跟踪如果需要的话
- 如果您明确问题是在Deepspeed核心中而不是集成部分请直接向[Deepspeed](https://github.com/deepspeedai/DeepSpeed/)提交问题。如果您不确定请不要担心无论使用哪个issue跟踪问题都可以一旦您发布问题我们会弄清楚并将其重定向到另一个issue跟踪如果需要的话
@ -2086,7 +2086,7 @@ RUN_SLOW=1 pytest tests/deepspeed
## 主要的DeepSpeed资源
- [项目GitHub](https://github.com/microsoft/deepspeed)
- [项目GitHub](https://github.com/deepspeedai/DeepSpeed)
- [使用文档](https://www.deepspeed.ai/getting-started/)
- [API文档](https://deepspeed.readthedocs.io/en/latest/index.html)
- [博客文章](https://www.microsoft.com/en-us/research/search/?q=deepspeed)
@ -2097,4 +2097,4 @@ RUN_SLOW=1 pytest tests/deepspeed
- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
最后请记住HuggingFace [`Trainer`]仅集成了DeepSpeed因此如果您在使用DeepSpeed时遇到任何问题或疑问请在[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues)上提交一个issue。
最后请记住HuggingFace [`Trainer`]仅集成了DeepSpeed因此如果您在使用DeepSpeed时遇到任何问题或疑问请在[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/issues)上提交一个issue。

View File

@ -182,7 +182,7 @@ my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
python -m torch.distributed.launch --nproc_per_node=2 trainer-program.py ...
```
如果你安装了 [`accelerate`](https://github.com/huggingface/accelerate) 或 [`deepspeed`](https://github.com/microsoft/DeepSpeed),你还可以通过以下任一方法实现相同的效果:
如果你安装了 [`accelerate`](https://github.com/huggingface/accelerate) 或 [`deepspeed`](https://github.com/deepspeedai/DeepSpeed),你还可以通过以下任一方法实现相同的效果:
```bash
@ -281,7 +281,7 @@ export CUDA_VISIBLE_DEVICES=1,0
[`Trainer`] 已经被扩展,以支持可能显著提高训练时间并适应更大模型的库。
目前,它支持第三方解决方案 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html),它们实现了论文 [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054) 的部分内容。
目前,它支持第三方解决方案 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 和 [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html),它们实现了论文 [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054) 的部分内容。
截至撰写本文,此提供的支持是新的且实验性的。尽管我们欢迎围绕 DeepSpeed 和 PyTorch FSDP 的issues但我们不再支持 FairScale 集成,因为它已经集成到了 PyTorch 主线(参见 [PyTorch FSDP 集成](#pytorch-fully-sharded-data-parallel))。
@ -293,7 +293,7 @@ export CUDA_VISIBLE_DEVICES=1,0
撰写时Deepspeed 需要在使用之前编译 CUDA C++ 代码。
虽然所有安装问题都应通过 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues) 的 GitHub Issues处理但在构建依赖CUDA 扩展的任何 PyTorch 扩展时,可能会遇到一些常见问题。
虽然所有安装问题都应通过 [Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues) 的 GitHub Issues处理但在构建依赖CUDA 扩展的任何 PyTorch 扩展时,可能会遇到一些常见问题。
因此,如果在执行以下操作时遇到与 CUDA 相关的构建问题:

View File

@ -639,7 +639,7 @@ class DummyModel(DummyPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and attention_mask.device.type in ["cuda", "xpu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

View File

@ -639,7 +639,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and attention_mask.device.type in ["cuda", "xpu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

View File

@ -644,7 +644,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and attention_mask.device.type in ["cuda", "xpu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

View File

@ -452,11 +452,9 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
return model_inputs
def resize_token_embeddings(
self,
new_num_tokens: Optional[int] = None,
pad_to_multiple_of=None,
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
) -> nn.Embedding:
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
# Update vocab size
self.config.text_config.vocab_size = model_embeds.num_embeddings

View File

@ -561,7 +561,7 @@ class SuperModel(SuperPreTrainedModel):
if (
self.config._attn_implementation == "sdpa"
and attention_mask is not None
and attention_mask.device.type == "cuda"
and attention_mask.device.type in ["cuda", "xpu"]
and not output_attentions
):
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when

View File

@ -70,11 +70,9 @@ class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration):
return (embeddings,) + vlm_outputs
def resize_token_embeddings(
self,
new_num_tokens: Optional[int] = None,
pad_to_multiple_of=None,
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
) -> nn.Embedding:
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
# Update vocab size
self.config.text_config.vocab_size = model_embeds.num_embeddings

View File

@ -680,8 +680,7 @@ def main():
# Instantiate custom data collator
data_collator = DataCollatorCTCWithPadding(processor=processor)
decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
decay_parameters = [name for name in decay_parameters if "bias" not in name]
decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
optimizer_grouped_parameters = [
{
"params": [p for n, p in model.named_parameters() if n in decay_parameters],

View File

@ -144,7 +144,6 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
| Notebook | Description | | |
|:----------|:-------------|:-------------|------:|
| [How to quantize a model with ONNX Runtime for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| Show how to apply static and dynamic quantization on a model using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)|
| [How to quantize a model with Intel Neural Compressor for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| Show how to apply static, dynamic and aware training quantization on a model using [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) for any GLUE task. | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)|
| [How to fine-tune a model on text classification with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| Show how to preprocess the data and fine-tune a model on any GLUE task using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)|
| [How to fine-tune a model on summarization with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| Show how to preprocess the data and fine-tune a model on XSUM using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [![Open in AWS Studio](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)|

View File

@ -52,3 +52,5 @@ markers = [
"bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
"generate: marks tests that use the GenerationTesterMixin"
]
log_cli = 1
log_cli_level = "WARNING"

View File

@ -328,6 +328,7 @@ _import_structure = {
"CTRLTokenizer",
],
"models.cvt": ["CvtConfig"],
"models.dab_detr": ["DabDetrConfig"],
"models.dac": ["DacConfig", "DacFeatureExtractor"],
"models.data2vec": [
"Data2VecAudioConfig",
@ -399,6 +400,7 @@ _import_structure = {
"models.deprecated.vit_hybrid": ["ViTHybridConfig"],
"models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"],
"models.depth_anything": ["DepthAnythingConfig"],
"models.depth_pro": ["DepthProConfig"],
"models.detr": ["DetrConfig"],
"models.dialogpt": [],
"models.diffllama": ["DiffLlamaConfig"],
@ -476,6 +478,11 @@ _import_structure = {
],
"models.glm": ["GlmConfig"],
"models.glpn": ["GLPNConfig"],
"models.got_ocr2": [
"GotOcr2Config",
"GotOcr2Processor",
"GotOcr2VisionConfig",
],
"models.gpt2": [
"GPT2Config",
"GPT2Tokenizer",
@ -742,6 +749,7 @@ _import_structure = {
"RoFormerTokenizer",
],
"models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"],
"models.rt_detr_v2": ["RTDetrV2Config"],
"models.rwkv": ["RwkvConfig"],
"models.sam": [
"SamConfig",
@ -1230,6 +1238,7 @@ else:
_import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
_import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
_import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"])
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"])
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
@ -1238,6 +1247,7 @@ else:
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.got_ocr2"].extend(["GotOcr2ImageProcessor"])
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
_import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
_import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
@ -1301,11 +1311,20 @@ except OptionalDependencyNotAvailable:
]
else:
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
_import_structure["models.blip"].append("BlipImageProcessorFast")
_import_structure["models.clip"].append("CLIPImageProcessorFast")
_import_structure["models.convnext"].append("ConvNextImageProcessorFast")
_import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast")
_import_structure["models.deit"].append("DeiTImageProcessorFast")
_import_structure["models.depth_pro"].append("DepthProImageProcessorFast")
_import_structure["models.detr"].append("DetrImageProcessorFast")
_import_structure["models.llava"].append("LlavaImageProcessorFast")
_import_structure["models.llava_next"].append("LlavaNextImageProcessorFast")
_import_structure["models.llava_onevision"].append("LlavaOnevisionImageProcessorFast")
_import_structure["models.pixtral"].append("PixtralImageProcessorFast")
_import_structure["models.qwen2_vl"].append("Qwen2VLImageProcessorFast")
_import_structure["models.rt_detr"].append("RTDetrImageProcessorFast")
_import_structure["models.siglip"].append("SiglipImageProcessorFast")
_import_structure["models.vit"].append("ViTImageProcessorFast")
try:
@ -1892,6 +1911,13 @@ else:
"CvtPreTrainedModel",
]
)
_import_structure["models.dab_detr"].extend(
[
"DabDetrForObjectDetection",
"DabDetrModel",
"DabDetrPreTrainedModel",
]
)
_import_structure["models.dac"].extend(
[
"DacModel",
@ -2158,6 +2184,13 @@ else:
"DepthAnythingPreTrainedModel",
]
)
_import_structure["models.depth_pro"].extend(
[
"DepthProForDepthEstimation",
"DepthProModel",
"DepthProPreTrainedModel",
]
)
_import_structure["models.detr"].extend(
[
"DetrForObjectDetection",
@ -2426,6 +2459,12 @@ else:
"GLPNPreTrainedModel",
]
)
_import_structure["models.got_ocr2"].extend(
[
"GotOcr2ForConditionalGeneration",
"GotOcr2PreTrainedModel",
]
)
_import_structure["models.gpt2"].extend(
[
"GPT2DoubleHeadsModel",
@ -3426,6 +3465,9 @@ else:
"RTDetrResNetPreTrainedModel",
]
)
_import_structure["models.rt_detr_v2"].extend(
["RTDetrV2ForObjectDetection", "RTDetrV2Model", "RTDetrV2PreTrainedModel"]
)
_import_structure["models.rwkv"].extend(
[
"RwkvForCausalLM",
@ -5375,6 +5417,9 @@ if TYPE_CHECKING:
CTRLTokenizer,
)
from .models.cvt import CvtConfig
from .models.dab_detr import (
DabDetrConfig,
)
from .models.dac import (
DacConfig,
DacFeatureExtractor,
@ -5463,6 +5508,7 @@ if TYPE_CHECKING:
XLMProphetNetConfig,
)
from .models.depth_anything import DepthAnythingConfig
from .models.depth_pro import DepthProConfig
from .models.detr import DetrConfig
from .models.diffllama import DiffLlamaConfig
from .models.dinat import DinatConfig
@ -5540,6 +5586,7 @@ if TYPE_CHECKING:
)
from .models.glm import GlmConfig
from .models.glpn import GLPNConfig
from .models.got_ocr2 import GotOcr2Config, GotOcr2Processor, GotOcr2VisionConfig
from .models.gpt2 import (
GPT2Config,
GPT2Tokenizer,
@ -5843,6 +5890,7 @@ if TYPE_CHECKING:
RTDetrConfig,
RTDetrResNetConfig,
)
from .models.rt_detr_v2 import RTDetrV2Config
from .models.rwkv import RwkvConfig
from .models.sam import (
SamConfig,
@ -6330,6 +6378,7 @@ if TYPE_CHECKING:
from .models.deprecated.efficientformer import EfficientFormerImageProcessor
from .models.deprecated.tvlt import TvltImageProcessor
from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast
from .models.detr import DetrFeatureExtractor, DetrImageProcessor
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
@ -6342,6 +6391,7 @@ if TYPE_CHECKING:
)
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.got_ocr2 import GotOcr2ImageProcessor
from .models.grounding_dino import GroundingDinoImageProcessor
from .models.idefics import IdeficsImageProcessor
from .models.idefics2 import Idefics2ImageProcessor
@ -6417,11 +6467,20 @@ if TYPE_CHECKING:
from .utils.dummy_torchvision_objects import *
else:
from .image_processing_utils_fast import BaseImageProcessorFast
from .models.blip import BlipImageProcessorFast
from .models.clip import CLIPImageProcessorFast
from .models.convnext import ConvNextImageProcessorFast
from .models.deformable_detr import DeformableDetrImageProcessorFast
from .models.deit import DeiTImageProcessorFast
from .models.depth_pro import DepthProImageProcessorFast
from .models.detr import DetrImageProcessorFast
from .models.llava import LlavaImageProcessorFast
from .models.llava_next import LlavaNextImageProcessorFast
from .models.llava_onevision import LlavaOnevisionImageProcessorFast
from .models.pixtral import PixtralImageProcessorFast
from .models.qwen2_vl import Qwen2VLImageProcessorFast
from .models.rt_detr import RTDetrImageProcessorFast
from .models.siglip import SiglipImageProcessorFast
from .models.vit import ViTImageProcessorFast
try:
@ -6912,6 +6971,11 @@ if TYPE_CHECKING:
CvtModel,
CvtPreTrainedModel,
)
from .models.dab_detr import (
DabDetrForObjectDetection,
DabDetrModel,
DabDetrPreTrainedModel,
)
from .models.dac import (
DacModel,
DacPreTrainedModel,
@ -7127,6 +7191,11 @@ if TYPE_CHECKING:
DepthAnythingForDepthEstimation,
DepthAnythingPreTrainedModel,
)
from .models.depth_pro import (
DepthProForDepthEstimation,
DepthProModel,
DepthProPreTrainedModel,
)
from .models.detr import (
DetrForObjectDetection,
DetrForSegmentation,
@ -7346,6 +7415,10 @@ if TYPE_CHECKING:
GLPNModel,
GLPNPreTrainedModel,
)
from .models.got_ocr2 import (
GotOcr2ForConditionalGeneration,
GotOcr2PreTrainedModel,
)
from .models.gpt2 import (
GPT2DoubleHeadsModel,
GPT2ForQuestionAnswering,
@ -8121,6 +8194,7 @@ if TYPE_CHECKING:
RTDetrResNetBackbone,
RTDetrResNetPreTrainedModel,
)
from .models.rt_detr_v2 import RTDetrV2ForObjectDetection, RTDetrV2Model, RTDetrV2PreTrainedModel
from .models.rwkv import (
RwkvForCausalLM,
RwkvModel,

View File

@ -217,6 +217,7 @@ ACT2CLS = {
"silu": nn.SiLU,
"swish": nn.SiLU,
"tanh": nn.Tanh,
"prelu": nn.PReLU,
}
ACT2FN = ClassInstantier(ACT2CLS)
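With the new entry, `"prelu"` can be used wherever a config's activation string is resolved through this registry. A small illustrative lookup (module path as in the current source tree):
```py
# A minimal sketch; ACT2FN instantiates the class registered in ACT2CLS on lookup.
import torch
from transformers.activations import ACT2FN

act = ACT2FN["prelu"]                  # an nn.PReLU() instance
print(act(torch.tensor([-1.0, 0.5])))  # roughly tensor([-0.2500, 0.5000]) with the default slope of 0.25
```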

View File

@ -29,6 +29,8 @@ class Cache(torch.nn.Module):
Base, abstract class for all caches. The actual data structure is specific to each subclass.
"""
is_compileable = False
def __init__(self):
super().__init__()
@ -1098,6 +1100,8 @@ class StaticCache(Cache):
```
"""
is_compileable = True
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
@deprecate_kwarg("layer_device_map", version="4.52.0")
def __init__(
@ -1297,6 +1301,7 @@ class SlidingWindowCache(StaticCache):
"""
is_sliding = True
is_compileable = True
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
def __init__(
@ -1421,6 +1426,7 @@ class EncoderDecoderCache(Cache):
super().__init__()
self.self_attention_cache = self_attention_cache
self.cross_attention_cache = cross_attention_cache
self.is_compileable = getattr(self.self_attention_cache, "is_compileable", False)
self.is_updated = {}
for layer_idx in range(len(cross_attention_cache.key_cache)):
@ -1612,6 +1618,8 @@ class HybridCache(Cache):
```
"""
is_compileable = True
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
@deprecate_kwarg("layer_device_map", version="4.52.0")
def __init__(
@ -1832,6 +1840,8 @@ class MambaCache:
```
"""
is_compileable = True
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
def __init__(
self,
@ -1975,6 +1985,8 @@ class OffloadedStaticCache(StaticCache):
```
"""
is_compileable = True
@deprecate_kwarg("layer_device_map", version="4.52.0")
def __init__(
self,

View File

@ -0,0 +1,655 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from argparse import ArgumentParser, Namespace
from datetime import date
from pathlib import Path
from ..utils import logging
from . import BaseTransformersCLICommand
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
CURRENT_YEAR = date.today().year
TRANSFORMERS_PATH = Path(__file__).parent.parent
REPO_PATH = TRANSFORMERS_PATH.parent.parent
def add_import_structure_entry_init(content: str, fast_image_processor_name: str, model_name: str):
"""
Add an entry to the `_import_structure` dictionary in the `__init__.py` file of the transformers package.
"""
# Step 1: Find the block
block_regex = re.compile(
r"if not is_torchvision_available\(\):.*?else:\s*(\n(?P<indent>\s+)_import_structure\[.*?\].*?\n(?:\s*(?P=indent)_import_structure\[.*?\].*?\n)*)",
re.DOTALL,
)
match = block_regex.search(content)
if not match:
raise ValueError("Couldn't find the '_import_structure' block.")
# Capture the block content and indentation
block_content = match.group(1)
indent = match.group("indent")
# Step 2: Parse existing entries
lines = block_content.strip().split("\n")
entries = []
import_structure_header = indent + lines[0]
entries = lines[1:]
# Add the new entry, maintaining alphabetical order
new_entry = f'{indent}_import_structure["models.{model_name}"].append("{fast_image_processor_name}")'
if new_entry not in entries:
entries.append(new_entry)
entries.sort()
entries = [import_structure_header] + entries
# Step 3: Reconstruct the block
updated_block = "\n".join(entry for entry in entries)
# Replace the original block in the content
updated_content = content[: match.start(1)] + "\n" + updated_block + "\n" + content[match.end(1) :]
return updated_content
def add_import_statement_init(content: str, fast_image_processor_name: str, model_name: str):
"""
Add an import statement to the `__init__.py` file of the transformers package.
"""
# Step 1: Find the block
block_regex = re.compile(
r"if not is_torchvision_available\(\):\s+raise OptionalDependencyNotAvailable\(\)\s+except OptionalDependencyNotAvailable:\s+from \.utils\.dummy_torchvision_objects import \*\s+else:(?P<else_block>\s*(\n\s*from .+ import .*\n)+)(?=\s*try:\s+if not \(is_torchvision_available\(\) and is_timm_available\(\)\):)",
re.DOTALL,
)
match = block_regex.search(content)
if match:
block_content = match.group("else_block") # The captured import block
else:
print("Couldn't find the import statement block.")
# Step 2: Parse existing entries
lines = block_content.strip().split("\n")
entries = []
indent = " " * (len(lines[1]) - len(lines[1].lstrip()))
import_structure_header = indent + lines[0]
entries = lines[1:]
# Add the new entry, maintaining alphabetical order
new_entry = f"{indent}from .models.{model_name} import {fast_image_processor_name}"
if new_entry not in entries:
entries.append(new_entry)
entries.sort()
entries = [import_structure_header] + entries
# Step 3: Reconstruct the block
updated_block = "\n".join(entry for entry in entries)
# Replace the original block in the content
updated_content = (
content[: match.start("else_block")] + "\n" + updated_block + "\n\n" + content[match.end("else_block") :]
)
return updated_content
def add_fast_image_processor_to_main_init(fast_image_processor_name: str, model_name: str):
"""
Add the fast image processor to the main __init__.py file of the transformers package.
"""
with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
content = f.read()
# add _import_structure entry
content = add_import_structure_entry_init(content, fast_image_processor_name, model_name)
# add import statement
content = add_import_statement_init(content, fast_image_processor_name, model_name)
# write the updated content
with open(TRANSFORMERS_PATH / "__init__.py", "w", encoding="utf-8") as f:
f.write(content)
def add_fast_image_processor_to_model_init(
fast_image_processing_module_file: str, fast_image_processor_name, model_name: str
):
"""
Add the fast image processor to the __init__.py file of the model.
"""
with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f:
content = f.read()
fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "")
if "import *" in content:
# we have an init file in the updated format
# get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content
# Step 1: Find the block
block_regex = re.compile(
r"if TYPE_CHECKING:\n(?P<if_block>.*?)(?=\s*else:)",
re.DOTALL,
)
match = block_regex.search(content)
if not match:
raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.")
block_content = match.group("if_block") # The captured import block
# Step 2: Parse existing entries
entries = block_content.split("\n")
indent = " " * (len(entries[0]) - len(entries[0].lstrip()))
new_entry = f"{indent}from .{fast_image_processing_module_file} import *"
if new_entry not in entries:
entries.append(new_entry)
entries.sort()
updated_block = "\n".join(entry for entry in entries)
# Replace the original block in the content
updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :]
else:
# we have an init file in the old format
# add "is_torchvision_available" import to from ...utils import (
# Regex to match import statements from transformers.utils
pattern = r"""
from\s+\.\.\.utils\s+import\s+
(?: # Non-capturing group for either:
([\w, ]+) # 1. Single-line imports (e.g., 'a, b')
| # OR
\((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)')
)
"""
regex = re.compile(pattern, re.VERBOSE | re.DOTALL)
def replacement_function(match):
# Extract existing imports
imports = (match.group(1) or match.group(2)).split(",")
imports = imports[:-1] if imports[-1] == "\n" else imports
imports = [imp.strip() for imp in imports]
# Add the new import if not already present
if "is_torchvision_available" not in imports:
imports.append("is_torchvision_available")
imports.sort()
# Convert to multi-line import in all cases
updated_imports = "(\n " + ",\n ".join(imports) + ",\n)"
return f"from ...utils import {updated_imports}"
# Replace all matches in the file content
updated_content = regex.sub(replacement_function, content)
vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n'
added_import_structure_block = (
"try:\n if not is_torchvision_available():\n"
" raise OptionalDependencyNotAvailable()\n"
"except OptionalDependencyNotAvailable:\n"
" pass\n"
"else:\n"
f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n'
)
if vision_import_structure_block not in updated_content:
raise ValueError("Couldn't find the 'vision _import_structure block' block.")
if added_import_structure_block not in updated_content:
updated_content = updated_content.replace(
vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block
)
vision_import_statement_block = (
f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n"
)
added_import_statement_block = (
" try:\n if not is_torchvision_available():\n"
" raise OptionalDependencyNotAvailable()\n"
" except OptionalDependencyNotAvailable:\n"
" pass\n"
" else:\n"
f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n"
)
if vision_import_statement_block not in updated_content:
raise ValueError("Couldn't find the 'vision _import_structure block' block.")
if added_import_statement_block not in updated_content:
updated_content = updated_content.replace(
vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block
)
# write the updated content
with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f:
f.write(updated_content)
def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str):
"""
Add the fast image processor to the auto module.
"""
with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f:
content = f.read()
# get all lines containing the image processor name
updated_content = content.replace(
f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")'
)
# write the updated content
with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f:
f.write(updated_content)
def add_fast_image_processor_to_dummy(fast_image_processor_name: str):
"""
Add the fast image processor to the dummy torchvision objects file.
"""
dummy_torchvision_objects_file = TRANSFORMERS_PATH / "utils" / "dummy_torchvision_objects.py"
with open(dummy_torchvision_objects_file, "r", encoding="utf-8") as f:
content = f.read()
# regex to find objects starting with "class " and ending with "ImageProcessorFast", including "ImageProcessorFast" in the match
image_processor_names = re.findall(r"class (\w*ImageProcessorFast)", content)
image_processor_names.append(fast_image_processor_name)
image_processor_names.sort()
index_new = image_processor_names.index(fast_image_processor_name)
new_dummy_object = (
f"class {fast_image_processor_name}(metaclass=DummyObject):\n"
' _backends = ["torchvision"]\n\n'
" def __init__(self, *args, **kwargs):\n"
' requires_backends(self, ["torchvision"])\n'
)
if new_dummy_object not in content:
if index_new != len(image_processor_names) - 1:
# add the dummy object just before the next ImageProcessorFast
first_line = f"class {image_processor_names[index_new+1]}(metaclass=DummyObject):"
updated_content = content.replace(first_line, new_dummy_object + "\n\n" + first_line)
else:
# add the dummy object at the very end
updated_content = content + "\n\n" + new_dummy_object
# write the updated content
with open(dummy_torchvision_objects_file, "w", encoding="utf-8") as f:
f.write(updated_content)
def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str):
"""
Add the fast image processor to the model's doc file.
"""
doc_source = REPO_PATH / "docs" / "source"
# find the doc files
doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md"))
if not doc_files:
# try again with "-"
doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md"))
if not doc_files:
raise ValueError(f"No doc files found for {model_name}")
base_doc_string = (
f"## {fast_image_processor_name[:-4]}\n\n" f"[[autodoc]] {fast_image_processor_name[:-4]}\n" " - preprocess"
)
fast_doc_string = (
f"## {fast_image_processor_name}\n\n" f"[[autodoc]] {fast_image_processor_name}\n" " - preprocess"
)
for doc_file in doc_files:
with open(doc_file, "r", encoding="utf-8") as f:
content = f.read()
if fast_doc_string not in content:
# add the fast image processor to the doc
updated_content = content.replace(
base_doc_string,
base_doc_string + "\n\n" + fast_doc_string,
)
# write the updated content
with open(doc_file, "w", encoding="utf-8") as f:
f.write(updated_content)
def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str):
"""
Add the fast image processor to the image processing tests.
"""
tests_path = REPO_PATH / "tests" / "models" / model_name
test_file = tests_path / f"test_image_processing_{model_name}.py"
if not os.path.exists(test_file):
logger.warning(f"No test file found for {model_name}. Skipping.")
return
with open(test_file, "r", encoding="utf-8") as f:
content = f.read()
# add is_torchvision_available import to the imports
# Regex to match import statements from transformers.utils
pattern = r"""
from\s+transformers\.utils\s+import\s+
(?: # Non-capturing group for either:
([\w, ]+) # 1. Single-line imports (e.g., 'a, b')
| # OR
\((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)')
)
"""
regex = re.compile(pattern, re.VERBOSE | re.DOTALL)
def replacement_function(match):
# Extract existing imports
existing_imports = (match.group(1) or match.group(2)).split(",")
existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports
existing_imports = [imp.strip() for imp in existing_imports]
# Add the new import if not already present
if "is_torchvision_available" not in existing_imports:
existing_imports.append("is_torchvision_available")
existing_imports.sort()
# Rebuild the import statement
if match.group(1): # Single-line import
updated_imports = ", ".join(existing_imports)
else: # Multi-line import
updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)"
return f"from transformers.utils import {updated_imports}"
# Replace all matches in the file content
updated_content = regex.sub(replacement_function, content)
# add the fast image processor to the imports
base_import_string = f" from transformers import {fast_image_processor_name[:-4]}"
fast_import_string = (
" if is_torchvision_available():\n" f" from transformers import {fast_image_processor_name}"
)
if fast_import_string not in updated_content:
updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string)
# get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = "
image_processing_class_line = re.search(r" image_processing_class = .*", updated_content)
if not image_processing_class_line:
logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. Skipping.")
return
fast_image_processing_class_line = (
f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None"
)
if " fast_image_processing_class = " not in updated_content:
updated_content = updated_content.replace(
image_processing_class_line.group(0),
image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line,
)
# write the updated content
with open(test_file, "w", encoding="utf-8") as f:
f.write(updated_content)
def get_fast_image_processing_content_header(content: str) -> str:
"""
Get the header of the slow image processor file.
"""
# get all lines before and including the line containing """Image processor
content_header = re.search(r"^(.*?\n)*?\"\"\"Image processor.*", content)
content_header = content_header.group(0)
content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header)
content_header = content_header.replace("Image processor", "Fast Image processor")
return content_header
def write_default_fast_image_processor_file(
fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str
):
"""
Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file.
"""
imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n"
content_header = get_fast_image_processing_content_header(content_base_file)
content_base_file = (
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
" # To be implemented\n"
" resample = None\n"
" image_mean = None\n"
" image_std = None\n"
" size = None\n"
" default_to_square = None\n"
" crop_size = None\n"
" do_resize = None\n"
" do_center_crop = None\n"
" do_rescale = None\n"
" do_normalize = None\n"
" do_convert_rgb = None\n\n\n"
f'__all__ = ["{fast_image_processor_name}"]\n'
)
content = content_header + imports + content_base_file
with open(fast_image_processing_module_file, "w", encoding="utf-8") as f:
f.write(content)
def add_fast_image_processor_file(
fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str
):
"""
Add the fast image processor file to the model's folder.
"""
# if the file already exists, do nothing
if os.path.exists(fast_image_processing_module_file):
print(f"{fast_image_processing_module_file} already exists. Skipping.")
return
regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)"
match = re.search(regex, content_base_file, re.DOTALL)
if not match:
print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}")
print("Creating a new file with the default content.")
return write_default_fast_image_processor_file(
fast_image_processing_module_file, fast_image_processor_name, content_base_file
)
# Exclude the last unindented line
slow_class_content = match.group(0).rstrip()
# get default args:
# find the __init__ block which start with def __init__ and ends with def
match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL)
if not match:
print(
f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}"
)
print("Creating a new file with the default content.")
return write_default_fast_image_processor_file(
fast_image_processing_module_file, fast_image_processor_name, content_base_file
)
init = match.group(0)
init_signature_block = init.split(")")[0]
arg_names = init_signature_block.split(":")
arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names]
# get the default values
default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block)
# build default args dict
default_args_dict = dict(zip(arg_names, default_args))
pattern_default_size = r"size = size if size is not None else\s+(.*)"
match_default_size = re.findall(pattern_default_size, init)
default_args_dict["size"] = match_default_size[0] if match_default_size else None
pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)"
match_default_crop_size = re.findall(pattern_default_crop_size, init)
default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None
pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)"
match_default_image_mean = re.findall(pattern_default_image_mean, init)
default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None
pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)"
match_default_image_std = re.findall(pattern_default_image_std, init)
default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None
default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None
content_header = get_fast_image_processing_content_header(content_base_file)
content_base_file = (
f"@add_start_docstrings(\n"
f' "Constructs a fast {fast_image_processor_name.replace("ImageProcessorFast", "")} image processor.",\n'
f" BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,\n)\n"
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
" # This generated class can be used as a starting point for the fast image processor.\n"
" # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n"
" # only the default values should be set in the class.\n"
" # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n"
" # In most cases, only the `_preprocess` method should be overridden.\n\n"
" # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n"
" # Default values should be checked against the slow image processor\n"
" # None values left after checking can be removed\n"
f' resample = {default_args_dict.get("resample")}\n'
f' image_mean = {default_args_dict.get("image_mean")}\n'
f' image_std = {default_args_dict.get("image_std")}\n'
f' size = {default_args_dict.get("size")}\n'
f' default_to_square = {default_args_dict.get("default_to_square")}\n'
f' crop_size = {default_args_dict.get("crop_size")}\n'
f' do_resize = {default_args_dict.get("do_resize")}\n'
f' do_center_crop = {default_args_dict.get("do_center_crop")}\n'
f' do_rescale = {default_args_dict.get("do_rescale")}\n'
f' do_normalize = {default_args_dict.get("do_normalize")}\n'
f' do_convert_rgb = {default_args_dict.get("do_convert_rgb")}\n\n\n'
f'__all__ = ["{fast_image_processor_name}"]\n'
)
imports = (
"\n\nfrom ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast\n"
)
image_utils_imports = []
if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"):
image_utils_imports.append("PILImageResampling")
if default_args_dict.get("image_mean") is not None and not any(
char.isdigit() for char in default_args_dict.get("image_mean")
):
image_utils_imports.append(default_args_dict.get("image_mean"))
if default_args_dict.get("image_std") is not None and not any(
char.isdigit() for char in default_args_dict.get("image_std")
):
image_utils_imports.append(default_args_dict.get("image_std"))
if image_utils_imports:
# sort imports
image_utils_imports.sort()
imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n"
imports += "from ...utils import add_start_docstrings\n"
content = content_header + imports + "\n\n" + content_base_file
with open(fast_image_processing_module_file, "w", encoding="utf-8") as f:
f.write(content)
def add_fast_image_processor(model_name: str):
"""
Add the necessary references to the fast image processor in the transformers package,
and create the fast image processor file in the model's folder.
"""
model_module = TRANSFORMERS_PATH / "models" / model_name
image_processing_module_file = list(model_module.glob("image_processing*.py"))
if not image_processing_module_file:
raise ValueError(f"No image processing module found in {model_module}")
elif len(image_processing_module_file) > 1:
for file_name in image_processing_module_file:
if not str(file_name).endswith("_fast.py"):
image_processing_module_file = str(file_name)
break
else:
image_processing_module_file = str(image_processing_module_file[0])
with open(image_processing_module_file, "r", encoding="utf-8") as f:
content_base_file = f.read()
# regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match
image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file)
if not image_processor_name:
raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}")
elif len(image_processor_name) > 1:
raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}")
image_processor_name = image_processor_name[0]
fast_image_processor_name = image_processor_name + "Fast"
fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py")
print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}")
add_fast_image_processor_to_main_init(
fast_image_processor_name=fast_image_processor_name,
model_name=model_name,
)
add_fast_image_processor_to_model_init(
fast_image_processing_module_file=fast_image_processing_module_file,
fast_image_processor_name=fast_image_processor_name,
model_name=model_name,
)
add_fast_image_processor_to_auto(
image_processor_name=image_processor_name,
fast_image_processor_name=fast_image_processor_name,
)
add_fast_image_processor_to_dummy(fast_image_processor_name=fast_image_processor_name)
add_fast_image_processor_to_doc(
fast_image_processor_name=fast_image_processor_name,
model_name=model_name,
)
add_fast_image_processor_to_tests(
fast_image_processor_name=fast_image_processor_name,
model_name=model_name,
)
add_fast_image_processor_file(
fast_image_processing_module_file=fast_image_processing_module_file,
fast_image_processor_name=fast_image_processor_name,
content_base_file=content_base_file,
)
def add_new_model_like_command_factory(args: Namespace):
return AddFastImageProcessorCommand(model_name=args.model_name)
class AddFastImageProcessorCommand(BaseTransformersCLICommand):
@staticmethod
def register_subcommand(parser: ArgumentParser):
add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor")
add_fast_image_processor_parser.add_argument(
"--model-name",
type=str,
required=True,
help="The name of the folder containing the model's implementation.",
)
add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory)
def __init__(self, model_name: str, *args):
self.model_name = model_name
def run(self):
add_fast_image_processor(model_name=self.model_name)
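A minimal sketch of driving the new subcommand programmatically from a source checkout; the CLI equivalent registered in the entry point below is `transformers-cli add-fast-image-processor --model-name <model_folder>`, and the model name here is just a placeholder:

from transformers.commands.add_fast_image_processor import AddFastImageProcessorCommand

# Updates the main and model __init__.py files, the auto mapping, dummies, docs and tests,
# then generates an image_processing_*_fast.py module next to the slow image processor.
AddFastImageProcessorCommand(model_name="depth_pro").run()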

View File

@ -15,6 +15,7 @@
from transformers import HfArgumentParser
from .add_fast_image_processor import AddFastImageProcessorCommand
from .add_new_model_like import AddNewModelLikeCommand
from .chat import ChatCommand
from .convert import ConvertCommand
@ -40,6 +41,7 @@ def main():
UserCommands.register_subcommand(commands_parser)
AddNewModelLikeCommand.register_subcommand(commands_parser)
LfsCommands.register_subcommand(commands_parser)
AddFastImageProcessorCommand.register_subcommand(commands_parser)
# Let's go
args = parser.parse_args()

View File

@ -249,7 +249,7 @@ def squad_convert_example_to_features(
else:
p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0
pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
pad_token_indices = np.where(np.atleast_1d(span["input_ids"] == tokenizer.pad_token_id))
special_token_indices = np.asarray(
tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
).nonzero()
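The `np.atleast_1d` wrapper above presumably guards the case where the equality test yields a plain scalar instead of an array; a hedged sketch of the difference:

import numpy as np

cond = [101, 102, 0] == 0                # comparing a Python list to an int gives the scalar False
np.where(np.atleast_1d(cond))            # -> (array([], dtype=int64),)
# bare np.where(cond) would go through NumPy's deprecated "nonzero on a 0-d array" path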

View File

@ -378,6 +378,7 @@ class GenerationConfig(PushToHubMixin):
compile_config (CompileConfig, *optional*):
If using a static cache, this controls how `generate` will `compile` the forward pass for performance
gains.
disable_compile (`bool`, *optional*):
Whether to disable the automatic compilation of the forward pass. Compilation is otherwise triggered
when a compileable cache (e.g. a static cache) is used on a supported device.
> Wild card
@ -482,7 +483,7 @@ class GenerationConfig(PushToHubMixin):
# Performances
self.compile_config = kwargs.pop("compile_config", CompileConfig())
self.disable_compile = kwargs.pop("disable_compile", False)
# Wild card
self.generation_kwargs = kwargs.pop("generation_kwargs", {})
@ -785,8 +786,7 @@ class GenerationConfig(PushToHubMixin):
for arg_name in ("cache_implementation", "cache_config", "return_legacy_cache"):
if getattr(self, arg_name) is not None:
logger.warning_once(
no_cache_warning.format(cache_arg=arg_name, cache_arg_value=getattr(self, arg_name)),
UserWarning,
no_cache_warning.format(cache_arg=arg_name, cache_arg_value=getattr(self, arg_name))
)
# 6. check watermarking arguments
@ -1579,7 +1579,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):
@dataclass
class CompileConfig(object):
class CompileConfig:
"""
Class that holds arguments relative to `torch.compile` behavior, when using automatic compilation in `generate`.
See [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) for more details on the arguments.
@ -1620,7 +1620,9 @@ class CompileConfig(object):
backend: Union[str, Callable] = "inductor"
mode: str = "reduce-overhead"
options: Optional[dict] = None
# Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
_compile_all_devices = None
def to_dict(self) -> Dict[str, Any]:
"""Serializes this instance to a Python dictionary."""
return copy.deepcopy(self.__dict__)
return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})
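A short sketch of the configuration objects touched above, assuming the `disable_compile` flag and the private `_compile_all_devices` escape hatch introduced in this diff:

from transformers import GenerationConfig
from transformers.generation.configuration_utils import CompileConfig

compile_config = CompileConfig(backend="inductor", mode="reduce-overhead")
compile_config._compile_all_devices = True                      # testing-only flag, excluded from serialization
assert "_compile_all_devices" not in compile_config.to_dict()

generation_config = GenerationConfig(compile_config=compile_config, disable_compile=False)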

View File

@ -245,26 +245,26 @@ class StopStringCriteria(StoppingCriteria):
vocab = tokenizer.get_vocab()
token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
token_list, token_indices, self.stop_strings, tokenizer
token_list, token_indices, tokenizer
)
self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings])
self.num_stop_strings = len(self.stop_strings)
self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32)
def clean_and_embed_tokens_with_cache(self, token_list, token_indices, stop_strings, tokenizer):
def clean_and_embed_tokens_with_cache(self, token_list, token_indices, tokenizer):
# We don't use the tokenizer in the cache key, because I don't trust it to have well-behaved equality
if (token_list, token_indices, stop_strings) in STOP_STRING_EMBEDDING_CACHE:
if (token_list, token_indices, self.stop_strings) in STOP_STRING_EMBEDDING_CACHE:
embedding_vec, max_valid_positions, max_valid_end_lens = STOP_STRING_EMBEDDING_CACHE[
(token_list, token_indices, self.stop_strings)
]
STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, stop_strings))
STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, self.stop_strings))
else:
clean_token_list, clean_token_indices = self.clean_tokenizer_vocab(tokenizer)
embedding_vec, max_valid_positions, max_valid_end_lens = self._stop_string_create_embedding_vec(
clean_token_list, clean_token_indices, stop_strings
clean_token_list, clean_token_indices, self.stop_strings
)
STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, stop_strings)] = (
STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, self.stop_strings)] = (
embedding_vec,
max_valid_positions,
max_valid_end_lens,
@ -357,7 +357,9 @@ class StopStringCriteria(StoppingCriteria):
)
max_valid_end_lens = max(valid_end_lens)
vec_size = len(stop_strings) * (max_valid_positions + max_valid_end_lens) + 1
gather_vec = np.full((len(token_list), vec_size), dtype=np.int32, fill_value=-1)
# We use +2 instead of +1 so we can have a dummy entry at the end. We will clamp all token values
# over the max to this, ensuring they do not contribute to stop string matching.
gather_vec = np.full((max(token_indices) + 2, vec_size), dtype=np.int32, fill_value=-1)
for i, stop_string in enumerate(stop_strings):
positions = token_valid_positions[stop_string]
@ -395,6 +397,9 @@ class StopStringCriteria(StoppingCriteria):
# Flip input_ids because we're only matching strings at the end of the generated sequence
flipped_ids = torch.flip(input_ids, (1,))
# Clip out-of-vocab values to the dummy value at the end of the embedding vector
flipped_ids = torch.clamp(flipped_ids, max=self.embedding_vec.size(0) - 1)
# Size of the vector of positions a single token can match
max_valid_positions = self.max_valid_positions

View File

@ -381,9 +381,13 @@ class GenerationMixin:
# Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
# Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
# (we can't check exception 3 while compiling)
# Exception 4: If `inputs_embeds` are passed, slice them through `cache_position` to keep only the unprocessed
# tokens and generate the first token for each sequence. The generated input ids are then used for continuation.
if past_key_values is not None:
model_inputs["past_key_values"] = past_key_values
if (
if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4
inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
elif (
inputs_embeds is not None # Exception 1
or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3
):
@ -393,9 +397,9 @@ class GenerationMixin:
# 3. Prepare base model inputs
input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt.
if not self.config.is_encoder_decoder:
if inputs_embeds is not None and cache_position[0] == 0:
if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
model_inputs[input_ids_key] = None
model_inputs["inputs_embeds"] = inputs_embeds
else:
@ -406,23 +410,28 @@ class GenerationMixin:
model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)
# 4. Create missing `position_ids` on the fly
attention_mask = (
kwargs.pop("decoder_attention_mask", None) if self.config.is_encoder_decoder else attention_mask
)
attention_mask_key = "decoder_attention_mask" if self.config.is_encoder_decoder else "attention_mask"
position_ids_key = "decoder_position_ids" if self.config.is_encoder_decoder else "position_ids"
if (
attention_mask is not None
and kwargs.get("position_ids") is None
and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
and kwargs.get(position_ids_key) is None
and position_ids_key in set(inspect.signature(self.forward).parameters.keys())
):
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
kwargs["position_ids"] = position_ids # placed in kwargs for further processing (see below)
kwargs[position_ids_key] = position_ids # placed in kwargs for further processing (see below)
# 5. Slice model inputs if it's an input that should have the same length as `input_ids`
for model_input_name in ["position_ids", "token_type_ids"]:
for model_input_name in ["position_ids", "token_type_ids", "decoder_position_ids"]:
model_input = kwargs.get(model_input_name)
if model_input is not None:
if past_key_values is not None:
current_input_length = (
model_inputs["inputs_embeds"].shape[1]
if model_inputs["inputs_embeds"] is not None
if model_inputs.get("inputs_embeds") is not None
else model_inputs[input_ids_key].shape[1]
)
model_input = model_input[:, -current_input_length:]
@ -469,7 +478,7 @@ class GenerationMixin:
past_key_values=past_key_values,
)
if attention_mask is not None:
model_inputs["attention_mask"] = attention_mask
model_inputs[attention_mask_key] = attention_mask
# 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
for key, value in kwargs.items():
@ -3177,9 +3186,13 @@ class GenerationMixin:
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_forward = self.__call__
if isinstance(model_kwargs.get("past_key_values"), StaticCache):
if self.device.type == "cuda":
logger.warning_once("Using `torch.compile`.")
if isinstance(model_kwargs.get("past_key_values"), Cache):
is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
if generation_config.disable_compile:
is_compileable = False
if is_compileable and (
self.device.type == "cuda" or generation_config.compile_config._compile_all_devices
):
os.environ["TOKENIZERS_PARALLELISM"] = "0"
model_forward = self.get_compiled_call(generation_config.compile_config)
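Taken together with the cache changes, compilation of the forward pass is now keyed on `past_key_values.is_compileable`, `self._supports_static_cache`, the device (CUDA or the `_compile_all_devices` override), and the new `disable_compile` flag. A minimal sketch of opting out while keeping a static cache:

from transformers import GenerationConfig

# A static cache reports is_compileable=True, which would normally trigger torch.compile on CUDA;
# disable_compile=True keeps the eager forward pass.
gen_config = GenerationConfig(cache_implementation="static", disable_compile=True, max_new_tokens=20)
# outputs = model.generate(**inputs, generation_config=gen_config)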

View File

@ -13,13 +13,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from typing import Dict, Iterable, Optional, Union
import numpy as np
from .image_processing_base import BatchFeature, ImageProcessingMixin
from .image_transforms import center_crop, normalize, rescale
from .image_utils import ChannelDimension
from .image_utils import ChannelDimension, get_image_size
from .utils import logging
@ -285,3 +286,23 @@ def select_best_resolution(original_size: tuple, possible_resolutions: list) ->
best_fit = (height, width)
return best_fit
def get_patch_output_size(image, target_resolution, input_data_format):
"""
Given an image and a target resolution, calculate the output size of the image after scaling it to fit within the target resolution while preserving the aspect ratio.
"""
original_height, original_width = get_image_size(image, channel_dim=input_data_format)
target_height, target_width = target_resolution
scale_w = target_width / original_width
scale_h = target_height / original_height
if scale_w < scale_h:
new_width = target_width
new_height = min(math.ceil(original_height * scale_w), target_height)
else:
new_height = target_height
new_width = min(math.ceil(original_width * scale_h), target_width)
return new_height, new_width
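A worked example of the helper above, assuming it is exposed from `transformers.image_processing_utils` as added here:

import numpy as np
from transformers.image_processing_utils import get_patch_output_size
from transformers.image_utils import ChannelDimension

image = np.zeros((3, 300, 500), dtype=np.uint8)   # channels-first, height=300, width=500
# scale_w = 400 / 500 = 0.8, scale_h = 200 / 300 ≈ 0.667 -> height is the limiting edge
get_patch_output_size(image, target_resolution=(200, 400), input_data_format=ChannelDimension.FIRST)
# -> (200, 334)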

View File

@ -13,94 +13,63 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import functools
from dataclasses import dataclass
from typing import Any, Iterable, List, Optional, Tuple
from functools import lru_cache, partial
from typing import Any, Dict, Iterable, List, Optional, Tuple, TypedDict, Union
from .image_processing_utils import BaseImageProcessor
from .utils.import_utils import is_torch_available, is_torchvision_available
import numpy as np
from .image_processing_utils import (
BaseImageProcessor,
BatchFeature,
get_size_dict,
)
from .image_transforms import (
convert_to_rgb,
get_resize_output_image_size,
get_size_with_aspect_ratio,
group_images_by_shape,
reorder_images,
)
from .image_utils import (
ChannelDimension,
ImageInput,
ImageType,
SizeDict,
get_image_size,
get_image_size_for_max_height_width,
get_image_type,
infer_channel_dimension_format,
make_flat_list_of_images,
validate_fast_preprocess_arguments,
validate_kwargs,
)
from .processing_utils import Unpack
from .utils import (
TensorType,
add_start_docstrings,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
is_vision_available,
logging,
)
if is_torchvision_available():
from torchvision.transforms import Compose
if is_vision_available():
from .image_utils import PILImageResampling
if is_torch_available():
import torch
if is_torchvision_available():
from .image_utils import pil_torch_interpolation_mapping
@dataclass(frozen=True)
class SizeDict:
"""
Hashable dictionary to store image size information.
"""
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
else:
from torchvision.transforms import functional as F
height: int = None
width: int = None
longest_edge: int = None
shortest_edge: int = None
max_height: int = None
max_width: int = None
def __getitem__(self, key):
if hasattr(self, key):
return getattr(self, key)
raise KeyError(f"Key {key} not found in SizeDict.")
class BaseImageProcessorFast(BaseImageProcessor):
_transform_params = None
def _build_transforms(self, **kwargs) -> "Compose":
"""
Given the input settings e.g. do_resize, build the image transforms.
"""
raise NotImplementedError
def _validate_params(self, **kwargs) -> None:
for k, v in kwargs.items():
if k not in self._transform_params:
raise ValueError(f"Invalid transform parameter {k}={v}.")
@functools.lru_cache(maxsize=1)
def get_transforms(self, **kwargs) -> "Compose":
self._validate_params(**kwargs)
return self._build_transforms(**kwargs)
def to_dict(self):
encoder_dict = super().to_dict()
encoder_dict.pop("_transform_params", None)
return encoder_dict
def get_image_size_for_max_height_width(
image_size: Tuple[int, int],
max_height: int,
max_width: int,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image and the maximum allowed height and width, keeping the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of its edges is equal to max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
image_size (`Tuple[int, int]`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
"""
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
logger = logging.get_logger(__name__)
def safe_squeeze(tensor: "torch.Tensor", axis: Optional[int] = None) -> "torch.Tensor":
@ -131,3 +100,606 @@ def get_max_height_width(images: List["torch.Tensor"]) -> Tuple[int]:
_, max_height, max_width = max_across_indices([img.shape for img in images])
return (max_height, max_width)
def divide_to_patches(
image: Union[np.array, "torch.Tensor"], patch_size: int
) -> List[Union[np.array, "torch.Tensor"]]:
"""
Divides an image into patches of a specified size.
Args:
image (`Union[np.array, "torch.Tensor"]`):
The input image.
patch_size (`int`):
The size of each patch.
Returns:
list: A list of Union[np.array, "torch.Tensor"] representing the patches.
"""
patches = []
height, width = get_image_size(image, channel_dim=ChannelDimension.FIRST)
for i in range(0, height, patch_size):
for j in range(0, width, patch_size):
patch = image[:, i : i + patch_size, j : j + patch_size]
patches.append(patch)
return patches
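A small sketch of `divide_to_patches` on a channels-first tensor, assuming it is importable from the module shown in this diff (values are illustrative):

import torch
from transformers.image_processing_utils_fast import divide_to_patches

image = torch.arange(2 * 4 * 4).reshape(2, 4, 4)   # (channels, height, width)
patches = divide_to_patches(image, patch_size=2)
# 4 patches of shape (2, 2, 2), scanned top-to-bottom, left-to-right
assert len(patches) == 4 and patches[0].shape == (2, 2, 2)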
class DefaultFastImageProcessorInitKwargs(TypedDict, total=False):
do_resize: Optional[bool]
size: Optional[Dict[str, int]]
default_to_square: Optional[bool]
resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]]
do_center_crop: Optional[bool]
crop_size: Optional[Dict[str, int]]
do_rescale: Optional[bool]
rescale_factor: Optional[Union[int, float]]
do_normalize: Optional[bool]
image_mean: Optional[Union[float, List[float]]]
image_std: Optional[Union[float, List[float]]]
do_convert_rgb: Optional[bool]
class DefaultFastImageProcessorPreprocessKwargs(DefaultFastImageProcessorInitKwargs):
return_tensors: Optional[Union[str, TensorType]]
data_format: Optional[ChannelDimension]
input_data_format: Optional[Union[str, ChannelDimension]]
device: Optional["torch.device"]
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING = r"""
Args:
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
`do_resize` parameter in the `preprocess` method.
size (`dict`, *optional*, defaults to `self.size`):
Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
method.
default_to_square (`bool`, *optional*, defaults to `self.default_to_square`):
Whether to default to a square image when resizing, if size is an int.
resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
overridden by the `resample` parameter in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
`preprocess` method.
crop_size (`Dict[str, int]` *optional*, defaults to `self.crop_size`):
Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
method.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
`do_rescale` parameter in the `preprocess` method.
rescale_factor (`int` or `float`, *optional*, defaults to `self.rescale_factor`):
Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
overridden by the `rescale_factor` parameter in the `preprocess` method.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
method.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB."""
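For orientation, a subclass of `BaseImageProcessorFast` typically only overrides these class-level defaults; a minimal illustrative sketch (the class name and chosen values are not part of the diff):

from transformers.image_processing_utils_fast import BaseImageProcessorFast
from transformers.image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling

class MyImageProcessorFast(BaseImageProcessorFast):
    resample = PILImageResampling.BILINEAR
    image_mean = IMAGENET_STANDARD_MEAN
    image_std = IMAGENET_STANDARD_STD
    size = {"height": 224, "width": 224}
    do_resize = True
    do_rescale = True
    do_normalize = True
    do_convert_rgb = True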
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS = r"""
Preprocess an image or batch of images.
Args:
images (`ImageInput`):
Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
passing in images with pixel values between 0 and 1, set `do_rescale=False`.
do_resize (`bool`, *optional*, defaults to `self.do_resize`):
Whether to resize the image.
size (`Dict[str, int]`, *optional*, defaults to `self.size`):
Describes the maximum input dimensions to the model.
resample (`PILImageResampling` or `InterpolationMode`, *optional*, defaults to `self.resample`):
Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
has an effect if `do_resize` is set to `True`.
do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
Whether to center crop the image.
crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
Size of the output image after applying `center_crop`.
do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
Whether to rescale the image.
rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
Rescale factor to rescale the image by if `do_rescale` is set to `True`.
do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
Whether to normalize the image.
image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
`True`.
do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
Whether to convert the image to RGB.
return_tensors (`str` or `TensorType`, *optional*):
Returns stacked tensors if set to `pt`, otherwise returns a list of tensors.
data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
The channel dimension format for the output image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- Unset: Use the channel dimension format of the input image.
input_data_format (`ChannelDimension` or `str`, *optional*):
The channel dimension format for the input image. If unset, the channel dimension format is inferred
from the input image. Can be one of:
- `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
- `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
device (`torch.device`, *optional*):
The device to process the images on. If unset, the device is inferred from the input images."""
@add_start_docstrings(
"Constructs a fast base image processor.",
BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,
)
class BaseImageProcessorFast(BaseImageProcessor):
resample = None
image_mean = None
image_std = None
size = None
default_to_square = True
crop_size = None
do_resize = None
do_center_crop = None
do_rescale = None
rescale_factor = 1 / 255
do_normalize = None
do_convert_rgb = None
model_input_names = ["pixel_values"]
valid_init_kwargs = DefaultFastImageProcessorInitKwargs
valid_preprocess_kwargs = DefaultFastImageProcessorPreprocessKwargs
def __init__(
self,
**kwargs: Unpack[DefaultFastImageProcessorInitKwargs],
) -> None:
super().__init__(**kwargs)
size = kwargs.pop("size", self.size)
self.size = (
get_size_dict(size=size, default_to_square=kwargs.pop("default_to_square", self.default_to_square))
if size is not None
else None
)
crop_size = kwargs.pop("crop_size", self.crop_size)
self.crop_size = get_size_dict(crop_size, param_name="crop_size") if crop_size is not None else None
for key in self.valid_init_kwargs.__annotations__.keys():
kwarg = kwargs.pop(key, None)
if kwarg is not None:
setattr(self, key, kwarg)
else:
setattr(self, key, getattr(self, key, None))
def resize(
self,
image: "torch.Tensor",
size: SizeDict,
interpolation: "F.InterpolationMode" = None,
antialias: bool = True,
**kwargs,
) -> "torch.Tensor":
"""
Resize an image to `(size["height"], size["width"])`.
Args:
image (`torch.Tensor`):
Image to resize.
size (`SizeDict`):
Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
`InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`.
Returns:
`torch.Tensor`: The resized image.
"""
interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
if size.shortest_edge and size.longest_edge:
# Resize the image so that the shortest edge or the longest edge is of the given size
# while maintaining the aspect ratio of the original image.
new_size = get_size_with_aspect_ratio(
image.size()[-2:],
size.shortest_edge,
size.longest_edge,
)
elif size.shortest_edge:
new_size = get_resize_output_image_size(
image,
size=size.shortest_edge,
default_to_square=False,
input_data_format=ChannelDimension.FIRST,
)
elif size.max_height and size.max_width:
new_size = get_image_size_for_max_height_width(image.size()[-2:], size.max_height, size.max_width)
elif size.height and size.width:
new_size = (size.height, size.width)
else:
raise ValueError(
"Size must contain 'height' and 'width' keys, or 'max_height' and 'max_width', or 'shortest_edge' key. Got"
f" {size}."
)
return F.resize(image, new_size, interpolation=interpolation, antialias=antialias)
def rescale(
self,
image: "torch.Tensor",
scale: float,
**kwargs,
) -> "torch.Tensor":
"""
Rescale an image by a scale factor. image = image * scale.
Args:
image (`torch.Tensor`):
Image to rescale.
scale (`float`):
The scaling factor to rescale pixel values by.
Returns:
`torch.Tensor`: The rescaled image.
"""
return image * scale
def normalize(
self,
image: "torch.Tensor",
mean: Union[float, Iterable[float]],
std: Union[float, Iterable[float]],
**kwargs,
) -> "torch.Tensor":
"""
Normalize an image. image = (image - image_mean) / image_std.
Args:
image (`torch.Tensor`):
Image to normalize.
mean (`torch.Tensor`, `float` or `Iterable[float]`):
Image mean to use for normalization.
std (`torch.Tensor`, `float` or `Iterable[float]`):
Image standard deviation to use for normalization.
Returns:
`torch.Tensor`: The normalized image.
"""
return F.normalize(image, mean, std)
def rescale_and_normalize(
self,
images: "torch.Tensor",
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
image_mean: Union[float, List[float]],
image_std: Union[float, List[float]],
) -> "torch.Tensor":
"""
Rescale and normalize images.
"""
if do_rescale and do_normalize:
images = self.normalize(images.to(dtype=torch.float32), image_mean, image_std)
elif do_rescale:
images = images * rescale_factor
elif do_normalize:
images = self.normalize(images, image_mean, image_std)
return images
def center_crop(
self,
image: "torch.Tensor",
size: Dict[str, int],
**kwargs,
) -> "torch.Tensor":
"""
Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along
any edge, the image is padded with 0's and then center cropped.
Args:
image (`"torch.Tensor"`):
Image to center crop.
size (`Dict[str, int]`):
Size of the output image.
Returns:
`torch.Tensor`: The center cropped image.
"""
if size.height is None or size.width is None:
raise ValueError(f"The size dictionary must have keys 'height' and 'width'. Got {size.keys()}")
return F.center_crop(image, (size["height"], size["width"]))
def convert_to_rgb(
self,
image: ImageInput,
) -> ImageInput:
"""
Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image
as is.
Args:
image (ImageInput):
The image to convert.
Returns:
ImageInput: The converted image.
"""
return convert_to_rgb(image)
def _prepare_images_structure(
self,
images: ImageInput,
) -> ImageInput:
"""
Prepare the images structure for processing.
Args:
images (`ImageInput`):
The input images to process.
Returns:
`ImageInput`: The images with a valid nesting.
"""
return make_flat_list_of_images(images)
def _process_image(
self,
image: ImageInput,
do_convert_rgb: Optional[bool] = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
device: Optional["torch.device"] = None,
) -> "torch.Tensor":
image_type = get_image_type(image)
if image_type not in [ImageType.PIL, ImageType.TORCH, ImageType.NUMPY]:
raise ValueError(f"Unsupported input image type {image_type}")
if do_convert_rgb:
image = self.convert_to_rgb(image)
if image_type == ImageType.PIL:
image = F.pil_to_tensor(image)
elif image_type == ImageType.NUMPY:
# not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
image = torch.from_numpy(image).contiguous()
# Infer the channel dimension format if not provided
if input_data_format is None:
input_data_format = infer_channel_dimension_format(image)
if input_data_format == ChannelDimension.LAST:
# We force the channel dimension to be first for torch tensors as this is what torchvision expects.
image = image.permute(2, 0, 1).contiguous()
# Now that we have torch tensors, we can move them to the right device
if device is not None:
image = image.to(device)
return image
def _prepare_input_images(
self,
images: ImageInput,
do_convert_rgb: bool = None,
input_data_format: Optional[Union[str, ChannelDimension]] = None,
device: Optional["torch.device"] = None,
) -> List["torch.Tensor"]:
"""
Prepare the input images for processing.
"""
images = self._prepare_images_structure(images)
process_image_fn = partial(
self._process_image,
do_convert_rgb=do_convert_rgb,
input_data_format=input_data_format,
device=device,
)
# todo: yoni - check if we can parallelize this efficiently
processed_images = []
for image in images:
processed_images.append(process_image_fn(image))
return processed_images
@lru_cache(maxsize=10)
def _prepare_process_arguments(
self,
do_resize: bool = None,
size: Dict[str, int] = None,
resample: Optional[Union["PILImageResampling", "F.InterpolationMode"]] = None,
do_center_crop: bool = None,
crop_size: int = None,
do_rescale: bool = None,
rescale_factor: float = None,
do_normalize: bool = None,
image_mean: Optional[Union[float, List[float]]] = None,
image_std: Optional[Union[float, List[float]]] = None,
return_tensors: Optional[Union[str, TensorType]] = None,
data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
device: Optional["torch.device"] = None,
) -> tuple:
"""
Prepare the arguments for the process method.
"""
validate_fast_preprocess_arguments(
do_rescale=do_rescale,
rescale_factor=rescale_factor,
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_resize=do_resize,
size=size,
do_center_crop=do_center_crop,
crop_size=crop_size,
resample=resample,
return_tensors=return_tensors,
data_format=data_format,
)
if do_rescale and do_normalize:
# Fused rescale and normalize
image_mean = torch.tensor(image_mean, device=device) * (1.0 / rescale_factor)
image_std = torch.tensor(image_std, device=device) * (1.0 / rescale_factor)
interpolation = (
pil_torch_interpolation_mapping[resample] if isinstance(resample, (PILImageResampling, int)) else resample
)
return image_mean, image_std, interpolation
@add_start_docstrings(BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS)
def preprocess(
self,
images: ImageInput,
**kwargs: Unpack[DefaultFastImageProcessorPreprocessKwargs],
) -> BatchFeature:
validate_kwargs(
captured_kwargs=kwargs.keys(), valid_processor_keys=self.valid_preprocess_kwargs.__annotations__.keys()
)
# Set default kwargs from self. This ensures that if a kwarg is not provided
# by the user, it gets its default value from the instance, or is set to None.
for kwarg_name in self.valid_preprocess_kwargs.__annotations__:
kwargs.setdefault(kwarg_name, getattr(self, kwarg_name, None))
# Extract parameters that are only used for preparing the input images
do_convert_rgb = kwargs.pop("do_convert_rgb")
input_data_format = kwargs.pop("input_data_format")
device = kwargs.pop("device")
images = self._prepare_input_images(
images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
)
# Pop kwargs that need further processing or won't be used in _preprocess
default_to_square = kwargs.pop("default_to_square")
size = kwargs.pop("size")
crop_size = kwargs.pop("crop_size")
image_mean = kwargs.pop("image_mean")
image_std = kwargs.pop("image_std")
data_format = kwargs.pop("data_format")
resample = kwargs.pop("resample")
# Make hashable for cache
size = SizeDict(**get_size_dict(size=size, default_to_square=default_to_square)) if size is not None else None
crop_size = SizeDict(**get_size_dict(crop_size, param_name="crop_size")) if crop_size is not None else None
image_mean = tuple(image_mean) if isinstance(image_mean, list) else image_mean
image_std = tuple(image_std) if isinstance(image_std, list) else image_std
image_mean, image_std, interpolation = self._prepare_process_arguments(
size=size,
crop_size=crop_size,
resample=resample,
image_mean=image_mean,
image_std=image_std,
data_format=data_format if data_format is not None else ChannelDimension.FIRST,
device=images[0].device,
do_resize=kwargs.get("do_resize"),
do_center_crop=kwargs.get("do_center_crop"),
do_rescale=kwargs.get("do_rescale"),
rescale_factor=kwargs.get("rescale_factor"),
do_normalize=kwargs.get("do_normalize"),
return_tensors=kwargs.get("return_tensors"),
)
return self._preprocess(
images=images,
size=size,
crop_size=crop_size,
interpolation=interpolation,
image_mean=image_mean,
image_std=image_std,
**kwargs,
)
def _preprocess(
self,
images: List["torch.Tensor"],
do_resize: bool,
size: SizeDict,
interpolation: Optional["F.InterpolationMode"],
do_center_crop: bool,
crop_size: SizeDict,
do_rescale: bool,
rescale_factor: float,
do_normalize: bool,
image_mean: Optional[Union[float, List[float]]],
image_std: Optional[Union[float, List[float]]],
return_tensors: Optional[Union[str, TensorType]],
) -> BatchFeature:
# Group images by size for batched resizing
grouped_images, grouped_images_index = group_images_by_shape(images)
resized_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_resize:
stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
resized_images_grouped[shape] = stacked_images
resized_images = reorder_images(resized_images_grouped, grouped_images_index)
# Group images by size for further processing
# Needed in case do_resize is False, or resize returns images with different sizes
grouped_images, grouped_images_index = group_images_by_shape(resized_images)
processed_images_grouped = {}
for shape, stacked_images in grouped_images.items():
if do_center_crop:
stacked_images = self.center_crop(stacked_images, crop_size)
# Fused rescale and normalize
stacked_images = self.rescale_and_normalize(
stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
)
processed_images_grouped[shape] = stacked_images
processed_images = reorder_images(processed_images_grouped, grouped_images_index)
processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
def to_dict(self):
encoder_dict = super().to_dict()
encoder_dict.pop("_valid_processor_keys", None)
return encoder_dict
class SemanticSegmentationMixin:
def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None):
"""
Converts the output of [`MobileNetV2ForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
Args:
outputs ([`MobileNetV2ForSemanticSegmentation`]):
Raw outputs of the model.
target_sizes (`List[Tuple]` of length `batch_size`, *optional*):
List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
predictions will not be resized.
Returns:
semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic
segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
specified). Each entry of each `torch.Tensor` corresponds to a semantic class id.
"""
logits = outputs.logits
# Resize logits and compute semantic segmentation maps
if target_sizes is not None:
if len(logits) != len(target_sizes):
raise ValueError(
"Make sure that you pass in as many target sizes as the batch dimension of the logits"
)
# if is_torch_tensor(target_sizes):
# target_sizes = target_sizes.numpy()
semantic_segmentation = []
for idx in range(len(logits)):
resized_logits = torch.nn.functional.interpolate(
logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
)
semantic_map = resized_logits[0].argmax(dim=0)
semantic_segmentation.append(semantic_map)
else:
semantic_segmentation = logits.argmax(dim=1)
semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
return semantic_segmentation
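A hedged usage sketch of the mixin above; the checkpoint, image path, and model class are illustrative placeholders, and any semantic-segmentation model exposing `logits` would work the same way:
import torch
from PIL import Image
from transformers import AutoImageProcessor, MobileNetV2ForSemanticSegmentation

checkpoint = "google/deeplabv3_mobilenet_v2_1.0_513"  # illustrative checkpoint
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = MobileNetV2ForSemanticSegmentation.from_pretrained(checkpoint)

image = Image.open("example.jpg")  # placeholder path
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# resize predictions back to the original (height, width) and take the per-pixel class id
maps = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])
print(maps[0].shape)  # (height, width) tensor of class ids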

View File

@ -15,7 +15,7 @@
import warnings
from math import ceil
from typing import Iterable, List, Optional, Sequence, Tuple, Union
from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
import numpy as np
@ -31,8 +31,6 @@ from .utils.import_utils import (
is_flax_available,
is_tf_available,
is_torch_available,
is_torchvision_available,
is_torchvision_v2_available,
is_vision_available,
requires_backends,
)
@ -52,11 +50,6 @@ if is_tf_available():
if is_flax_available():
import jax.numpy as jnp
if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.transforms import functional as F
def to_channel_dimension_format(
image: np.ndarray,
@ -216,6 +209,45 @@ def to_pil_image(
return PIL.Image.fromarray(image, mode=image_mode)
def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the desired output size.
Args:
image_size (`Tuple[int, int]`):
The input image size.
size (`int`):
The desired output size.
max_size (`int`, *optional*):
The maximum allowed output size.
"""
height, width = image_size
raw_size = None
if max_size is not None:
min_original_size = float(min((height, width)))
max_original_size = float(max((height, width)))
if max_original_size / min_original_size * size > max_size:
raw_size = max_size * min_original_size / max_original_size
size = int(round(raw_size))
if (height <= width and height == size) or (width <= height and width == size):
oh, ow = height, width
elif width < height:
ow = size
if max_size is not None and raw_size is not None:
oh = int(raw_size * height / width)
else:
oh = int(size * height / width)
else:
oh = size
if max_size is not None and raw_size is not None:
ow = int(raw_size * width / height)
else:
ow = int(size * width / height)
return (oh, ow)
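A small worked check of the helper above (numbers are illustrative): for a 480x640 image with size=800 and max_size=1000, the scaled long edge would be 640 / 480 * 800 ≈ 1067 > 1000, so the short edge is clamped to 1000 * 480 / 640 = 750 and the result is (750, 1000).
# illustrative check of the max_size clamping described above
oh, ow = get_size_with_aspect_ratio(image_size=(480, 640), size=800, max_size=1000)
assert (oh, ow) == (750, 1000)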
# Logic adapted from torchvision resizing logic: https://github.com/pytorch/vision/blob/511924c1ced4ce0461197e5caa64ce5b9e558aab/torchvision/transforms/functional.py#L366
def get_resize_output_image_size(
input_image: np.ndarray,
@ -821,32 +853,37 @@ def _cast_tensor_to_float(x):
return x.float()
class FusedRescaleNormalize:
def group_images_by_shape(
images: List["torch.Tensor"],
) -> Tuple[Dict[Tuple[int, int], List["torch.Tensor"]], Dict[int, Tuple[Tuple[int, int], int]]]:
"""
Rescale and normalize the input image in one step.
Groups images by shape.
Returns a dictionary mapping each shape to a stacked tensor of images with that shape,
and a dictionary mapping each original image index to its (shape, index within the stacked group).
"""
def __init__(self, mean, std, rescale_factor: float = 1.0, inplace: bool = False):
self.mean = torch.tensor(mean) * (1.0 / rescale_factor)
self.std = torch.tensor(std) * (1.0 / rescale_factor)
self.inplace = inplace
def __call__(self, image: "torch.Tensor"):
image = _cast_tensor_to_float(image)
return F.normalize(image, self.mean, self.std, inplace=self.inplace)
grouped_images = {}
grouped_images_index = {}
for i, image in enumerate(images):
shape = image.shape[1:]
if shape not in grouped_images:
grouped_images[shape] = []
grouped_images[shape].append(image)
grouped_images_index[i] = (shape, len(grouped_images[shape]) - 1)
# stack images with the same shape
grouped_images = {shape: torch.stack(images, dim=0) for shape, images in grouped_images.items()}
return grouped_images, grouped_images_index
class Rescale:
def reorder_images(
processed_images: Dict[Tuple[int, int], "torch.Tensor"], grouped_images_index: Dict[int, Tuple[int, int]]
) -> List["torch.Tensor"]:
"""
Rescale the input image by rescale factor: image *= rescale_factor.
Reconstructs a list of images in the original order.
"""
def __init__(self, rescale_factor: float = 1.0):
self.rescale_factor = rescale_factor
def __call__(self, image: "torch.Tensor"):
image = image * self.rescale_factor
return image
return [
processed_images[grouped_images_index[i][0]][grouped_images_index[i][1]]
for i in range(len(grouped_images_index))
]
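A minimal sketch of how the two helpers above compose (tensor shapes are illustrative): images sharing a (height, width) are stacked so that resize/normalize can run batched, and the results are then put back in the original order.
import torch

images = [torch.rand(3, 224, 224), torch.rand(3, 336, 336), torch.rand(3, 224, 224)]

grouped, index = group_images_by_shape(images)
# e.g. {(224, 224): tensor of shape (2, 3, 224, 224), (336, 336): tensor of shape (1, 3, 336, 336)}

# apply any batched transform per group; a no-op here for illustration
processed = {shape: batch for shape, batch in grouped.items()}

restored = reorder_images(processed, index)
assert all(r.shape == i.shape for r, i in zip(restored, images))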
class NumpyToTensor:

View File

@ -16,6 +16,7 @@
import base64
import os
from contextlib import redirect_stdout
from dataclasses import dataclass
from io import BytesIO
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
@ -158,6 +159,10 @@ def is_valid_image(img):
return is_pil_image(img) or is_numpy_array(img) or is_torch_tensor(img) or is_tf_tensor(img) or is_jax_tensor(img)
def is_valid_list_of_images(images: List):
return images and all(is_valid_image(image) for image in images)
def valid_images(imgs):
# If we have a list of images, make sure every image is valid
if isinstance(imgs, (list, tuple)):
@ -189,7 +194,7 @@ def is_scaled_image(image: np.ndarray) -> bool:
def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
"""
Ensure that the input is a list of images. If the input is a single image, it is converted to a list of length 1.
Ensure that the output is a list of images. If the input is a single image, it is converted to a list of length 1.
If the input is a batch of images, it is converted to a list of images.
Args:
@ -203,7 +208,7 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
return images
# Either the input is a single image, in which case we create a list of length 1
if isinstance(images, PIL.Image.Image):
if is_pil_image(images):
# PIL images are never batched
return [images]
@ -226,6 +231,108 @@ def make_list_of_images(images, expected_ndims: int = 3) -> List[ImageInput]:
)
def make_flat_list_of_images(
images: Union[List[ImageInput], ImageInput],
) -> ImageInput:
"""
Ensure that the output is a flat list of images. If the input is a single image, it is converted to a list of length 1.
If the input is a nested list of images, it is converted to a flat list of images.
Args:
images (`Union[List[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images or a 4d array of images.
"""
# If the input is a nested list of images, we flatten it
if (
isinstance(images, (list, tuple))
and all(isinstance(images_i, (list, tuple)) for images_i in images)
and all(is_valid_list_of_images(images_i) for images_i in images)
):
return [img for img_list in images for img in img_list]
if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
if is_pil_image(images[0]) or images[0].ndim == 3:
return images
if images[0].ndim == 4:
return [img for img_list in images for img in img_list]
if is_valid_image(images):
if is_pil_image(images) or images.ndim == 3:
return [images]
if images.ndim == 4:
return list(images)
raise ValueError(f"Could not make a flat list of images from {images}")
def make_nested_list_of_images(
images: Union[List[ImageInput], ImageInput],
) -> ImageInput:
"""
Ensure that the output is a nested list of images.
Args:
images (`Union[List[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of list of images or a list of 4d array of images.
"""
# If it's a list of batches, it's already in the right format
if (
isinstance(images, (list, tuple))
and all(isinstance(images_i, (list, tuple)) for images_i in images)
and all(is_valid_list_of_images(images_i) for images_i in images)
):
return images
# If it's a list of images, it's a single batch, so convert it to a list of lists
if isinstance(images, (list, tuple)) and is_valid_list_of_images(images):
if is_pil_image(images[0]) or images[0].ndim == 3:
return [images]
if images[0].ndim == 4:
return [list(image) for image in images]
# If it's a single image, convert it to a list of lists
if is_valid_image(images):
if is_pil_image(images) or images.ndim == 3:
return [[images]]
if images.ndim == 4:
return [list(images)]
raise ValueError("Invalid input type. Must be a single image, a list of images, or a list of batches of images.")
def make_batched_videos(videos) -> VideoInput:
"""
Ensure that the input is a list of videos.
Args:
videos (`VideoInput`):
Video or videos to turn into a list of videos.
Returns:
list: A list of videos.
"""
if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
# case 1: nested batch of videos so we flatten it
if not is_pil_image(videos[0][0]) and videos[0][0].ndim == 4:
videos = [video for batch_list in videos for video in batch_list]
# case 2: list of videos represented as list of video frames
return videos
elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
if is_pil_image(videos[0]) or videos[0].ndim == 3:
return [videos]
elif videos[0].ndim == 4:
return [list(video) for video in videos]
elif is_valid_image(videos):
if is_pil_image(videos) or videos.ndim == 3:
return [[videos]]
elif videos.ndim == 4:
return [list(videos)]
raise ValueError(f"Could not make batched video from {videos}")
def to_numpy_array(img) -> np.ndarray:
if not is_valid_image(img):
raise ValueError(f"Invalid image type: {type(img)}")
@ -320,6 +427,37 @@ def get_image_size(image: np.ndarray, channel_dim: ChannelDimension = None) -> T
raise ValueError(f"Unsupported data format: {channel_dim}")
def get_image_size_for_max_height_width(
image_size: Tuple[int, int],
max_height: int,
max_width: int,
) -> Tuple[int, int]:
"""
Computes the output image size given the input image size and the maximum allowed height and width, keeping the aspect ratio.
Important: even if image_height < max_height and image_width < max_width, the image will be resized
so that at least one of its edges equals max_height or max_width.
For example:
- input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
- input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
Args:
image_size (`Tuple[int, int]`):
The image to resize.
max_height (`int`):
The maximum allowed height.
max_width (`int`):
The maximum allowed width.
"""
height, width = image_size
height_scale = max_height / height
width_scale = max_width / width
min_scale = min(height_scale, width_scale)
new_height = int(height * min_scale)
new_width = int(width * min_scale)
return new_height, new_width
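The docstring examples above can be checked directly (a sketch, assuming the function is in scope):
assert get_image_size_for_max_height_width((100, 200), max_height=50, max_width=50) == (25, 50)
assert get_image_size_for_max_height_width((100, 200), max_height=200, max_width=500) == (200, 400)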
def is_valid_annotation_coco_detection(annotation: Dict[str, Union[List, Tuple]]) -> bool:
if (
isinstance(annotation, dict)
@ -424,7 +562,7 @@ def get_uniform_frame_indices(total_num_frames: int, num_frames: Optional[int] =
return indices
def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
def read_video_opencv(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with open-cv decoder.
@ -432,13 +570,25 @@ def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps==None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
video = cv2.VideoCapture(video_path)
total_num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
video_fps = video.get(cv2.CAP_PROP_FPS)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)
index = 0
@ -457,7 +607,7 @@ def read_video_opencv(video_path: str, num_frames: Optional[int] = None):
return np.stack(frames)
def read_video_decord(video_path: str, num_frames: Optional[int] = None):
def read_video_decord(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with Decord decoder.
@ -465,18 +615,31 @@ def read_video_decord(video_path: str, num_frames: Optional[int] = None):
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps==None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
vr = VideoReader(uri=video_path, ctx=cpu(0)) # decord has problems with gpu
indices = get_uniform_frame_indices(total_num_frames=len(vr), num_frames=num_frames)
video_fps = vr.get_avg_fps()
total_num_frames = len(vr)
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames=total_num_frames, num_frames=num_frames)
frames = vr.get_batch(indices).asnumpy()
return frames
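The fps-based sampling added in these decoders boils down to one conversion (illustrative numbers): a 10-second clip decoded at 30 fps has 300 frames, so requesting fps=2 samples 300 / 30 * 2 = 20 uniformly spaced frames.
total_num_frames, video_fps, requested_fps = 300, 30.0, 2
num_frames = int(total_num_frames / video_fps * requested_fps)
assert num_frames == 20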
def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
def read_video_pyav(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with PyAV decoder.
@ -484,15 +647,26 @@ def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps==None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
"""
container = av.open(video_path)
# sample uniformly "num_frames" frames from the video
total_num_frames = container.streams.video[0].frames
video_fps = container.streams.video[0].average_rate # should we better use `av_guess_frame_rate`?
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
indices = get_uniform_frame_indices(total_num_frames, num_frames=num_frames)
frames = []
@ -506,7 +680,7 @@ def read_video_pyav(video_path: str, num_frames: Optional[int] = None):
return np.stack([x.to_ndarray(format="rgb24") for x in frames])
def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
def read_video_torchvision(video_path: str, num_frames: Optional[int] = None, fps: Optional[int] = None):
"""
Decode the video with torchvision decoder.
@ -514,7 +688,11 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
video_path (`str`):
Path to the video file.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not specified, all frames are sampled.
Number of frames to sample uniformly. Should be passed only when `fps=None`.
If not specified and `fps==None`, all frames are sampled.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
Returns:
np.ndarray: np array of decoded frames of shape (num_frames, height, width, 3).
@ -526,6 +704,15 @@ def read_video_torchvision(video_path: str, num_frames: Optional[int] = None):
pts_unit="sec",
output_format="TCHW",
)
video_fps = info["video_fps"]
total_num_frames = video.size(0) - 1
if num_frames is None and fps is not None:
num_frames = int(total_num_frames / video_fps * fps)
if num_frames > total_num_frames:
raise ValueError(
f"When loading the video with fps={fps}, we identified that num_frames ({num_frames}) > total_frames ({total_num_frames}) ."
f"Make sure that fps of a video is less than the requested fps for loading. Detected video_fps={video_fps}"
)
if num_frames is not None:
idx = torch.linspace(0, video.size(0) - 1, num_frames, dtype=torch.int64)
@ -542,7 +729,12 @@ VIDEO_DECODERS = {
}
def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None, backend: str = "opencv") -> np.array:
def load_video(
video: Union[str, "VideoInput"],
num_frames: Optional[int] = None,
fps: Optional[int] = None,
backend: str = "opencv",
) -> np.array:
"""
Loads `video` to a numpy array.
@ -551,12 +743,19 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
The video to convert to the numpy array format. Can be a link to video or local path.
num_frames (`int`, *optional*):
Number of frames to sample uniformly. If not passed, the whole video is loaded.
fps (`int`, *optional*):
Number of frames to sample per second. Should be passed only when `num_frames=None`.
If not specified and `num_frames==None`, all frames are sampled.
backend (`str`, *optional*, defaults to `"opencv"`):
The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv".
Returns:
`np.array`: A numpy array of shape (num_frames, channels, height, width).
"""
if fps is not None and num_frames is not None:
raise ValueError("`num_frames` and `fps` are mutually exclusive arguments, please use only one!")
if video.startswith("https://www.youtube.com") or video.startswith("http://www.youtube.com"):
if not is_yt_dlp_available():
raise ImportError("To load a video from YouTube url you have to install `yt_dlp` first.")
@ -597,7 +796,7 @@ def load_video(video: Union[str, "VideoInput"], num_frames: Optional[int] = None
)
video_decoder = VIDEO_DECODERS[backend]
video = video_decoder(file_obj)
video = video_decoder(file_obj, num_frames=num_frames, fps=fps)
return video
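A hedged usage sketch of the updated `load_video` entry point; the path is a placeholder and `num_frames`/`fps` are mutually exclusive:
from transformers.image_utils import load_video

# sample 16 uniformly spaced frames
frames = load_video("my_clip.mp4", num_frames=16, backend="opencv")

# or sample at roughly 1 frame per second instead
frames = load_video("my_clip.mp4", fps=1, backend="opencv")
print(frames.shape)  # numpy array of decoded frames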
@ -689,12 +888,16 @@ def validate_fast_preprocess_arguments(
do_normalize=do_normalize,
image_mean=image_mean,
image_std=image_std,
do_pad=do_pad,
size_divisibility=size_divisibility,
do_center_crop=do_center_crop,
crop_size=crop_size,
do_resize=do_resize,
size=size,
resample=resample,
)
# Extra checks for ImageProcessorFast
if return_tensors != "pt":
if return_tensors is not None and return_tensors != "pt":
raise ValueError("Only returning PyTorch tensors is currently supported.")
if data_format != ChannelDimension.FIRST:
@ -1084,3 +1287,22 @@ def validate_kwargs(valid_processor_keys: List[str], captured_kwargs: List[str])
unused_key_str = ", ".join(unused_keys)
# TODO raise a warning here instead of simply logging?
logger.warning(f"Unused or unrecognized kwargs: {unused_key_str}.")
@dataclass(frozen=True)
class SizeDict:
"""
Hashable dictionary to store image size information.
"""
height: int = None
width: int = None
longest_edge: int = None
shortest_edge: int = None
max_height: int = None
max_width: int = None
def __getitem__(self, key):
if hasattr(self, key):
return getattr(self, key)
raise KeyError(f"Key {key} not found in SizeDict.")
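Because `SizeDict` is a frozen dataclass it is hashable, which is what lets it pass through the `lru_cache`-decorated `_prepare_process_arguments` above while still supporting dict-style access. A quick sketch:
size = SizeDict(height=224, width=224)

assert size["height"] == 224                                  # dict-style access
assert hash(size) == hash(SizeDict(height=224, width=224))    # hashable, so usable as a cache key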

View File

@ -383,8 +383,8 @@ def deepspeed_init(trainer, num_training_steps, inference=False):
Returns: optimizer, lr_scheduler
We may use `deepspeed_init` more than once during the life of Trainer, when we do - it's a temp hack based on:
https://github.com/microsoft/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/microsoft/DeepSpeed/issues/1612
https://github.com/deepspeedai/DeepSpeed/issues/1394#issuecomment-937405374 until Deepspeed fixes a bug where it
can't resume from a checkpoint after it did some stepping https://github.com/deepspeedai/DeepSpeed/issues/1612
"""
from deepspeed.utils import logger as ds_logger

View File

@ -17,6 +17,7 @@ def flex_attention_forward(
attention_mask: Optional[torch.Tensor],
scaling: Optional[float] = None,
softcap: Optional[float] = None,
head_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> Tuple[torch.Tensor, torch.Tensor]:
causal_mask = attention_mask
@ -28,6 +29,8 @@ def flex_attention_forward(
score = softcap * torch.tanh(score / softcap)
if causal_mask is not None:
score = score + causal_mask[b][0][q_idx][kv_idx]
if head_mask is not None:
score = score + head_mask[b][h][0][0]
return score
attn_output, attention_weights = flex_attention(

View File

@ -0,0 +1,105 @@
from tokenizers import Regex, Tokenizer, decoders, pre_tokenizers, processors
from tokenizers.models import BPE
from transformers import LlamaTokenizerFast
from transformers.convert_slow_tokenizer import bytes_to_unicode
class MistralConverter:
"""
A general tiktoken converter.
"""
def __init__(
self,
vocab=None,
pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
add_prefix_space=False,
additional_special_tokens=None,
*args,
**kwargs,
):
super().__init__(*args)
self.vocab = vocab
self.pattern = pattern
self.add_prefix_space = add_prefix_space
self.additional_special_tokens = additional_special_tokens
def extract_vocab_merges_from_model(self, vocab: str):
bpe_ranks = vocab
byte_encoder = bytes_to_unicode()
def token_bytes_to_string(b):
return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])
merges = []
vocab = {}
for idx, (token, rank) in enumerate(bpe_ranks.items()):
if token not in self.additional_special_tokens:
vocab[token_bytes_to_string(token)] = idx
if len(token) == 1:
continue
local = []
for index in range(1, len(token)):
piece_l, piece_r = token[:index], token[index:]
if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks:
local.append((piece_l, piece_r, rank))
local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False)
merges.extend(local)
else:
vocab[token] = idx
merges = sorted(merges, key=lambda val: val[2], reverse=False)
merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges]
return vocab, merges
def tokenizer(self):
vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab)
tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False))
if hasattr(tokenizer.model, "ignore_merges"):
tokenizer.model.ignore_merges = True
return tokenizer
def converted(self) -> Tokenizer:
tokenizer = self.tokenizer()
tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
[
pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False),
pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False),
]
)
tokenizer.decoder = decoders.ByteLevel()
tokenizer.add_special_tokens(self.additional_special_tokens)
tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
return tokenizer
def convert_tekken_tokenizer(tokenizer_file: str):
"""Convert a "tekken" tokenizer to a fast Tokenizer."""
# Tekken format -- need to use the Converter
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
# Load directly using their lib
mistral_tokenizer = MistralTokenizer.from_file(tokenizer_file)
# Extract vocab and special tokens
vocab = mistral_tokenizer.instruct_tokenizer.tokenizer._tekken_token2id_nospecial
all_special = [
token.value if hasattr(token, "value") else token
for token in mistral_tokenizer.instruct_tokenizer.tokenizer._all_special_tokens
]
specials_tokens = {token: all_special.index(token) for token in all_special}
specials_tokens.update(vocab)
vocab = specials_tokens
# Convert
tokenizer = LlamaTokenizerFast(
tokenizer_object=MistralConverter(vocab=vocab, additional_special_tokens=all_special).converted(),
)
# Post-process
tokenizer.add_special_tokens({"additional_special_tokens": all_special})
return tokenizer
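A hedged usage sketch of the converter above; the file path is a placeholder and `mistral_common` must be installed:
# convert a Mistral "tekken.json" tokenizer into a Hugging Face fast tokenizer
tokenizer = convert_tekken_tokenizer("tekken.json")

ids = tokenizer("Hello world").input_ids
print(tokenizer.convert_ids_to_tokens(ids))

# the result can be saved and reloaded like any other fast tokenizer
tokenizer.save_pretrained("converted-tekken-tokenizer")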

View File

@ -45,6 +45,11 @@ def sdpa_attention_forward(
if is_causal is None:
is_causal = causal_mask is None and query.shape[2] > 1
# Shapes (e.g. query.shape[2]) are tensors during jit tracing, resulting in `is_causal` being a tensor.
# We convert it to a bool for the SDPA kernel that only accepts bools.
if torch.jit.is_tracing() and isinstance(is_causal, torch.Tensor):
is_causal = is_causal.item()
attn_output = torch.nn.functional.scaled_dot_product_attention(
query,
key,

View File

@ -36,11 +36,11 @@ at::Tensor ms_deform_attn_cuda_forward(
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@ -66,15 +66,15 @@ at::Tensor ms_deform_attn_cuda_forward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_forward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
columns.data<scalar_t>());
columns.data_ptr<scalar_t>());
}));
}
@ -103,12 +103,12 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@ -137,18 +137,18 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_backward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
grad_output_g.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
}));
}

View File

@ -42,11 +42,11 @@ at::Tensor ms_deform_attn_cuda_forward(
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@ -72,15 +72,15 @@ at::Tensor ms_deform_attn_cuda_forward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_forward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
columns.data<scalar_t>());
columns.data_ptr<scalar_t>());
}));
}
@ -108,12 +108,12 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
AT_ASSERTM(value.is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
@ -142,18 +142,18 @@ std::vector<at::Tensor> ms_deform_attn_cuda_backward(
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.type(), "ms_deform_attn_backward_cuda", ([&] {
AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, value.scalar_type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
grad_output_g.data_ptr<scalar_t>(),
value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data_ptr<int64_t>(),
level_start_index.data_ptr<int64_t>(),
sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
grad_value.data_ptr<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data_ptr<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data_ptr<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
}));
}
@ -398,7 +398,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -468,7 +468,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -573,7 +573,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -681,7 +681,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -786,7 +786,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -899,7 +899,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -1009,7 +1009,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;

View File

@ -258,7 +258,7 @@ __global__ void ms_deformable_im2col_gpu_kernel(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -328,7 +328,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(co
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -433,7 +433,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(co
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -541,7 +541,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -646,7 +646,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -759,7 +759,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;
@ -869,7 +869,7 @@ __global__ void ms_deformable_col2im_gpu_kernel_gm(const int n,
const int sampling_index = _temp;
const int m_col = _temp % num_heads;
_temp /= num_heads;
const int q_col = _temp % num_query;
[[maybe_unused]] const int q_col = _temp % num_query;
_temp /= num_query;
const int b_col = _temp;

View File

@ -26,7 +26,7 @@ ms_deform_attn_forward(
const at::Tensor &attn_weight,
const int im2col_step)
{
if (value.type().is_cuda())
if (value.is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_forward(
@ -48,7 +48,7 @@ ms_deform_attn_backward(
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
if (value.is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(

View File

@ -18,7 +18,6 @@ import torch.nn.functional as F
from ..utils import is_scipy_available, is_vision_available, requires_backends
from .loss_for_object_detection import (
_set_aux_loss,
box_iou,
dice_loss,
generalized_box_iou,
@ -35,6 +34,15 @@ if is_vision_available():
from transformers.image_transforms import center_to_corners_format
# different for RT-DETR: not slicing the last element like in DETR one
@torch.jit.unused
def _set_aux_loss(outputs_class, outputs_coord):
# this is a workaround to make torchscript happy, as torchscript
# doesn't support dictionary with non-homogeneous values, such
# as a dict having both a Tensor and a list.
return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)]
class RTDetrHungarianMatcher(nn.Module):
"""This class computes an assignment between the targets and the predictions of the network

View File

@ -128,7 +128,9 @@ LOSS_MAPPING = {
"ForObjectDetection": ForObjectDetectionLoss,
"DeformableDetrForObjectDetection": DeformableDetrForObjectDetectionLoss,
"ConditionalDetrForObjectDetection": DeformableDetrForObjectDetectionLoss,
"DabDetrForObjectDetection": DeformableDetrForObjectDetectionLoss,
"GroundingDinoForObjectDetection": DeformableDetrForObjectDetectionLoss,
"ConditionalDetrForSegmentation": DeformableDetrForSegmentationLoss,
"RTDetrForObjectDetection": RTDetrForObjectDetectionLoss,
"RTDetrV2ForObjectDetection": RTDetrForObjectDetectionLoss,
}

View File

@ -489,7 +489,7 @@ class TrainingSummary:
f" [{self.finetuned_from}](https://huggingface.co/{self.finetuned_from}) on "
)
if self.dataset is None:
if self.dataset is None or (isinstance(self.dataset, list) and len(self.dataset) == 0):
model_card += "an unknown dataset."
else:
if isinstance(self.dataset, str):

View File

@ -121,7 +121,7 @@ def _upad_input(
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, attention_mask)
return (
query_layer,
@ -209,7 +209,7 @@ def fa_peft_integration_check(
if target_dtype is None:
return query, key, value
input_dtype = value.dtype
input_dtype = query.dtype
if input_dtype == torch.float32:
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"

View File

@ -37,6 +37,7 @@ import torch
from huggingface_hub import split_torch_state_dict_into_shards
from packaging import version
from torch import Tensor, nn
from torch.distributions import constraints
from torch.nn import CrossEntropyLoss, Identity
from torch.utils.checkpoint import checkpoint
@ -245,6 +246,25 @@ def set_zero3_state():
_is_ds_init_called = False
def restore_default_torch_dtype(func):
"""
Decorator to restore the default torch dtype
at the end of the function. Serves
as a backup in case calling the function raises
an error after the function has changed the default dtype but before it could restore it.
"""
@wraps(func)
def _wrapper(*args, **kwargs):
old_dtype = torch.get_default_dtype()
try:
return func(*args, **kwargs)
finally:
torch.set_default_dtype(old_dtype)
return _wrapper
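A minimal sketch of what the decorator above guards against (the wrapped function is illustrative and the check assumes the process default dtype is still torch.float32): even if the wrapped call changes the default dtype and then raises, the previous default is restored.
import torch

@restore_default_torch_dtype  # assuming the decorator defined above is in scope
def build_in_half_precision():
    torch.set_default_dtype(torch.float16)
    raise RuntimeError("boom")

try:
    build_in_half_precision()
except RuntimeError:
    pass

assert torch.get_default_dtype() == torch.float32  # restored despite the error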
def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]):
try:
return next(parameter.parameters()).device
@ -1406,6 +1426,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
self.model_tags.append(tag)
@classmethod
@restore_default_torch_dtype
def _from_config(cls, config, **kwargs):
"""
All context managers that the model should be initialized under go here.
@ -2425,14 +2446,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
covariance = old_centered_embeddings.T @ old_centered_embeddings / old_num_tokens
# Check if the covariance is positive definite.
eigenvalues = torch.linalg.eigvals(covariance)
is_covariance_psd = bool(
(covariance == covariance.T).all() and not torch.is_complex(eigenvalues) and (eigenvalues > 0).all()
)
epsilon = 1e-9
is_covariance_psd = constraints.positive_definite.check(epsilon * covariance).all()
if is_covariance_psd:
# If the covariance is positive definite, a distribution can be created and we can sample new weights from it.
distribution = torch.distributions.multivariate_normal.MultivariateNormal(
mean_embeddings, covariance_matrix=1e-9 * covariance
mean_embeddings, covariance_matrix=epsilon * covariance
)
new_embeddings.weight.data[-1 * added_num_tokens :, :] = distribution.sample(
sample_shape=(added_num_tokens,)
@ -3143,6 +3162,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
return super().float(*args)
@classmethod
@restore_default_torch_dtype
def from_pretrained(
cls: Type[SpecificPreTrainedModelType],
pretrained_model_name_or_path: Optional[Union[str, os.PathLike]],
@ -3443,6 +3463,29 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# TODO: we can relax this check when we support taking tp_plan from a json file, for example.
raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.")
if tp_plan is not None and device_map is not None:
raise ValueError(
"`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization."
)
# We need to correctly dispatch the model on the current process device. The easiest way for this is to use a simple
# `device_map` pointing to the correct device. If we don't, torch will use the default device (index 0) for all
# child processes at parallelization time, resulting in excessive memory usage on device 0 and OOMs.
# And temporarily setting the default device to the current process rank results in the following error
# `torch.distributed.DistBackendError: Attempt to perform collective on tensor not on device passed to init_process_group`
tp_device = None
if tp_plan is not None:
if not torch.distributed.is_initialized():
raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")
# Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
device_type = torch._C._get_accelerator().type
device_module = torch.get_device_module(device_type)
# Get device with index assuming equal number of devices per host
tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
# This is the easiest way to dispatch to the current process device
device_map = tp_device
if is_fsdp_enabled():
low_cpu_mem_usage = True
@ -3612,7 +3655,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
model_kwargs = kwargs
pre_quantized = getattr(config, "quantization_config", None) is not None
pre_quantized = hasattr(config, "quantization_config")
if pre_quantized and not AutoHfQuantizer.supports_quant_method(config.quantization_config):
pre_quantized = False
if pre_quantized or quantization_config is not None:
if pre_quantized:
config.quantization_config = AutoHfQuantizer.merge_quantization_configs(
@ -3625,7 +3671,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
config.quantization_config,
pre_quantized=pre_quantized,
)
else:
hf_quantizer = None
@ -4090,7 +4135,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
# Instantiate model.
init_contexts = [no_init_weights(_enable=_fast_init)]
tp_device = None
if is_deepspeed_zero3_enabled() and not is_quantized and not _is_ds_init_called:
import deepspeed
@ -4106,16 +4150,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
f"Using `low_cpu_mem_usage=True` or a `device_map` requires Accelerate: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`"
)
init_contexts.append(init_empty_weights())
elif tp_plan is not None:
if not torch.distributed.is_initialized():
raise ValueError("Tensor Parallel requires torch.distributed to be initialized first.")
# Detect the accelerator on the machine. If no accelerator is available, it returns CPU.
device_type = torch._C._get_accelerator().type
device_module = torch.get_device_module(device_type)
# Get device with index assuming equal number of devices per host
tp_device = torch.device(device_type, torch.distributed.get_rank() % device_module.device_count())
init_contexts.append(tp_device)
if is_deepspeed_zero3_enabled() and is_quantized:
init_contexts.append(set_quantized_state())
@ -4249,38 +4283,32 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if dtype_orig is not None:
torch.set_default_dtype(dtype_orig)
load_contexts = []
# Make sure we load onto targeted device
if tp_device is not None:
load_contexts.append(tp_device)
with ContextManagers(load_contexts):
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
(
model,
missing_keys,
unexpected_keys,
mismatched_keys,
offload_index,
error_msgs,
) = cls._load_pretrained_model(
model,
state_dict,
loaded_state_dict_keys, # XXX: rename?
resolved_archive_file,
pretrained_model_name_or_path,
ignore_mismatched_sizes=ignore_mismatched_sizes,
sharded_metadata=sharded_metadata,
_fast_init=_fast_init,
low_cpu_mem_usage=low_cpu_mem_usage,
device_map=device_map,
offload_folder=offload_folder,
offload_state_dict=offload_state_dict,
dtype=torch_dtype,
hf_quantizer=hf_quantizer,
keep_in_fp32_modules=keep_in_fp32_modules,
gguf_path=gguf_path,
weights_only=weights_only,
)
# make sure token embedding weights are still tied if needed
model.tie_weights()
@ -5170,6 +5198,9 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
@property
def loss_function(self):
if hasattr(self, "_loss_function"):
return self._loss_function
loss_type = getattr(self, "loss_type", None)
if loss_type is None or loss_type not in LOSS_MAPPING:
@ -5180,6 +5211,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
loss_type = "ForCausalLM"
return LOSS_MAPPING[loss_type]
@loss_function.setter
def loss_function(self, value):
self._loss_function = value
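The new setter makes it possible to override a model's loss on a per-instance basis, e.g. for a custom training objective. A hedged sketch; the checkpoint and loss are illustrative and the signature simply mirrors how causal-LM losses are called with keyword arguments:
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("gpt2")  # illustrative checkpoint

def my_loss(logits, labels, vocab_size, **kwargs):
    # illustrative: plain cross-entropy over shifted tokens
    logits = logits[..., :-1, :].contiguous().view(-1, vocab_size)
    labels = labels[..., 1:].contiguous().view(-1)
    return torch.nn.functional.cross_entropy(logits, labels)

model.loss_function = my_loss  # stored on the instance via the new setter
assert model.loss_function is my_loss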
def get_compiled_call(self, compile_config: CompileConfig):
"""Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between
non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't

View File

@ -63,6 +63,7 @@ from . import (
cpmant,
ctrl,
cvt,
dab_detr,
dac,
data2vec,
dbrx,
@ -73,6 +74,7 @@ from . import (
deit,
deprecated,
depth_anything,
depth_pro,
detr,
dialogpt,
diffllama,
@ -106,6 +108,7 @@ from . import (
git,
glm,
glpn,
got_ocr2,
gpt2,
gpt_bigcode,
gpt_neo,
@ -231,6 +234,7 @@ from . import (
roc_bert,
roformer,
rt_detr,
rt_detr_v2,
rwkv,
sam,
seamless_m4t,

View File

@ -31,7 +31,7 @@ from ...image_utils import (
PILImageResampling,
get_image_size,
infer_channel_dimension_format,
is_valid_image,
make_flat_list_of_images,
to_numpy_array,
valid_images,
validate_preprocess_arguments,
@ -39,29 +39,6 @@ from ...image_utils import (
from ...utils import TensorType
def make_batched_images(images) -> List[List[ImageInput]]:
"""
Accepts images in list or nested list format, and makes a list of images for preprocessing.
Args:
images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`):
The input image.
Returns:
list: A list of images.
"""
if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]):
return [img for img_list in images for img in img_list]
elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
return images
elif is_valid_image(images):
return [images]
raise ValueError(f"Could not make batched video from {images}")
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
"""
Divides an image into patches of a specified size.
@ -133,6 +110,8 @@ class AriaImageProcessor(BaseImageProcessor):
The resampling filter to use if resizing the image.
"""
model_input_names = ["pixel_values", "pixel_mask", "num_crops"]
def __init__(
self,
image_mean: List[float] = None,
@ -244,7 +223,7 @@ class AriaImageProcessor(BaseImageProcessor):
if max_image_size not in [490, 980]:
raise ValueError("max_image_size must be either 490 or 980")
images = make_batched_images(images)
images = make_flat_list_of_images(images)
if not valid_images(images):
raise ValueError(

Some files were not shown because too many files have changed in this diff.