Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-04 20:14:36 +08:00)

Compare commits: 19 commits, add-spinqu ... tensor-cac
| SHA1 |
|---|
| 090d9c4b2a |
| 5ccb79c16d |
| 80b49d721b |
| dc1bd15ba9 |
| 338f5954b9 |
| 2f4e0bc93e |
| 485f959f85 |
| 2bbbbbcf97 |
| 85c71b004b |
| da60604f2c |
| 6e9799c817 |
| 4950a9e3f0 |
| b67b6eb9b2 |
| d269417aab |
| 95c1686ee0 |
| 8606594ad4 |
| 45bb39bb80 |
| a77a94b209 |
| d4b631edd0 |
.github/workflows/benchmark.yml (vendored): 3 changed lines
@@ -18,8 +18,7 @@ jobs:
name: Benchmark
strategy:
matrix:
# group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus] (A100 runner is not enabled)
group: [aws-g5-4xlarge-cache]
group: [aws-g5-4xlarge-cache, aws-p4d-24xlarge-plus]
runs-on:
group: ${{ matrix.group }}
if: |
.github/workflows/self-comment-ci.yml (vendored): 166 changed lines
@@ -30,7 +30,7 @@ jobs:
runs-on: ubuntu-22.04
name: Get PR number
# For security: only allow team members to run
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
outputs:
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
steps:

@@ -98,7 +98,6 @@ jobs:
if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}}
outputs:
models: ${{ steps.models_to_run.outputs.models }}
quantizations: ${{ steps.models_to_run.outputs.quantizations }}
steps:
- uses: actions/checkout@v4
with:

@@ -122,8 +121,6 @@ jobs:
python -m pip install GitPython
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt
echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt
echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV

- name: Show models to test
id: models_to_run

@@ -131,12 +128,10 @@ jobs:
echo "${{ env.models }}"
echo "models=${{ env.models }}" >> $GITHUB_ENV
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
echo "${{ env.quantizations }}"
echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT

reply_to_comment:
name: Reply to the comment
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-pr-number, get-tests]
permissions:
pull-requests: write

@@ -146,18 +141,17 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
MODELS: ${{ needs.get-tests.outputs.models }}
BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
run: |
gh api \
--method POST \
-H "Accept: application/vnd.github+json" \
-H "X-GitHub-Api-Version: 2022-11-28" \
repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..."
-f "body=This comment contains run-slow, running the specified jobs: ${{ env.MODELS }} ..."

create_run:
name: Create run
if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }}
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-sha, get-tests, reply_to_comment]
permissions:
statuses: write

@@ -179,20 +173,20 @@ jobs:
-f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests"

run_models_gpu:
name: Run all tests for the model
if: ${{ needs.get-tests.outputs.models != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.models) }}
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo input and matrix info
shell: bash
run: |

@@ -212,20 +206,20 @@ jobs:
- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H

- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers

@@ -285,106 +279,9 @@ jobs:
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports

run_quantization_torch_gpu:
name: Run all tests for a quantization
if: ${{ needs.get-tests.outputs.quantizations != '[]' }}
needs: [get-pr-number, get-sha, get-tests, create_run]
strategy:
fail-fast: false
matrix:
folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
image: huggingface/transformers-quantization-latest-gpu
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
steps:
- name: Echo folder ${{ matrix.folders }}
shell: bash
run: |
echo "${{ matrix.folders }}"
matrix_folders=${{ matrix.folders }}
matrix_folders=${matrix_folders/'quantization/'/'quantization_'}
echo "$matrix_folders"
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV

- name: Checkout to PR merge commit
working-directory: /transformers
run: |
git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge
git log -1 --format=%H

- name: Verify merge commit SHA
env:
VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }}
working-directory: /transformers
run: |
PR_MERGE_SHA=$(git log -1 --format=%H)
if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then
echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!";
exit -1;
fi

- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

- name: NVIDIA-SMI
run: |
nvidia-smi

- name: Set `machine_type` for report and artifact names
working-directory: /transformers
shell: bash
run: |
echo "${{ matrix.machine_type }}"
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
machine_type=single-gpu
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
machine_type=multi-gpu
else
machine_type=${{ matrix.machine_type }}
fi
echo "$machine_type"
echo "machine_type=$machine_type" >> $GITHUB_ENV

- name: Environment
working-directory: /transformers
run: |
python3 utils/print_env.py

- name: Show installed libraries and their versions
working-directory: /transformers
run: pip freeze

- name: Run quantization tests on GPU
working-directory: /transformers
run: |
python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}

- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt

- name: Make sure report directory exists
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports"

- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports

update_run_status:
name: Update Check Run Status
needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu]
needs: [get-sha, create_run, run_models_gpu]
permissions:
statuses: write
if: ${{ always() && needs.create_run.result == 'success' }}

@@ -392,17 +289,16 @@ jobs:
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}
STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }}
steps:
- name: Get `run_models_gpu` job status
run: |
echo "${{ needs.run_models_gpu.result }}"
echo "${{ needs.run_quantization_torch_gpu.result }}"
echo $STATUS_OK
if [ "$STATUS_OK" = "true" ]; then
if [ "${{ needs.run_models_gpu.result }}" = "cancelled" ]; then
echo "STATUS=failure" >> $GITHUB_ENV
elif [ "${{ needs.run_models_gpu.result }}" = "skipped" ]; then
echo "STATUS=success" >> $GITHUB_ENV
else
echo "STATUS=failure" >> $GITHUB_ENV
echo "STATUS=${{ needs.run_models_gpu.result }}" >> $GITHUB_ENV
fi

- name: Update PR commit statuses
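The gating in this workflow hinges on `utils/pr_slow_ci_models.py` turning a `run-slow` comment into a JSON list that feeds the job matrix. The sketch below only illustrates that idea with a hypothetical parser (function name and format are assumptions, not the actual script):

```python
import json
import re


def parse_run_slow_comment(comment):
    """Hypothetical sketch: extract model folders from a comment like 'run-slow: bert, vit'."""
    match = re.match(r"^\s*(?:run-slow|run slow|run_slow)\s*:?\s*(.*)$", comment, flags=re.IGNORECASE)
    if not match:
        return []
    names = [name.strip() for name in match.group(1).split(",") if name.strip()]
    return [f"models/{name}" for name in names]


if __name__ == "__main__":
    # The workflow would forward a JSON string like this via $GITHUB_OUTPUT to the matrix.
    print(json.dumps(parse_run_slow_comment("run-slow: bert, vit")))
```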
.github/workflows/self-scheduled.yml (vendored): 2 changed lines
@@ -366,7 +366,7 @@ jobs:
run: |
python3 -m pip uninstall -y deepspeed
rm -rf DeepSpeed
git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build
git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check

- name: NVIDIA-SMI
.github/workflows/trufflehog.yml (vendored): 2 changed lines
@@ -16,5 +16,3 @@ jobs:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main
with:
extra_args: --results=verified,unknown
@@ -283,7 +283,7 @@ If you'd like to play with the examples or need the bleeding edge of the code an
```
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .
pip install
```

### With conda
@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
ARG REF=main

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git cmake wget xz-utils build-essential g++5 libprotobuf-dev protobuf-compiler

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
USER root

@@ -1,4 +1,4 @@
FROM python:3.9-slim
FROM python:3.10-slim
ENV PYTHONDONTWRITEBYTECODE=1
ARG REF=main
RUN echo ${REF}
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.6.0'
ARG PYTORCH='2.5.1'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.

@@ -48,8 +48,8 @@ RUN python3 -m pip uninstall -y torch-tensorrt apex
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

RUN python3 -m pip install -U "itsdangerous<2.1.0"
@@ -1,4 +1,4 @@
FROM rocm/dev-ubuntu-22.04:6.2.4
FROM rocm/dev-ubuntu-22.04:6.3
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

@@ -8,9 +8,11 @@ RUN apt update && \
apt clean && \
rm -rf /var/lib/apt/lists/*

RUN export PATH="${PATH:+${PATH}:}~/opt/rocm/bin"

RUN python3 -m pip install --no-cache-dir --upgrade pip numpy

RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2
RUN python3 -m pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/

RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
@@ -1,11 +1,11 @@
FROM rocm/dev-ubuntu-22.04:6.2.4
FROM rocm/dev-ubuntu-22.04:5.6
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
ARG PYTORCH='2.5.1'
ARG TORCH_VISION='0.20.0'
ARG TORCH_AUDIO='2.5.0'
ARG ROCM='6.2'
ARG PYTORCH='2.1.1'
ARG TORCH_VISION='0.16.1'
ARG TORCH_AUDIO='2.1.1'
ARG ROCM='5.6'

RUN apt update && \
apt install -y --no-install-recommends \

@@ -45,4 +45,4 @@ RUN cd transformers && python3 setup.py develop
RUN python3 -c "from deepspeed.launcher.runner import main"

# Remove nvml as it is not compatible with ROCm
RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
RUN python3 -m pip uninstall py3nvml pynvml -y
@@ -1,5 +1,5 @@
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
FROM nvcr.io/nvidia/pytorch:23.11-py3
FROM nvcr.io/nvidia/pytorch:23.04-py3
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

@@ -34,8 +34,8 @@ RUN python3 -m pip uninstall -y torch-tensorrt apex
# Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
RUN python3 -m pip uninstall -y deepspeed
# This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
# Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
# RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
# Issue: https://github.com/microsoft/DeepSpeed/issues/2010
# RUN git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build && \
# DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

## For `torchdynamo` tests
@@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

# If set to nothing, will install the latest version
ARG PYTORCH='2.6.0'
ARG PYTORCH='2.5.1'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
@@ -130,6 +130,7 @@
| Notebook | Description | | |
|:----------|:-------------|:-------------|------:|
| [How to quantize a model with ONNX Runtime for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb) | Shows how to apply static and dynamic quantization to a model with [ONNX Runtime](https://github.com/microsoft/onnxruntime) for any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb) |
| [How to quantize a model with Intel Neural Compressor for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb) | Shows how to apply static, dynamic, and quantization-aware training to a model with [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) for any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb) |
| [How to fine-tune a model on text classification with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb) | Shows how to preprocess the data and fine-tune a model on any GLUE task with [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb) |
| [How to fine-tune a model on summarization with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb) | Shows how to preprocess the data and fine-tune a model on XSUM with [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb) | [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb) |
@@ -626,8 +626,6 @@
title: YOSO
- local: model_doc/zamba
title: Zamba
- local: model_doc/zamba2
title: Zamba2
title: Text models
- isExpanded: false
sections:

@@ -643,8 +641,6 @@
title: ConvNeXTV2
- local: model_doc/cvt
title: CvT
- local: model_doc/dab-detr
title: DAB-DETR
- local: model_doc/deformable_detr
title: Deformable DETR
- local: model_doc/deit

@@ -653,8 +649,6 @@
title: Depth Anything
- local: model_doc/depth_anything_v2
title: Depth Anything V2
- local: model_doc/depth_pro
title: DepthPro
- local: model_doc/deta
title: DETA
- local: model_doc/detr

@@ -711,8 +705,6 @@
title: ResNet
- local: model_doc/rt_detr
title: RT-DETR
- local: model_doc/rt_detr_v2
title: RT-DETRv2
- local: model_doc/segformer
title: SegFormer
- local: model_doc/seggpt

@@ -878,8 +870,6 @@
title: FLAVA
- local: model_doc/git
title: GIT
- local: model_doc/got_ocr2
title: GOT-OCR2
- local: model_doc/grounding-dino
title: Grounding DINO
- local: model_doc/groupvit
@@ -162,7 +162,7 @@ agent.run(
improved_prompt could be "A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background"

Now that I have improved the prompt, I can use the image generator tool to generate an image based on this prompt.
=== Agent is executing the code below:
>>> Agent is executing the code below:
image = image_generator(prompt="A bright blue space suit wearing rabbit, on the surface of the moon, under a bright orange sunset, with the Earth visible in the background")
final_answer(image)
```
@@ -39,7 +39,7 @@ Let's make this concrete with a quick example using the `mistralai/Mistral-7B-In
... ]

>>> tokenizer.apply_chat_template(chat, tokenize=False)
"<s> [INST] Hello, how are you? [/INST] I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"
```

Notice how the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of
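Once the template has inserted those control tokens, the same method can also build a prompt for generation. A minimal sketch of that step follows; it assumes the `mistralai/Mistral-7B-Instruct-v0.1` checkpoint (the exact revision is an assumption, since the name above is truncated) and access to its weights:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mistralai/Mistral-7B-Instruct-v0.1"  # assumed checkpoint for this sketch
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

# add_generation_prompt=True appends the tokens that cue the model to answer next
input_ids = tokenizer.apply_chat_template(chat, add_generation_prompt=True, return_tensors="pt").to(model.device)
outputs = model.generate(input_ids, max_new_tokens=50)
print(tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True))
```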
@@ -30,7 +30,7 @@ DeepSpeed compiles CUDA C++ code and it can be a potential source of errors when

<Tip>

For any other installation issues, please [open an issue](https://github.com/deepspeedai/DeepSpeed/issues) with the DeepSpeed team.
For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team.

</Tip>
@@ -89,7 +89,7 @@ sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++
If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, you can try to prebuild the DeepSpeed modules before installing them. To make a local build for DeepSpeed:

```bash
git clone https://github.com/deepspeedai/DeepSpeed/
git clone https://github.com/microsoft/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \

@@ -141,7 +141,7 @@ It is also possible to not specify `TORCH_CUDA_ARCH_LIST` and the build program
For training on multiple machines with the same setup, you'll need to make a binary wheel:

```bash
git clone https://github.com/deepspeedai/DeepSpeed/
git clone https://github.com/microsoft/DeepSpeed/
cd DeepSpeed
rm -rf build
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
@@ -28,7 +28,7 @@ This guide will walk you through how to deploy DeepSpeed training, the features

## Installation

DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed [installation details](https://www.deepspeed.ai/tutorials/advanced-install/) or the GitHub [README](https://github.com/deepspeedai/DeepSpeed#installation)).
DeepSpeed is available to install from PyPI or Transformers (for more detailed installation options, take a look at the DeepSpeed [installation details](https://www.deepspeed.ai/tutorials/advanced-install/) or the GitHub [README](https://github.com/microsoft/deepspeed#installation)).

<Tip>

@@ -114,10 +114,10 @@ DeepSpeed works with the [`Trainer`] class by way of a config file containing al

<Tip>

Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. You can also find more practical examples of various DeepSpeed configuration examples on the [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) repository or the main [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) repository. To quickly find specific examples, you can:
Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. You can also find more practical examples of various DeepSpeed configuration examples on the [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) repository or the main [DeepSpeed](https://github.com/microsoft/DeepSpeed) repository. To quickly find specific examples, you can:

```bash
git clone https://github.com/deepspeedai/DeepSpeedExamples
git clone https://github.com/microsoft/DeepSpeedExamples
cd DeepSpeedExamples
find . -name '*json'
# find examples with the Lamb optimizer

@@ -303,7 +303,7 @@ For more information about initializing large models with ZeRO-3 and accessing t

[ZeRO-Infinity](https://hf.co/papers/2104.07857) allows offloading model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.

Depending on the CPU and/or NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none. You should also make sure the `nvme_path` is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, [run a benchmark](https://github.com/deepspeedai/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.
Depending on the CPU and/or NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none. You should also make sure the `nvme_path` is pointing to an NVMe device, because while it still works with a normal hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read and ~3GB/s for write operations. Lastly, [run a benchmark](https://github.com/microsoft/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.

The example ZeRO-3/Infinity configuration file below sets most of the parameter values to `auto`, but you could also manually add these values.
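As a minimal, illustrative sketch only (not the documented example file; keys and values here are assumptions, and it requires DeepSpeed and Accelerate to be installed), a ZeRO-3 configuration with CPU offload can also be handed to `TrainingArguments` as a plain Python dict instead of a JSON file:

```python
from transformers import TrainingArguments

# Illustrative ZeRO-3 config with CPU offload; "auto" lets the Trainer fill in matching values.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "offload_param": {"device": "cpu", "pin_memory": True},
        "overlap_comm": True,
        "stage3_gather_16bit_weights_on_model_save": True,
    },
    "bf16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
}

training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    deepspeed=ds_config,  # a dict works here as well as a path to a JSON config file
)
```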
@@ -1157,7 +1157,7 @@ For Transformers>=4.28, if `synced_gpus` is automatically set to `True` if multi

## Troubleshoot

When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obviously and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the [DeepSpeed repository](https://github.com/deepspeedai/DeepSpeed).
When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obviously and you can see DeepSpeed modules in the exception)! The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the [DeepSpeed repository](https://github.com/microsoft/DeepSpeed).

For issues related to the Transformers integration, please provide the following information:

@@ -1227,7 +1227,7 @@ This means the DeepSpeed loss scaler is unable to find a scaling coefficient to

## Resources

DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it more accessible to everyone. To learn more about DeepSpeed, feel free to read the [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [documentation](https://www.deepspeed.ai/getting-started/), and [GitHub repository](https://github.com/deepspeedai/DeepSpeed).
DeepSpeed ZeRO is a powerful technology for training and loading very large models for inference with limited GPU resources, making it more accessible to everyone. To learn more about DeepSpeed, feel free to read the [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [documentation](https://www.deepspeed.ai/getting-started/), and [GitHub repository](https://github.com/microsoft/deepspeed).

The following papers are also a great resource for learning more about ZeRO:
@@ -231,7 +231,7 @@ to check if the text is machine-generated (outputs `True` for machine-generated
>>> detector = WatermarkDetector(model_config=model.config, device="cpu", watermarking_config=watermarking_config)
>>> detection_out = detector(out, return_dict=True)
>>> detection_out.prediction
array([ True, True])
array([True, True])
```
@@ -269,7 +269,7 @@ dimension you can act upon, in addition to selecting a decoding strategy. Popula
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n']
['I look forward to seeing you all again!\n\n\n\n\n\n\n\n\n\n\n']
```

### Contrastive search

@@ -445,7 +445,7 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a glass of wine.']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```

<Tip>

@@ -461,7 +461,7 @@ If you're using a `pipeline` object, all you need to do is to pass the assistant
... model="meta-llama/Llama-3.1-8B",
... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD
... torch_dtype=torch.bfloat16
... )
>>> )
>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False)
>>> pipe_output[0]["generated_text"]
'Once upon a time, 3D printing was a niche technology that was only'

@@ -488,7 +488,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are two people who are very different, but they are both very good at what they do. Alice']
['Alice and Bob, a couple of friends of mine, who are both in the same office as']
```

We recommend to install `scikit-learn` library to enhance the candidate generation strategy and achieve additional speedup.

@@ -518,7 +518,7 @@ to ensure the new tokens include the correct prompt suffix.
>>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint)
>>> outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=tokenizer, assistant_tokenizer=assistant_tokenizer)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```

#### Prompt Lookup

@@ -547,7 +547,7 @@ If the model you're using was trained to do early exit, you can pass
>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
>>> outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_new_tokens=20)
>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are playing a game. Alice has a set of $n$ integers $a_1, a']
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```

### DoLa Decoding

@@ -571,9 +571,10 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
>>> import torch
>>> from accelerate.test_utils.testing import get_backend

>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16).to(device)
>>> model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b", torch_dtype=torch.float16)
>>> device, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model.to(device)
>>> set_seed(42)

>>> text = "On what date was the Declaration of Independence officially signed?"

@@ -592,7 +593,7 @@ See the following examples for DoLa decoding with the 32-layer LLaMA-7B model.
# DoLa decoding with contrasting specific layers (layers 28 and 30)
>>> dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
>>> tokenizer.batch_decode(dola_custom_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
['\nIn 1891, when he was 54 years old, John Jacob Astor founded his empire. He opened a one-man business and spent the next 27 years working 10-hour days. When']
['\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2']
```

#### Understanding the `dola_layers` argument
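As a brief, hedged illustration of the argument named in the heading above (a continuation of the LLaMA-7B snippet, reusing its `model`, `tokenizer`, and `inputs`), `dola_layers` also accepts the string shortcuts `"high"` or `"low"` instead of an explicit layer list:

```python
# Assumes `model`, `tokenizer`, and `inputs` from the DoLa example above.
dola_high_output = model.generate(
    **inputs,
    do_sample=False,
    max_new_tokens=50,
    dola_layers="high",        # contrast against the upper part of the layer stack
    repetition_penalty=1.2,    # commonly used alongside DoLa to curb repetition
)
print(tokenizer.batch_decode(dola_high_output[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True))
```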
@@ -110,7 +110,6 @@ Flax), PyTorch, and/or TensorFlow.
| [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ |
| [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ |
| [CvT](model_doc/cvt) | ✅ | ✅ | ❌ |
| [DAB-DETR](model_doc/dab-detr) | ✅ | ❌ | ❌ |
| [DAC](model_doc/dac) | ✅ | ❌ | ❌ |
| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ |
| [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ |

@@ -123,7 +122,6 @@ Flax), PyTorch, and/or TensorFlow.
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ |
| [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ |
| [Depth Anything](model_doc/depth_anything) | ✅ | ❌ | ❌ |
| [DepthPro](model_doc/depth_pro) | ✅ | ❌ | ❌ |
| [DETA](model_doc/deta) | ✅ | ❌ | ❌ |
| [DETR](model_doc/detr) | ✅ | ❌ | ❌ |
| [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ |

@@ -163,7 +161,6 @@ Flax), PyTorch, and/or TensorFlow.
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLM](model_doc/glm) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GOT-OCR2](model_doc/got_ocr2) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |
| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ |
| [GPT NeoX Japanese](model_doc/gpt_neox_japanese) | ✅ | ❌ | ❌ |

@@ -306,7 +303,6 @@ Flax), PyTorch, and/or TensorFlow.
| [RoFormer](model_doc/roformer) | ✅ | ✅ | ✅ |
| [RT-DETR](model_doc/rt_detr) | ✅ | ❌ | ❌ |
| [RT-DETR-ResNet](model_doc/rt_detr_resnet) | ✅ | ❌ | ❌ |
| [RT-DETRv2](model_doc/rt_detr_v2) | ✅ | ❌ | ❌ |
| [RWKV](model_doc/rwkv) | ✅ | ❌ | ❌ |
| [SAM](model_doc/sam) | ✅ | ✅ | ❌ |
| [SeamlessM4T](model_doc/seamless_m4t) | ✅ | ❌ | ❌ |

@@ -389,7 +385,6 @@ Flax), PyTorch, and/or TensorFlow.
| [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ |
| [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ |
| [Zamba](model_doc/zamba) | ✅ | ❌ | ❌ |
| [Zamba2](model_doc/zamba2) | ✅ | ❌ | ❌ |
| [ZoeDepth](model_doc/zoedepth) | ✅ | ❌ | ❌ |

<!-- End table-->
@@ -32,32 +32,12 @@ Install 🤗 Transformers for whichever deep learning library you're working wit

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). A virtual environment makes it easier to manage different projects, and avoid compatibility issues between dependencies.

Create a virtual environment with [uv](https://docs.astral.sh/uv/) (refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), a fast Rust-based Python package and project manager.

```bash
uv venv my-env
source my-env/bin/activate
```

Now you're ready to install 🤗 Transformers with pip or uv.

<hfoptions id="install">
<hfoption id="uv">

```bash
uv pip install transformers
```

</hfoption>
<hfoption id="pip">
Now you're ready to install 🤗 Transformers with the following command:

```bash
pip install transformers
```

</hfoption>
</hfoptions>

For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and TensorFlow(https://www.tensorflow.org/install/pip).

Run the command below to check if your system detects an NVIDIA GPU.
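A quick way to sanity-check the install and GPU visibility from Python itself is sketched below (it assumes PyTorch was installed with CUDA support; the pipeline call downloads a small default model):

```python
import torch
from transformers import pipeline

# True if PyTorch can see at least one CUDA device
print(torch.cuda.is_available())

# Downloads a small default model and runs a test prediction
classifier = pipeline("sentiment-analysis")
print(classifier("Installing 🤗 Transformers went smoothly!"))
```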
@@ -56,16 +56,16 @@ More concretely, key-value cache acts as a memory bank for these generative mode
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)

>>> past_key_values = DynamicCache()
>>> messages = [{"role": "user", "content": "Hello, what's your name."}]
>>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
>>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")

>>> generated_ids = inputs.input_ids
>>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device)
>>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
>>> max_new_tokens = 10

>>> for _ in range(max_new_tokens):

@@ -82,13 +82,7 @@ More concretely, key-value cache acts as a memory bank for these generative mode
... cache_position = cache_position[-1:] + 1 # add one more position for the next token

>>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```
```txt
<|user|>
Hello, what's your name.
<|assistant|>
My name is Sarah.
<|
"[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
```

</details>
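The body of the generation loop is abridged in the hunk above. A minimal, self-contained sketch of the same idea (greedy decoding with a `DynamicCache`, reusing the TinyLlama checkpoint; this is illustrative and not the exact documented code) looks like this:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

messages = [{"role": "user", "content": "Hello, what's your name."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
).to(model.device)

past_key_values = DynamicCache()
generated_ids = inputs.input_ids
attention_mask = inputs.attention_mask
cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device=model.device)
next_inputs = {"input_ids": inputs.input_ids, "attention_mask": attention_mask}

for _ in range(10):  # max_new_tokens
    with torch.no_grad():
        outputs = model(**next_inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)
    next_token_ids = outputs.logits[:, -1:].argmax(-1)
    generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
    # Only the new token is fed back in; the cache already holds keys/values for everything before it.
    attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
    next_inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask}
    cache_position = cache_position[-1:] + 1

print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
```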
@@ -138,13 +132,17 @@ Cache quantization can be detrimental in terms of latency if the context length
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)

>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's a great way to express myself. I like the way it makes me feel, the
I like rock music because it's loud and energetic. It's a great way to express myself and rel

>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
```
### Offloaded Cache

@@ -168,7 +166,7 @@ Use `cache_implementation="offloaded_static"` for an offloaded static cache (see
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"

>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)

>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
@@ -233,14 +231,14 @@ For more examples with Static Cache and JIT compilation, take a look at [StaticC
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM

>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

>>> # simply pass the cache implementation="static"
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name] and I am a [Your Position] at [Your Company]. I am writing"
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
```
@@ -258,7 +256,7 @@ This will use the [`~OffloadedStaticCache`] implementation instead.
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

>>> # simply pass the cache implementation="offloaded_static"
>>> # simply pass the cache implementation="static"
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
@@ -277,14 +275,14 @@ Note that you can use this cache only for models that support sliding window, e.
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache

>>> tokenizer = AutoTokenizer.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B")
>>> model = AutoModelForCausalLM.from_pretrained("teknium/OpenHermes-2.5-Mistral-7B", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)

>>> # can be used by passing in cache implementation
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Yesterday I was on a rock concert and. I was so excited to see my favorite band perform live. I was so happy that I could hardly contain myself. I was jumping up and down and"
"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I"
```
### Sink Cache

@@ -297,8 +295,8 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache

>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)

>>> # get our cache, specify number of sink tokens and window size

@@ -306,7 +304,7 @@ Unlike other cache classes, this one can't be used directly by indicating a `cac
>>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"This is a long story about unicorns, fairies and magic. It is a story about a young girl named Lily who discovers that she has the power to control the elements. She learns that she can"
"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
```
### Encoder-Decoder Cache

@@ -334,22 +332,22 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
>>> import torch
>>> from transformers import AutoTokenizer,AutoModelForCausalLM
>>> from transformers.cache_utils import (
... DynamicCache,
... SinkCache,
... StaticCache,
... SlidingWindowCache,
... QuantoQuantizedCache,
... QuantizedCacheConfig,
... )
>>> DynamicCache,
>>> SinkCache,
>>> StaticCache,
>>> SlidingWindowCache,
>>> QuantoQuantizedCache,
>>> QuantizedCacheConfig,
>>> )

>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)

>>> user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]

>>> past_key_values = DynamicCache()
>>> max_cache_length = past_key_values.get_max_cache_shape()
>>> max_cache_length = past_key_values.get_max_length()

>>> messages = []
>>> for prompt in user_prompts:

@@ -365,7 +363,7 @@ In case you are using Sink Cache, you have to crop your inputs to that maximum l
... messages.append({"role": "assistant", "content": completion})

print(messages)
[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': "Hello, I'm AI."}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': "I'm sorry to hear that you were on a rock concert yesterday. It sounds like a fun experience, but I'm not capable of experiencing music or concerts. However, I can provide you with some information about rock music and its history. Rock music emerged in the 1950s and 1960s in the United States and Britain, and it quickly gained popularity around the world. Some of the most famous rock bands of all time include The Beatles, The Rolling Stones, Led Zeppelin, and Pink Floyd. Rock music has a distinct sound and style, with elements of blues, country, and folk music. It often features guitar solos, heavy bass lines, and drums. Rock music has had a significant impact on popular culture, influencing genres such as punk rock, heavy metal, and alternative rock."}]
[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}]
```
|
||||
|
||||
|
||||
@ -377,19 +375,17 @@ Sometimes you would want to first fill-in cache object with key/values for certa
>>> import copy
>>> import torch
>>> from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
>>> from accelerate.test_utils.testing import get_backend

>>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=DEVICE)
>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda")
>>> tokenizer = AutoTokenizer.from_pretrained(model_id)

>>> # Init StaticCache with big enough max-length (1024 tokens for the below example)
>>> # You can also init a DynamicCache, if that suits you better
>>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device=DEVICE, dtype=torch.bfloat16)
>>> prompt_cache = StaticCache(config=model.config, max_batch_size=1, max_cache_len=1024, device="cuda", dtype=torch.bfloat16)

>>> INITIAL_PROMPT = "You are a helpful assistant. "
>>> inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to(DEVICE)
>>> inputs_initial_prompt = tokenizer(INITIAL_PROMPT, return_tensors="pt").to("cuda")
>>> # This is the common prompt cached, we need to run forward without grad to be able to copy
>>> with torch.no_grad():
...     prompt_cache = model(**inputs_initial_prompt, past_key_values = prompt_cache).past_key_values
@ -397,14 +393,14 @@ Sometimes you would want to first fill-in cache object with key/values for certa
>>> prompts = ["Help me to write a blogpost about travelling.", "What is the capital of France?"]
>>> responses = []
>>> for prompt in prompts:
...     new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to(DEVICE)
...     new_inputs = tokenizer(INITIAL_PROMPT + prompt, return_tensors="pt").to("cuda")
...     past_key_values = copy.deepcopy(prompt_cache)
...     outputs = model.generate(**new_inputs, past_key_values=past_key_values, max_new_tokens=20)
...     response = tokenizer.batch_decode(outputs)[0]
...     responses.append(response)

>>> print(responses)
['<s> You are a helpful assistant. Help me to write a blogpost about travelling. I am excited to share my experiences with you. I have been traveling for the past', '<s> You are a helpful assistant. What is the capital of France? \n\nAnswer: Paris is the capital of France.</s>']
['<s> You are a helpful assistant. Help me to write a blogpost about travelling.\n\nTitle: The Ultimate Guide to Travelling: Tips, Tricks, and', '<s> You are a helpful assistant. What is the capital of France?\n\nYes, the capital of France is Paris.</s>']
```

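The `copy.deepcopy(prompt_cache)` above is what keeps the prefix reusable: `generate` writes new key/value states into whatever cache object it receives, so each prompt works on its own copy while the original keeps only the shared prefix. As a hedged sketch of the same pattern with a `DynamicCache` (reusing `model`, `tokenizer`, `INITIAL_PROMPT` and `inputs_initial_prompt` from the example above; not part of the diff):

```python
import copy
import torch
from transformers import DynamicCache

prompt_cache = DynamicCache()
with torch.no_grad():
    prompt_cache = model(**inputs_initial_prompt, past_key_values=prompt_cache).past_key_values

new_inputs = tokenizer(INITIAL_PROMPT + "What is the capital of France?", return_tensors="pt").to(model.device)
outputs = model.generate(**new_inputs, past_key_values=copy.deepcopy(prompt_cache), max_new_tokens=20)
print(tokenizer.batch_decode(outputs)[0])
```
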
@ -418,8 +414,8 @@ this legacy format, you can seamlessly convert it to a `DynamicCache` and back.
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

>>> tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
>>> model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.float16, device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)

>>> # `return_dict_in_generate=True` is required to return the cache. `return_legacy_cache` forces the returned cache
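>>> # (Illustrative sketch, not part of the diff: the round-trip described above uses
>>> # `DynamicCache.from_legacy_cache` and `to_legacy_cache`.)
>>> out = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
>>> cache = DynamicCache.from_legacy_cache(out.past_key_values)  # legacy tuples -> Cache object
>>> legacy_cache = cache.to_legacy_cache()  # Cache object -> legacy tuples
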
@ -40,7 +40,6 @@ Before you begin, make sure you have all the necessary libraries installed:
```bash
pip install transformers bitsandbytes>=0.39.0 -q
```
Bitsandbytes supports multiple backends in addition to CUDA-based GPUs. Refer to the multi-backend installation [guide](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend) to learn more.


## Generate text
@ -102,11 +101,9 @@ Next, you need to preprocess your text input with a [tokenizer](tokenizer_summar

```py
>>> from transformers import AutoTokenizer
>>> from accelerate.test_utils.testing import get_backend

>>> DEVICE, _, _ = get_backend() # automatically detects the underlying device type (CUDA, CPU, XPU, MPS, etc.)
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(DEVICE)
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
```

The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While [`~generation.GenerationMixin.generate`] does its best effort to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.
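
As a minimal illustration of "passing it whenever possible" (reusing the `model_inputs` above and a `model` loaded as earlier in this guide; not part of the diff):

```py
>>> # Explicitly forward both tensors held by `model_inputs`:
>>> generated_ids = model.generate(
...     input_ids=model_inputs.input_ids, attention_mask=model_inputs.attention_mask
... )
>>> # Equivalent shorthand, since `model_inputs` already contains the attention mask:
>>> generated_ids = model.generate(**model_inputs)
```
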
@ -125,7 +122,7 @@ Finally, you don't need to do it one sequence at a time! You can batch your inpu
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> model_inputs = tokenizer(
...     ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
... ).to(DEVICE)
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
['A list of colors: red, blue, green, yellow, orange, purple, pink,',
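>>> # (Illustrative addition, not from the diff: for decoder-only models the output
>>> # includes the prompt, so slice it off before decoding the batch.)
>>> input_length = model_inputs.input_ids.shape[1]
>>> tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)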
@ -155,7 +152,7 @@ If not specified in the [`~generation.GenerationConfig`] file, `generate` return


```py
>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(DEVICE)
>>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")

>>> # By default, the output will contain up to 20 tokens
>>> generated_ids = model.generate(**model_inputs)
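>>> # (Illustrative addition, not from the diff: raise the 20-token default with `max_new_tokens`.)
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=50)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]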
@ -177,7 +174,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
>>> from transformers import set_seed
>>> set_seed(42)

>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(DEVICE)
>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")

>>> # LLM + greedy decoding = repetitive, boring output
>>> generated_ids = model.generate(**model_inputs)
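>>> # (Illustrative addition, not from the diff: sampling yields varied, less repetitive text;
>>> # the temperature/top_p values here are arbitrary examples.)
>>> generated_ids = model.generate(**model_inputs, do_sample=True, temperature=0.7, top_p=0.9)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]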
@ -199,7 +196,7 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
>>> # which is shorter, has padding on the right side. Generation fails to capture the logic.
>>> model_inputs = tokenizer(
...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to(DEVICE)
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'
@ -209,7 +206,7 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> model_inputs = tokenizer(
...     ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to(DEVICE)
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'
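>>> # (Illustrative reminder, not from the diff: the fix above relies on the tokenizer having
>>> # been created with left padding and a pad token, e.g.:)
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
>>> tokenizer.pad_token = tokenizer.eos_token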
@ -226,7 +223,7 @@ Some models and tasks expect a certain input prompt format to work properly. Whe
... )
>>> set_seed(0)
>>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(DEVICE)
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
>>> input_length = model_inputs.input_ids.shape[1]
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
@ -242,7 +239,7 @@ Some models and tasks expect a certain input prompt format to work properly. Whe
...     },
...     {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
... ]
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
>>> input_length = model_inputs.shape[1]
>>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

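>>> # (Illustrative variant, not from the diff: `apply_chat_template` can also return a dict
>>> # with input_ids and attention_mask when `tokenize=True` and `return_dict=True` are passed.)
>>> model_inputs = tokenizer.apply_chat_template(
...     messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
... ).to(model.device)
>>> generated_ids = model.generate(**model_inputs, do_sample=True, max_new_tokens=20)
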
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# DeepSpeed

[DeepSpeed](https://github.com/deepspeedai/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.
[DeepSpeed](https://github.com/microsoft/DeepSpeed), powered by Zero Redundancy Optimizer (ZeRO), is an optimization library for training and fitting very large models onto a GPU. It is available in several ZeRO stages, where each stage progressively saves more GPU memory by partitioning the optimizer state, gradients, parameters, and enabling offloading to a CPU or NVMe. DeepSpeed is integrated with the [`Trainer`] class and most of the setup is automatically taken care of for you.

However, if you want to use DeepSpeed without the [`Trainer`], Transformers provides a [`HfDeepSpeedConfig`] class.

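The paragraph above mentions [`HfDeepSpeedConfig`] for DeepSpeed-without-[`Trainer`] setups. As a hedged sketch of that pattern (a minimal ZeRO-3 config; the model name and config values are illustrative, not taken from the diff):

```py
import deepspeed
import torch
from transformers import AutoModelForCausalLM
from transformers.integrations import HfDeepSpeedConfig

ds_config = {
    "zero_optimization": {"stage": 3},
    "train_micro_batch_size_per_gpu": 1,
    "bf16": {"enabled": True},
}

# Must be created *before* from_pretrained so weights are partitioned while loading.
dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive

model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0", torch_dtype=torch.bfloat16)
engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
engine.module.eval()  # the wrapped model is ready for ZeRO-3 sharded inference
```
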
@ -61,11 +61,6 @@ The original code can be found [here](https://github.com/salesforce/BLIP).
[[autodoc]] BlipImageProcessor
- preprocess

## BlipImageProcessorFast

[[autodoc]] BlipImageProcessorFast
- preprocess

<frameworkcontent>
<pt>


@ -251,11 +251,6 @@ The resource should ideally demonstrate something new instead of duplicating an
[[autodoc]] CLIPImageProcessor
- preprocess

## CLIPImageProcessorFast

[[autodoc]] CLIPImageProcessorFast
- preprocess

## CLIPFeatureExtractor

[[autodoc]] CLIPFeatureExtractor

@ -64,11 +64,6 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] ConvNextImageProcessor
- preprocess

## ConvNextImageProcessorFast

[[autodoc]] ConvNextImageProcessorFast
- preprocess

<frameworkcontent>
<pt>


@ -1,119 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DAB-DETR
|
||||
|
||||
## Overview
|
||||
|
||||
The DAB-DETR model was proposed in [DAB-DETR: Dynamic Anchor Boxes are Better Queries for DETR](https://arxiv.org/abs/2201.12329) by Shilong Liu, Feng Li, Hao Zhang, Xiao Yang, Xianbiao Qi, Hang Su, Jun Zhu, Lei Zhang.
|
||||
DAB-DETR is an enhanced variant of Conditional DETR. It utilizes dynamically updated anchor boxes to provide both a reference query point (x, y) and a reference anchor size (w, h), improving cross-attention computation. This new approach achieves 45.7% AP when trained for 50 epochs with a single ResNet-50 model as the backbone.
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dab_detr_convergence_plot.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We present in this paper a novel query formulation using dynamic anchor boxes
|
||||
for DETR (DEtection TRansformer) and offer a deeper understanding of the role
|
||||
of queries in DETR. This new formulation directly uses box coordinates as queries
|
||||
in Transformer decoders and dynamically updates them layer-by-layer. Using box
|
||||
coordinates not only helps using explicit positional priors to improve the query-to-feature similarity and eliminate the slow training convergence issue in DETR,
|
||||
but also allows us to modulate the positional attention map using the box width
|
||||
and height information. Such a design makes it clear that queries in DETR can be
|
||||
implemented as performing soft ROI pooling layer-by-layer in a cascade manner.
|
||||
As a result, it leads to the best performance on MS-COCO benchmark among
|
||||
the DETR-like detection models under the same setting, e.g., AP 45.7% using
|
||||
ResNet50-DC5 as backbone trained in 50 epochs. We also conducted extensive
|
||||
experiments to confirm our analysis and verify the effectiveness of our methods.*
|
||||
|
||||
This model was contributed by [davidhajdu](https://huggingface.co/davidhajdu).
|
||||
The original code can be found [here](https://github.com/IDEA-Research/DAB-DETR).
|
||||
|
||||
## How to Get Started with the Model
|
||||
|
||||
Use the code below to get started with the model.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import requests
|
||||
|
||||
from PIL import Image
|
||||
from transformers import AutoModelForObjectDetection, AutoImageProcessor
|
||||
|
||||
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
image_processor = AutoImageProcessor.from_pretrained("IDEA-Research/dab-detr-resnet-50")
|
||||
model = AutoModelForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")
|
||||
|
||||
inputs = image_processor(images=image, return_tensors="pt")
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.3)
|
||||
|
||||
for result in results:
|
||||
for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
|
||||
score, label = score.item(), label_id.item()
|
||||
box = [round(i, 2) for i in box.tolist()]
|
||||
print(f"{model.config.id2label[label]}: {score:.2f} {box}")
|
||||
```
|
||||
This should output
|
||||
```
|
||||
cat: 0.87 [14.7, 49.39, 320.52, 469.28]
|
||||
remote: 0.86 [41.08, 72.37, 173.39, 117.2]
|
||||
cat: 0.86 [344.45, 19.43, 639.85, 367.86]
|
||||
remote: 0.61 [334.27, 75.93, 367.92, 188.81]
|
||||
couch: 0.59 [-0.04, 1.34, 639.9, 477.09]
|
||||
```
|
||||
|
||||
There are three other ways to instantiate a DAB-DETR model (depending on what you prefer):
|
||||
|
||||
Option 1: Instantiate DAB-DETR with pre-trained weights for entire model
|
||||
```py
|
||||
>>> from transformers import DabDetrForObjectDetection
|
||||
|
||||
>>> model = DabDetrForObjectDetection.from_pretrained("IDEA-Research/dab-detr-resnet-50")
|
||||
```
|
||||
|
||||
Option 2: Instantiate DAB-DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone
|
||||
```py
|
||||
>>> from transformers import DabDetrConfig, DabDetrForObjectDetection
|
||||
|
||||
>>> config = DabDetrConfig()
|
||||
>>> model = DabDetrForObjectDetection(config)
|
||||
```
|
||||
Option 3: Instantiate DAB-DETR with randomly initialized weights for backbone + Transformer
|
||||
```py
|
||||
>>> config = DabDetrConfig(use_pretrained_backbone=False)
|
||||
>>> model = DabDetrForObjectDetection(config)
|
||||
```
|
||||
|
||||
|
||||
## DabDetrConfig
|
||||
|
||||
[[autodoc]] DabDetrConfig
|
||||
|
||||
## DabDetrModel
|
||||
|
||||
[[autodoc]] DabDetrModel
|
||||
- forward
|
||||
|
||||
## DabDetrForObjectDetection
|
||||
|
||||
[[autodoc]] DabDetrForObjectDetection
|
||||
- forward
|
||||
@ -125,11 +125,6 @@ If you're interested in submitting a resource to be included here, please feel f
|
||||
[[autodoc]] DeiTImageProcessor
|
||||
- preprocess
|
||||
|
||||
## DeiTImageProcessorFast
|
||||
|
||||
[[autodoc]] DeiTImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
||||
@ -1,183 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DepthPro
|
||||
|
||||
## Overview
|
||||
|
||||
The DepthPro model was proposed in [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/abs/2410.02073) by Aleksei Bochkovskii, Amaël Delaunoy, Hugo Germain, Marcel Santos, Yichao Zhou, Stephan R. Richter, Vladlen Koltun.
|
||||
|
||||
DepthPro is a foundation model for zero-shot metric monocular depth estimation, designed to generate high-resolution depth maps with remarkable sharpness and fine-grained details. It employs a multi-scale Vision Transformer (ViT)-based architecture, where images are downsampled, divided into patches, and processed using a shared Dinov2 encoder. The extracted patch-level features are merged, upsampled, and refined using a DPT-like fusion stage, enabling precise depth estimation.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*We present a foundation model for zero-shot metric monocular depth estimation. Our model, Depth Pro, synthesizes high-resolution depth maps with unparalleled sharpness and high-frequency details. The predictions are metric, with absolute scale, without relying on the availability of metadata such as camera intrinsics. And the model is fast, producing a 2.25-megapixel depth map in 0.3 seconds on a standard GPU. These characteristics are enabled by a number of technical contributions, including an efficient multi-scale vision transformer for dense prediction, a training protocol that combines real and synthetic datasets to achieve high metric accuracy alongside fine boundary tracing, dedicated evaluation metrics for boundary accuracy in estimated depth maps, and state-of-the-art focal length estimation from a single image. Extensive experiments analyze specific design choices and demonstrate that Depth Pro outperforms prior work along multiple dimensions.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/depth_pro_teaser.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> DepthPro Outputs. Taken from the <a href="https://github.com/apple/ml-depth-pro" target="_blank">official code</a>. </small>
|
||||
|
||||
This model was contributed by [geetu040](https://github.com/geetu040). The original code can be found [here](https://github.com/apple/ml-depth-pro).
|
||||
|
||||
## Usage Tips
|
||||
|
||||
The DepthPro model processes an input image by first downsampling it at multiple scales and splitting each scaled version into patches. These patches are then encoded using a shared Vision Transformer (ViT)-based Dinov2 patch encoder, while the full image is processed by a separate image encoder. The extracted patch features are merged into feature maps, upsampled, and fused using a DPT-like decoder to generate the final depth estimation. If enabled, an additional Field of View (FOV) encoder processes the image for estimating the camera's field of view, aiding in depth accuracy.
|
||||
|
||||
```py
|
||||
>>> import requests
|
||||
>>> from PIL import Image
|
||||
>>> import torch
|
||||
>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
|
||||
|
||||
>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
|
||||
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
|
||||
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)
|
||||
|
||||
>>> inputs = image_processor(images=image, return_tensors="pt").to(device)
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> post_processed_output = image_processor.post_process_depth_estimation(
|
||||
... outputs, target_sizes=[(image.height, image.width)],
|
||||
... )
|
||||
|
||||
>>> field_of_view = post_processed_output[0]["field_of_view"]
|
||||
>>> focal_length = post_processed_output[0]["focal_length"]
|
||||
>>> depth = post_processed_output[0]["predicted_depth"]
|
||||
>>> depth = (depth - depth.min()) / depth.max()
|
||||
>>> depth = depth * 255.
|
||||
>>> depth = depth.detach().cpu().numpy()
|
||||
>>> depth = Image.fromarray(depth.astype("uint8"))
|
||||
```
|
||||
|
||||
### Architecture and Configuration
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/depth_pro_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> DepthPro architecture. Taken from the <a href="https://arxiv.org/abs/2410.02073" target="_blank">original paper</a>. </small>
|
||||
|
||||
The `DepthProForDepthEstimation` model uses a `DepthProEncoder`, for encoding the input image and a `FeatureFusionStage` for fusing the output features from encoder.
|
||||
|
||||
The `DepthProEncoder` further uses two encoders:
|
||||
- `patch_encoder`
|
||||
- Input image is scaled with multiple ratios, as specified in the `scaled_images_ratios` configuration.
|
||||
- Each scaled image is split into smaller **patches** of size `patch_size` with overlapping areas determined by `scaled_images_overlap_ratios`.
|
||||
- These patches are processed by the **`patch_encoder`**
|
||||
- `image_encoder`
|
||||
- Input image is also rescaled to `patch_size` and processed by the **`image_encoder`**
|
||||
|
||||
Both these encoders can be configured via `patch_model_config` and `image_model_config` respectively, both of which are separate `Dinov2Model` instances by default.
|
||||
|
||||
Outputs from both encoders (`last_hidden_state`) and selected intermediate states (`hidden_states`) from **`patch_encoder`** are fused by a `DPT`-based `FeatureFusionStage` for depth estimation.
|
||||
|
||||
### Field-of-View (FOV) Prediction
|
||||
|
||||
The network is supplemented with a focal length estimation head. A small convolutional head ingests frozen features from the depth estimation network and task-specific features from a separate ViT image encoder to predict the horizontal angular field-of-view.
|
||||
|
||||
The `use_fov_model` parameter in `DepthProConfig` controls whether **FOV prediction** is enabled. By default, it is set to `False` to conserve memory and computation. When enabled, the **FOV encoder** is instantiated based on the `fov_model_config` parameter, which defaults to a `Dinov2Model`. The `use_fov_model` parameter can also be passed when initializing the `DepthProForDepthEstimation` model.
|
||||
|
||||
The pretrained model at checkpoint `apple/DepthPro-hf` uses the FOV encoder. To use the pretrained-model without FOV encoder, set `use_fov_model=False` when loading the model, which saves computation.
|
||||
```py
|
||||
>>> from transformers import DepthProForDepthEstimation
|
||||
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", use_fov_model=False)
|
||||
```
|
||||
|
||||
To instantiate a new model with FOV encoder, set `use_fov_model=True` in the config.
|
||||
```py
|
||||
>>> from transformers import DepthProConfig, DepthProForDepthEstimation
|
||||
>>> config = DepthProConfig(use_fov_model=True)
|
||||
>>> model = DepthProForDepthEstimation(config)
|
||||
```
|
||||
|
||||
Or set `use_fov_model=True` when initializing the model, which overrides the value in config.
|
||||
```py
|
||||
>>> from transformers import DepthProConfig, DepthProForDepthEstimation
|
||||
>>> config = DepthProConfig()
|
||||
>>> model = DepthProForDepthEstimation(config, use_fov_model=True)
|
||||
```
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
|
||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
|
||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
|
||||
page for more information.
|
||||
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```py
|
||||
from transformers import DepthProForDepthEstimation
|
||||
model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf", attn_implementation="sdpa", torch_dtype=torch.float16)
|
||||
```
|
||||
|
||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
|
||||
|
||||
On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vit-base-patch16-224` model, we saw the following speedups during inference.
|
||||
|
||||
| Batch size | Average inference time (ms), eager mode | Average inference time (ms), sdpa model | Speed up, Sdpa / Eager (x) |
|
||||
|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
|
||||
| 1 | 7 | 6 | 1.17 |
|
||||
| 2 | 8 | 6 | 1.33 |
|
||||
| 4 | 8 | 6 | 1.33 |
|
||||
| 8 | 8 | 6 | 1.33 |
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DepthPro:
|
||||
|
||||
- Research Paper: [Depth Pro: Sharp Monocular Metric Depth in Less Than a Second](https://arxiv.org/pdf/2410.02073)
|
||||
- Official Implementation: [apple/ml-depth-pro](https://github.com/apple/ml-depth-pro)
|
||||
- DepthPro Inference Notebook: [DepthPro Inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/DepthPro_inference.ipynb)
|
||||
- DepthPro for Super Resolution and Image Segmentation
|
||||
- Read blog on Medium: [Depth Pro: Beyond Depth](https://medium.com/@raoarmaghanshakir040/depth-pro-beyond-depth-9d822fc557ba)
|
||||
- Code on Github: [geetu040/depthpro-beyond-depth](https://github.com/geetu040/depthpro-beyond-depth)
|
||||
|
||||
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
|
||||
|
||||
## DepthProConfig
|
||||
|
||||
[[autodoc]] DepthProConfig
|
||||
|
||||
## DepthProImageProcessor
|
||||
|
||||
[[autodoc]] DepthProImageProcessor
|
||||
- preprocess
|
||||
- post_process_depth_estimation
|
||||
|
||||
## DepthProImageProcessorFast
|
||||
|
||||
[[autodoc]] DepthProImageProcessorFast
|
||||
- preprocess
|
||||
- post_process_depth_estimation
|
||||
|
||||
## DepthProModel
|
||||
|
||||
[[autodoc]] DepthProModel
|
||||
- forward
|
||||
|
||||
## DepthProForDepthEstimation
|
||||
|
||||
[[autodoc]] DepthProForDepthEstimation
|
||||
- forward
|
||||
@ -56,7 +56,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N
|
||||
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
>>> device = "cuda" # the device to load the model onto
|
||||
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
|
||||
>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
|
||||
|
||||
>>> prompt = "Give me a short introduction to large language model."
|
||||
|
||||
@ -1,269 +0,0 @@
|
||||
<!--Copyright 2024 StepFun and The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# GOT-OCR2
|
||||
|
||||
## Overview
|
||||
|
||||
The GOT-OCR2 model was proposed in [General OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model](https://arxiv.org/abs/2409.01704) by Haoran Wei, Chenglong Liu, Jinyue Chen, Jia Wang, Lingyu Kong, Yanming Xu, Zheng Ge, Liang Zhao, Jianjian Sun, Yuang Peng, Chunrui Han, Xiangyu Zhang.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Traditional OCR systems (OCR-1.0) are increasingly unable to meet people’s usage due to the growing demand for intelligent processing of man-made optical characters. In this paper, we collectively refer to all artificial optical signals (e.g., plain texts, math/molecular formulas, tables, charts, sheet music, and even geometric shapes) as "characters" and propose the General OCR Theory along with an excellent model, namely GOT, to promote the arrival of OCR-2.0. The GOT, with 580M parameters, is a unified, elegant, and end-to-end model, consisting of a high-compression encoder and a long-contexts decoder. As an OCR-2.0 model, GOT can handle all the above "characters" under various OCR tasks. On the input side, the model supports commonly used scene- and document-style images in slice and whole-page styles. On the output side, GOT can generate plain or formatted results (markdown/tikz/smiles/kern) via an easy prompt. Besides, the model enjoys interactive OCR features, i.e., region-level recognition guided by coordinates or colors. Furthermore, we also adapt dynamic resolution and multipage OCR technologies to GOT for better practicality. In experiments, we provide sufficient results to prove the superiority of our model.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/got_ocr_overview.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> GOT-OCR2 training stages. Taken from the <a href="https://arxiv.org/abs/2409.01704">original paper.</a> </small>
|
||||
|
||||
|
||||
Tips:
|
||||
|
||||
GOT-OCR2 works on a wide range of tasks, including plain document OCR, scene text OCR, formatted document OCR, and even OCR for tables, charts, mathematical formulas, geometric shapes, molecular formulas and sheet music. While this implementation of the model will only output plain text, the outputs can be further processed to render the desired format, with packages like `pdftex`, `mathpix`, `matplotlib`, `tikz`, `verovio` or `pyecharts`.
|
||||
The model can also be used for interactive OCR, where the user can specify the region to be recognized by providing the coordinates or the color of the region's bounding box.
|
||||
|
||||
This model was contributed by [yonigozlan](https://huggingface.co/yonigozlan).
|
||||
The original code can be found [here](https://github.com/Ucas-HaoranWei/GOT-OCR2.0).
|
||||
|
||||
## Usage example
|
||||
|
||||
### Plain text inference
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
|
||||
>>> inputs = processor(image, return_tensors="pt").to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
"R&D QUALITY IMPROVEMENT\nSUGGESTION/SOLUTION FORM\nName/Phone Ext. : (...)"
|
||||
```
|
||||
|
||||
### Plain text inference batched
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
|
||||
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/image_ocr.jpg"
|
||||
|
||||
>>> inputs = processor([image1, image2], return_tensors="pt").to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4,
|
||||
... )
|
||||
|
||||
>>> processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
|
||||
["Reducing the number", "R&D QUALITY"]
|
||||
```
|
||||
|
||||
### Formatted text inference
|
||||
|
||||
GOT-OCR2 can also generate formatted text, such as markdown or LaTeX. Here is an example of how to generate formatted text:
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/latex.png"
|
||||
>>> inputs = processor(image, return_tensors="pt", format=True).to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
"\\author{\nHanwen Jiang* \\(\\quad\\) Arjun Karpur \\({ }^{\\dagger} \\quad\\) Bingyi Cao \\({ }^{\\dagger} \\quad\\) (...)"
|
||||
```
|
||||
|
||||
### Inference on multiple pages
|
||||
|
||||
Although it might be reasonable in most cases to use a “for loop” for multi-page processing, some text data with formatting across several pages makes it necessary to process all pages at once. GOT introduces a multi-page OCR (without “for loop”) feature, where multiple pages can be processed by the model at once, with the output being one continuous text.
|
||||
Here is an example of how to process multiple pages at once:
|
||||
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page1.png"
|
||||
>>> image2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/page2.png"
|
||||
>>> inputs = processor([image1, image2], return_tensors="pt", multi_page=True, format=True).to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
"\\title{\nGeneral OCR Theory: Towards OCR-2.0 via a Unified End-to-end Model\n}\n\\author{\nHaoran Wei (...)"
|
||||
```
|
||||
|
||||
### Inference on cropped patches
|
||||
|
||||
GOT supports a 1024×1024 input resolution, which is sufficient for most OCR tasks, such as scene OCR or processing A4-sized PDF pages. However, certain scenarios, like horizontally stitched two-page PDFs commonly found in academic papers or images with unusual aspect ratios, can lead to accuracy issues when processed as a single image. To address this, GOT can dynamically crop an image into patches, process them all at once, and merge the results for better accuracy with such inputs.
|
||||
Here is an example of how to process cropped patches:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", torch_dtype=torch.bfloat16, device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/one_column.png"
|
||||
>>> inputs = processor(image, return_tensors="pt", format=True, crop_to_patches=True, max_patches=3).to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
"on developing architectural improvements to make learnable matching methods generalize.\nMotivated by the above observations, (...)"
|
||||
```
|
||||
|
||||
### Inference on a specific region
|
||||
|
||||
GOT supports interactive OCR, where the user can specify the region to be recognized by providing the coordinates or the color of the region's bounding box. Here is an example of how to process a specific region:
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/multi_box.png"
|
||||
>>> inputs = processor(image, return_tensors="pt", color="green").to(device) # or box=[x1, y1, x2, y2] for coordinates (image pixels)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
"You should keep in mind what features from the module should be used, especially \nwhen you’re planning to sell a template."
|
||||
```
|
||||
|
||||
### Inference on general OCR data example: sheet music
|
||||
|
||||
Although this implementation of the model will only output plain text, the outputs can be further processed to render the desired format, with packages like `pdftex`, `mathpix`, `matplotlib`, `tikz`, `verovio` or `pyecharts`.
|
||||
Here is an example of how to process sheet music:
|
||||
|
||||
```python
|
||||
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText
|
||||
>>> import verovio
|
||||
|
||||
>>> device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
>>> model = AutoModelForImageTextToText.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf", device_map=device)
|
||||
>>> processor = AutoProcessor.from_pretrained("stepfun-ai/GOT-OCR-2.0-hf")
|
||||
|
||||
>>> image = "https://huggingface.co/datasets/hf-internal-testing/fixtures_got_ocr/resolve/main/sheet_music.png"
|
||||
>>> inputs = processor(image, return_tensors="pt", format=True).to(device)
|
||||
|
||||
>>> generate_ids = model.generate(
|
||||
... **inputs,
|
||||
... do_sample=False,
|
||||
... tokenizer=processor.tokenizer,
|
||||
... stop_strings="<|im_end|>",
|
||||
... max_new_tokens=4096,
|
||||
... )
|
||||
|
||||
>>> outputs = processor.decode(generate_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)
|
||||
>>> tk = verovio.toolkit()
|
||||
>>> tk.loadData(outputs)
|
||||
>>> tk.setOptions(
|
||||
... {
|
||||
... "pageWidth": 2100,
|
||||
... "pageHeight": 800,
|
||||
... "footer": "none",
|
||||
... "barLineWidth": 0.5,
|
||||
... "beamMaxSlope": 15,
|
||||
... "staffLineWidth": 0.2,
|
||||
... "spacingStaff": 6,
|
||||
... }
|
||||
... )
|
||||
>>> tk.getPageCount()
|
||||
>>> svg = tk.renderToSVG()
|
||||
>>> svg = svg.replace('overflow="inherit"', 'overflow="visible"')
|
||||
>>> with open("output.svg", "w") as f:
...     f.write(svg)
|
||||
```
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sheet_music.svg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
## GotOcr2Config
|
||||
|
||||
[[autodoc]] GotOcr2Config
|
||||
|
||||
## GotOcr2VisionConfig
|
||||
|
||||
[[autodoc]] GotOcr2VisionConfig
|
||||
|
||||
## GotOcr2ImageProcessor
|
||||
|
||||
[[autodoc]] GotOcr2ImageProcessor
|
||||
|
||||
## GotOcr2Processor
|
||||
|
||||
[[autodoc]] GotOcr2Processor
|
||||
|
||||
## GotOcr2ForConditionalGeneration
|
||||
|
||||
[[autodoc]] GotOcr2ForConditionalGeneration
|
||||
- forward
|
||||
|
||||
@ -31,8 +31,13 @@ Tips:
|
||||
Sample inference:
|
||||
```python
|
||||
from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
model_path = "ibm-granite/granite-vision-3.1-2b-preview"
|
||||
# Note: These docs were written prior to the public model release,
|
||||
# and this path is subject to change.
|
||||
# Please see https://huggingface.co/ibm-granite for the current model list.
|
||||
model_path = "ibm-granite/granite-3.1-2b-instruct-vision"
|
||||
processor = LlavaNextProcessor.from_pretrained(model_path)
|
||||
|
||||
model = LlavaNextForConditionalGeneration.from_pretrained(model_path).to("cuda")
|
||||
|
||||
@ -64,19 +64,18 @@ Here's how to use the model for zero-shot object detection:
|
||||
|
||||
>>> results = processor.post_process_grounded_object_detection(
|
||||
... outputs,
|
||||
... inputs.input_ids,
|
||||
... box_threshold=0.4,
|
||||
... threshold=0.4,
|
||||
... text_threshold=0.3,
|
||||
... target_sizes=[image.size[::-1]]
|
||||
... target_sizes=[(image.height, image.width)]
|
||||
... )
|
||||
|
||||
# Retrieve the first image result
|
||||
>>> # Retrieve the first image result
|
||||
>>> result = results[0]
|
||||
>>> for box, score, labels in zip(result["boxes"], result["scores"], result["labels"]):
|
||||
>>> for box, score, text_label in zip(result["boxes"], result["scores"], result["text_labels"]):
|
||||
... box = [round(x, 2) for x in box.tolist()]
|
||||
... print(f"Detected {labels} with confidence {round(score.item(), 3)} at location {box}")
|
||||
Detected a cat with confidence 0.468 at location [344.78, 22.9, 637.3, 373.62]
|
||||
Detected a cat with confidence 0.426 at location [11.74, 51.55, 316.51, 473.22]
|
||||
... print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
|
||||
Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.28]
|
||||
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.44]
|
||||
Detected a remote control with confidence 0.478 at location [38.57, 70.0, 176.78, 118.18]
|
||||
```
|
||||
|
||||
## Grounded SAM
|
||||
|
||||
@ -195,11 +195,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
[[autodoc]] LlavaImageProcessor
|
||||
- preprocess
|
||||
|
||||
## LlavaImageProcessorFast
|
||||
|
||||
[[autodoc]] LlavaImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## LlavaProcessor
|
||||
|
||||
[[autodoc]] LlavaProcessor
|
||||
|
||||
@ -288,11 +288,6 @@ model = AutoModelForImageTextToText.from_pretrained(
|
||||
[[autodoc]] LlavaNextImageProcessor
|
||||
- preprocess
|
||||
|
||||
## LlavaNextImageProcessorFast
|
||||
|
||||
[[autodoc]] LlavaNextImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## LlavaNextProcessor
|
||||
|
||||
[[autodoc]] LlavaNextProcessor
|
||||
|
||||
@ -81,7 +81,7 @@ text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=
|
||||
|
||||
# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
|
||||
print(text_prompt)
|
||||
'<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>'
|
||||
>>> "<|im_start|>user\n<image>What is shown in this image?<|im_end|>\n<|im_start|>assistant\nPage showing the list of options.<|im_end|>"
|
||||
```
|
||||
|
||||
This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
|
||||
@ -100,8 +100,8 @@ import torch
|
||||
from PIL import Image
|
||||
import requests
|
||||
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
|
||||
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
|
||||
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
|
||||
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf", torch_dtype=torch.float16, low_cpu_mem_usage=True)
|
||||
model.to("cuda:0")
|
||||
|
||||
# prepare image and text prompt, using the appropriate prompt template
|
||||
@ -298,8 +298,8 @@ First make sure to install flash-attn. Refer to the [original repository of Flas
|
||||
from transformers import LlavaOnevisionForConditionalGeneration
|
||||
|
||||
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
||||
model_id,
|
||||
torch_dtype=torch.float16,
|
||||
model_id,
|
||||
torch_dtype=torch.float16,
|
||||
low_cpu_mem_usage=True,
|
||||
use_flash_attention_2=True
|
||||
).to(0)
|
||||
@ -318,11 +318,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaOnevisionImageProcessor
|
||||
|
||||
## LlavaOnevisionImageProcessorFast
|
||||
|
||||
[[autodoc]] LlavaOnevisionImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## LlavaOnevisionVideoProcessor
|
||||
|
||||
[[autodoc]] LlavaOnevisionVideoProcessor
|
||||
|
||||
@ -110,13 +110,8 @@ To follow the example of the following image, `"Hello, I'm Moshi"` could be tran
|
||||
>>> from datasets import load_dataset, Audio
|
||||
>>> import torch, math
|
||||
>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
|
||||
|
||||
|
||||
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/moshiko-pytorch-bf16")
|
||||
>>> device = "cuda"
|
||||
>>> dtype = torch.bfloat16
|
||||
|
||||
|
||||
>>> # prepare user input audio
|
||||
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
|
||||
|
||||
@ -57,7 +57,10 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme
|
||||
>>> outputs = model.generate(inputs, max_new_tokens=32)
|
||||
>>> text = tokenizer.batch_decode(outputs)[0]
|
||||
>>> print(text)
|
||||
<|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some creative ideas for incorporating both fruits
|
||||
<s><|user|>
|
||||
Can you provide ways to eat combinations of bananas and dragonfruits?<|end|>
|
||||
<|assistant|>
|
||||
Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and
|
||||
```
|
||||
|
||||
## Phi3Config
|
||||
|
||||
@ -1,97 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# RT-DETRv2
|
||||
|
||||
## Overview
|
||||
|
||||
The RT-DETRv2 model was proposed in [RT-DETRv2: Improved Baseline with Bag-of-Freebies for Real-Time Detection Transformer](https://arxiv.org/abs/2407.17140) by Wenyu Lv, Yian Zhao, Qinyao Chang, Kui Huang, Guanzhong Wang, Yi Liu.
|
||||
|
||||
RT-DETRv2 refines RT-DETR by introducing selective multi-scale feature extraction, a discrete sampling operator for broader deployment compatibility, and improved training strategies like dynamic data augmentation and scale-adaptive hyperparameters. These changes enhance flexibility and practicality while maintaining real-time performance.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*In this report, we present RT-DETRv2, an improved Real-Time DEtection TRansformer (RT-DETR). RT-DETRv2 builds upon the previous state-of-the-art real-time detector, RT-DETR, and opens up a set of bag-of-freebies for flexibility and practicality, as well as optimizing the training strategy to achieve enhanced performance. To improve the flexibility, we suggest setting a distinct number of sampling points for features at different scales in the deformable attention to achieve selective multi-scale feature extraction by the decoder. To enhance practicality, we propose an optional discrete sampling operator to replace the grid_sample operator that is specific to RT-DETR compared to YOLOs. This removes the deployment constraints typically associated with DETRs. For the training strategy, we propose dynamic data augmentation and scale-adaptive hyperparameters customization to improve performance without loss of speed.*
|
||||
|
||||
This model was contributed by [jadechoghari](https://huggingface.co/jadechoghari).
|
||||
The original code can be found [here](https://github.com/lyuwenyu/RT-DETR).
|
||||
|
||||
## Usage tips
|
||||
|
||||
This second version of RT-DETR improves how the decoder finds objects in an image.
|
||||
|
||||
- **better sampling** – adjusts offsets so the model looks at the right areas
|
||||
- **flexible attention** – can use smooth (bilinear) or fixed (discrete) sampling
|
||||
- **optimized processing** – improves how attention weights mix information
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
>>> import requests
|
||||
|
||||
>>> from PIL import Image
|
||||
>>> from transformers import RTDetrV2ForObjectDetection, RTDetrImageProcessor
|
||||
|
||||
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_v2_r18vd")
|
||||
>>> model = RTDetrV2ForObjectDetection.from_pretrained("PekingU/rtdetr_v2_r18vd")
|
||||
|
||||
>>> inputs = image_processor(images=image, return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
|
||||
>>> results = image_processor.post_process_object_detection(outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.5)
|
||||
|
||||
>>> for result in results:
|
||||
... for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
|
||||
... score, label = score.item(), label_id.item()
|
||||
... box = [round(i, 2) for i in box.tolist()]
|
||||
... print(f"{model.config.id2label[label]}: {score:.2f} {box}")
|
||||
cat: 0.97 [341.14, 25.11, 639.98, 372.89]
|
||||
cat: 0.96 [12.78, 56.35, 317.67, 471.34]
|
||||
remote: 0.95 [39.96, 73.12, 175.65, 117.44]
|
||||
sofa: 0.86 [-0.11, 2.97, 639.89, 473.62]
|
||||
sofa: 0.82 [-0.12, 1.78, 639.87, 473.52]
|
||||
remote: 0.79 [333.65, 76.38, 370.69, 187.48]
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RT-DETRv2.
|
||||
|
||||
<PipelineTag pipeline="object-detection"/>
|
||||
|
||||
- Scripts for finetuning [`RTDetrV2ForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
|
||||
- See also: [Object detection task guide](../tasks/object_detection).
|
||||
- Notebooks for [inference](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_inference.ipynb) and [fine-tuning](https://github.com/qubvel/transformers-notebooks/blob/main/notebooks/RT_DETR_v2_finetune_on_a_custom_dataset.ipynb) RT-DETRv2 on a custom dataset (🌎).
|
||||
|
||||
|
||||
## RTDetrV2Config
|
||||
|
||||
[[autodoc]] RTDetrV2Config
|
||||
|
||||
|
||||
## RTDetrV2Model
|
||||
|
||||
[[autodoc]] RTDetrV2Model
|
||||
- forward
|
||||
|
||||
## RTDetrV2ForObjectDetection
|
||||
|
||||
[[autodoc]] RTDetrV2ForObjectDetection
|
||||
- forward
|
||||
@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
|
||||
```python
|
||||
>>> # let's load an audio sample from an Arabic speech corpus
|
||||
>>> from datasets import load_dataset
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
|
||||
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
|
||||
>>> audio_sample = next(iter(dataset))["audio"]
|
||||
|
||||
>>> # now, process it
|
||||
|
||||
@ -52,7 +52,7 @@ Here is how to use the processor to process text and audio:
```python
>>> # let's load an audio sample from an Arabic speech corpus
>>> from datasets import load_dataset
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True, trust_remote_code=True)
>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
>>> audio_sample = next(iter(dataset))["audio"]

>>> # now, process it

@ -86,7 +86,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> # important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

>>> with torch.no_grad():
@ -95,7 +95,7 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
31.9% that image 0 is 'a photo of 2 cats'
```

## Resources

@ -142,7 +142,8 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
>>> inputs.to(device)

>>> with torch.no_grad():
...     with torch.autocast(device):
@ -151,7 +152,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
51.3% that image 0 is 'This is a photo of 2 cats.'
```

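The hunk above only shows the processing half of the Flash Attention 2 example; the loading half sits outside the diff context. As a hedged sketch of what that loading step typically looks like (the checkpoint name is just an example and `device` is assumed to be defined as shown):

```python
>>> import torch
>>> from transformers import SiglipModel, AutoProcessor

>>> device = "cuda"
>>> model = SiglipModel.from_pretrained(
...     "google/siglip-so400m-patch14-384",  # example checkpoint, swap in the one you actually use
...     attn_implementation="flash_attention_2",
...     torch_dtype=torch.float16,
... ).to(device)
>>> processor = AutoProcessor.from_pretrained("google/siglip-so400m-patch14-384")
```
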
@ -214,11 +215,6 @@ Below is an expected speedup diagram that compares inference time between the na
[[autodoc]] SiglipImageProcessor
    - preprocess

## SiglipImageProcessorFast

[[autodoc]] SiglipImageProcessorFast
    - preprocess

## SiglipProcessor

[[autodoc]] SiglipProcessor

@ -1,93 +0,0 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->
# Zamba2

Zamba2 is a large language model (LLM) trained by Zyphra, and made available under an Apache 2.0 license. Please see the [Zyphra Hugging Face](https://huggingface.co/collections/zyphra/) repository for model weights.

This model was contributed by [pglo](https://huggingface.co/pglo).


## Model details

Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B are hybrid models that combine state-space models (specifically [Mamba](https://github.com/state-spaces/mamba)) and transformer blocks, and were trained using next-token prediction. Zamba2 uses shared transformer layers after every 6 Mamba blocks. It uses the [Mistral v0.1 tokenizer](https://huggingface.co/mistralai/Mistral-7B-v0.1). We came to this architecture after a series of ablations at small scales. Zamba2-1.2B, Zamba2-2.7B and Zamba2-7B were pre-trained on 2T and 3T tokens, respectively.

<img src=https://github.com/user-attachments/assets/c2cff209-b901-483c-87aa-774b82a0769f width=30% height=40% />

## Quick start


### Prerequisites

Zamba2 requires you to use `transformers` version 4.48.0 or higher:
```bash
pip install "transformers>=4.48.0"
```

## Inference

```python
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B")
model = AutoModelForCausalLM.from_pretrained("Zyphra/Zamba2-7B", device_map="cuda", torch_dtype=torch.bfloat16)

input_text = "What factors contributed to the fall of the Roman Empire?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

outputs = model.generate(**input_ids, max_new_tokens=100)
print(tokenizer.decode(outputs[0]))
```


## Model card

The model cards can be found at:
* [Zamba2-1.2B](https://huggingface.co/Zyphra/Zamba2-1.2B)
* [Zamba2-2.7B](https://huggingface.co/Zyphra/Zamba2-2.7B)
* [Zamba2-7B](https://huggingface.co/Zyphra/Zamba2-7B)


## Issues
For issues with model output, or community discussion, please use the Hugging Face community [forum](https://huggingface.co/Zyphra/Zamba2-7B/discussions).


## License

The model weights are open-sourced via an Apache 2.0 license.


## Zamba2Config

[[autodoc]] Zamba2Config


## Zamba2Model

[[autodoc]] Zamba2Model
    - forward


## Zamba2ForCausalLM

[[autodoc]] Zamba2ForCausalLM
    - forward


## Zamba2ForSequenceClassification

[[autodoc]] transformers.Zamba2ForSequenceClassification
    - forward

@ -70,7 +70,7 @@ Alternatively, one can also perform inference using the classes:
>>> inputs = image_processor(images=image, return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(inputs)
...     outputs = model(pixel_values)

>>> # interpolate to original size and visualize the prediction
>>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument

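The hunk cuts off before the post-processing call that the comment refers to. A hedged sketch of that step, assuming a recent release where the image processor exposes `post_process_depth_estimation` (argument names may differ in older versions):

```python
>>> post_processed = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],  # original size, so the dynamic padding is undone
... )
>>> predicted_depth = post_processed[0]["predicted_depth"]
```
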
@ -52,7 +52,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
* [GotOcr2](https://huggingface.co/docs/transformers/model_doc/got_ocr2#transformers.GotOcr2ForConditionalGeneration)
* [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
* [GPTNeo](https://huggingface.co/docs/transformers/model_doc/gpt_neo#transformers.GPTNeoModel)
@ -112,7 +111,6 @@ FlashAttention-2 is currently supported for the following architectures:
* [UniSpeech](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech#transformers.UniSpeechModel)
* [unispeech_sat](https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/unispeech-sat#transformers.UniSpeechSatModel)
* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)
* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2)

You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.

@ -244,7 +242,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
* [DepthPro](https://huggingface.co/docs/transformers/model_doc/depth_pro#transformers.DepthProModel)
* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel)
* [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
* [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
@ -255,7 +252,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
* [GotOcr2](https://huggingface.co/docs/transformers/model_doc/got_ocr2#transformers.GotOcr2ForConditionalGeneration)
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
* [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
@ -332,7 +328,6 @@ For now, Transformers supports SDPA inference and training for the following arc
* [XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl#transformers.XLMRobertaXLModel)
* [YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos#transformers.YolosModel)
* [helium](https://huggingface.co/docs/transformers/main/en/model_doc/helium#transformers.HeliumModel)
* [Zamba2](https://huggingface.co/docs/transformers/model_doc/zamba2)

<Tip>

@ -358,7 +353,7 @@ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")

input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

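For context, the `sdpa_kernel` context manager highlighted above comes from `torch.nn.attention` and requires PyTorch 2.3 or newer; older snippets used `torch.backends.cuda.sdp_kernel`. A minimal, self-contained version of the snippet with the imports spelled out:

```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda")

inputs = tokenizer("Hello my dog is cute and", return_tensors="pt").to(model.device)

# Force the Flash Attention backend of scaled_dot_product_attention during generation.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
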
@ -432,14 +427,14 @@ To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `d
```py
from transformers import AutoModelForCausalLM

model_name = "bigscience/bloom-1b7"
model_name = "bigscience/bloom-2b5"
model_4bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True)
```

To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:

```py
max_memory_mapping = {0: "2GB", 1: "5GB"}
max_memory_mapping = {0: "600MB", 1: "1GB"}
model_name = "bigscience/bloom-3b"
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping

@ -459,7 +454,7 @@ To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `d
```py
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_name = "bigscience/bloom-1b7"
model_name = "bigscience/bloom-2b5"
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))
```

@ -468,20 +463,20 @@ If you're loading a model in 8-bit for text generation, you should use the [`~tr
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "bigscience/bloom-1b7"
model_name = "bigscience/bloom-2b5"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", quantization_config=BitsAndBytesConfig(load_in_8bit=True))

prompt = "Hello, my llama is cute"
inputs = tokenizer(prompt, return_tensors="pt").to(model_8bit.device)
generated_ids = model_8bit.generate(**inputs)
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**inputs)
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
```

To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 2GB of memory to the first GPU and 5GB of memory to the second GPU:
To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:

```py
max_memory_mapping = {0: "2GB", 1: "5GB"}
max_memory_mapping = {0: "1GB", 1: "2GB"}
model_name = "bigscience/bloom-3b"
model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name, torch_dtype="auto", device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping

@ -546,8 +541,11 @@ quantization_config = BitsAndBytesConfig(
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype="auto", quantization_config=quantization_config)
|
||||
|
||||
# enable BetterTransformer
|
||||
model = model.to_bettertransformer()
|
||||
|
||||
input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
# enable FlashAttention
|
||||
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
|
||||
|
||||
@ -476,7 +476,7 @@ And GPU1 does the same by enlisting GPU3 to its aid.
|
||||
Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
|
||||
|
||||
Implementations:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
@ -497,7 +497,7 @@ This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter
|
||||
Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
|
||||
|
||||
Implementations:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
|
||||
@ -298,7 +298,8 @@ from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
|
||||
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if n in decay_parameters],
|
||||
|
||||
@ -208,8 +208,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig
|
||||
model_id = "bigscience/bloom-1b7"
|
||||
|
||||
quantization_config = BitsAndBytesConfig(
|
||||
llm_int8_threshold=10.0,
|
||||
llm_int8_enable_fp32_cpu_offload=True
|
||||
llm_int8_threshold=10,
|
||||
)
|
||||
|
||||
model_8bit = AutoModelForCausalLM.from_pretrained(
|
||||
@ -286,7 +285,7 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa
|
||||
|
||||
### Nested quantization
|
||||
|
||||
Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
|
||||
Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
|
||||
|
||||
```py
|
||||
from transformers import BitsAndBytesConfig
|
||||
@ -296,7 +295,7 @@ double_quant_config = BitsAndBytesConfig(
|
||||
bnb_4bit_use_double_quant=True,
|
||||
)
|
||||
|
||||
model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b-chat-hf", torch_dtype="auto", quantization_config=double_quant_config)
|
||||
model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", torch_dtype="auto", quantization_config=double_quant_config)
|
||||
```
|
||||
|
||||
## Dequantizing `bitsandbytes` models
|
||||
@ -308,7 +307,7 @@ from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
|
||||
|
||||
model_id = "facebook/opt-125m"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=BitsAndBytesConfig(load_in_4bit=True))
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, BitsAndBytesConfig(load_in_4bit=True))
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
model.dequantize()
|
||||
|
||||
@ -61,7 +61,7 @@ ct_model = AutoModelForCausalLM.from_pretrained("nm-testing/Meta-Llama-3.1-8B-In
|
||||
|
||||
# Measure memory usage
|
||||
mem_params = sum([param.nelement()*param.element_size() for param in ct_model.parameters()])
|
||||
print(f"{mem_params/2**30:.4f} GB")
|
||||
print(f"{mem/2**30:.4f} GB")
|
||||
# 8.4575 GB
|
||||
```
|
||||
|
||||
|
||||
@ -130,7 +130,7 @@ Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmati
|
||||
>>> from optimum.onnxruntime import ORTModelForSequenceClassification
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> model_checkpoint = "distilbert/distilbert-base-uncased-distilled-squad"
|
||||
>>> model_checkpoint = "distilbert_base_uncased_squad"
|
||||
>>> save_directory = "onnx/"
|
||||
|
||||
>>> # Load a model from transformers and export it to ONNX
|
||||
|
||||
@ -305,7 +305,10 @@ There are two types of language modeling:
|
||||
... for pred in preds
|
||||
... ]
|
||||
>>> preds
|
||||
[{'score': 0.224, 'token': 3944, 'token_str': ' tool', 'sequence': 'Hugging Face is a community-based open-source tool for machine learning.'}]
|
||||
[{'score': 0.2236,
|
||||
'token': 1761,
|
||||
'token_str': ' platform',
|
||||
'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}]
|
||||
```
|
||||
|
||||
## Multimodal
|
||||
|
||||
@ -80,7 +80,7 @@ Run inference with decoder-only models with the `text-generation` pipeline:
|
||||
>>> prompt = "Hello, I'm a language model"
|
||||
|
||||
>>> generator(prompt, max_length = 30)
|
||||
[{'generated_text': "Hello, I'm a language model. Not a programming language at all: it's pretty simple.\n\nWhen I write a function, I mean"}]
|
||||
[{'generated_text': "Hello, I'm a language model programmer so you can use some of my stuff. But you also need some sort of a C program to run."}]
|
||||
```
|
||||
|
||||
To run inference with an encoder-decoder, use the `text2text-generation` pipeline:
|
||||
@ -258,7 +258,7 @@ also be a suitable location for instructions. Typically, it's better to place th
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"{seq['generated_text']}")
|
||||
"Permaculture is an ecological design method that mimics natural ecosystems' diversity, functionality, and resilience using modern technology and indigenous knowledge. It aims to help"
|
||||
Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. It is based on traditional knowledge and scientific understanding.
|
||||
```
|
||||
|
||||
#### Question answering
|
||||
@ -284,7 +284,7 @@ the leading word or phrase (`"Answer:"`) to nudge the model to start generating
|
||||
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
"Result: Modern tools are used, such as immersion blenders"
|
||||
Result: Modern tools often used to make gazpacho include
|
||||
```
|
||||
|
||||
#### Reasoning
|
||||
@ -309,7 +309,7 @@ Let's try if we can make a model reason about a simple arithmetics task with a b
|
||||
>>> for seq in sequences:
|
||||
... print(f"Result: {seq['generated_text']}")
|
||||
Result:
|
||||
There are a total of 50 students in the class (5 groups x 4 students per group = 20 groups, and
|
||||
There are a total of 5 groups, so there are 5 x 4=20 students in the class.
|
||||
```
|
||||
|
||||
Correct! Let's increase the complexity a little and see if we can still get away with a basic prompt:
|
||||
|
||||
@ -130,7 +130,7 @@ from torch import nn
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
@ -156,7 +156,9 @@ class EarlyStoppingCallback(TrainerCallback):

    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step >= self.num_steps:
            control.should_training_stop = True
            return {"should_training_stop": True}
        else:
            return {}
```

Then pass it to the [`Trainer`]'s `callbacks` parameter.

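A minimal usage sketch for that last step; `model`, `training_args`, and `train_dataset` are assumed to exist already, and the `num_steps` constructor argument mirrors the attribute used in the callback above (its `__init__` is not shown in this hunk):

```py
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    callbacks=[EarlyStoppingCallback(num_steps=100)],  # assumed constructor signature
)
trainer.train()
```
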
@ -735,7 +737,7 @@ accelerate launch --num_processes=2 \
|
||||
--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
|
||||
--fsdp_sharding_strategy=1 \
|
||||
--fsdp_state_dict_type=FULL_STATE_DICT \
|
||||
./examples/pytorch/text-classification/run_glue.py \
|
||||
./examples/pytorch/text-classification/run_glue.py
|
||||
--model_name_or_path google-bert/bert-base-cased \
|
||||
--task_name $TASK_NAME \
|
||||
--do_train \
|
||||
|
||||
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# DeepSpeed Integration
|
||||
|
||||
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed) は、[ZeRO 論文](https://arxiv.org/abs/1910.02054) で説明されているすべてを実装します。現在、次のものを完全にサポートしています。
|
||||
[DeepSpeed](https://github.com/microsoft/DeepSpeed) は、[ZeRO 論文](https://arxiv.org/abs/1910.02054) で説明されているすべてを実装します。現在、次のものを完全にサポートしています。
|
||||
|
||||
1. オプティマイザーの状態分割 (ZeRO ステージ 1)
|
||||
2. 勾配分割 (ZeRO ステージ 2)
|
||||
@ -32,7 +32,7 @@ DeepSpeed ZeRO-2 は、その機能が推論には役に立たないため、主
|
||||
DeepSpeed ZeRO-3 は、巨大なモデルを複数の GPU にロードできるため、推論にも使用できます。
|
||||
単一の GPU では不可能です。
|
||||
|
||||
🤗 Transformers は、2 つのオプションを介して [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) を統合します。
|
||||
🤗 Transformers は、2 つのオプションを介して [DeepSpeed](https://github.com/microsoft/DeepSpeed) を統合します。
|
||||
|
||||
1. [`Trainer`] によるコア DeepSpeed 機能の統合。何でもやってくれるタイプです
|
||||
統合の場合 - カスタム構成ファイルを指定するか、テンプレートを使用するだけで、他に何もする必要はありません。たいていの
|
||||
@ -78,7 +78,7 @@ pip install deepspeed
|
||||
pip install transformers[deepspeed]
|
||||
```
|
||||
|
||||
または、[DeepSpeed の GitHub ページ](https://github.com/deepspeedai/DeepSpeed#installation) で詳細を確認してください。
|
||||
または、[DeepSpeed の GitHub ページ](https://github.com/microsoft/deepspeed#installation) で詳細を確認してください。
|
||||
[高度なインストール](https://www.deepspeed.ai/tutorials/advanced-install/)。
|
||||
|
||||
それでもビルドに苦労する場合は、まず [CUDA 拡張機能のインストール ノート](trainer#cuda-extension-installation-notes) を必ず読んでください。
|
||||
@ -89,7 +89,7 @@ pip install transformers[deepspeed]
|
||||
DeepSpeed のローカル ビルドを作成するには:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeed/
|
||||
git clone https://github.com/microsoft/DeepSpeed/
|
||||
cd DeepSpeed
|
||||
rm -rf build
|
||||
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
|
||||
@ -113,7 +113,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa
|
||||
複数のマシンで同じセットアップを使用する必要がある場合は、バイナリ ホイールを作成します。
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeed/
|
||||
git clone https://github.com/microsoft/DeepSpeed/
|
||||
cd DeepSpeed
|
||||
rm -rf build
|
||||
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
|
||||
@ -154,7 +154,7 @@ _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24
|
||||
目的のアーチを明示的に指定することをお勧めします。
|
||||
|
||||
提案されたことをすべて試してもまだビルドの問題が発生する場合は、GitHub の問題に進んでください。
|
||||
[ディープスピード](https://github.com/deepspeedai/DeepSpeed/issues)、
|
||||
[ディープスピード](https://github.com/microsoft/DeepSpeed/issues)、
|
||||
|
||||
<a id='deepspeed-multi-gpu'></a>
|
||||
|
||||
@ -481,11 +481,11 @@ deepspeed examples/pytorch/translation/run_translation.py ...
|
||||
設定ファイルで使用できる DeepSpeed 設定オプションの完全なガイドについては、次を参照してください。
|
||||
[次のドキュメント](https://www.deepspeed.ai/docs/config-json/) にアクセスしてください。
|
||||
|
||||
さまざまな実際のニーズに対応する数十の DeepSpeed 構成例を [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples)で見つけることができます。
|
||||
さまざまな実際のニーズに対応する数十の DeepSpeed 構成例を [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples)で見つけることができます。
|
||||
リポジトリ:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeedExamples
|
||||
git clone https://github.com/microsoft/DeepSpeedExamples
|
||||
cd DeepSpeedExamples
|
||||
find . -name '*json'
|
||||
```
|
||||
@ -497,7 +497,7 @@ find . -name '*json'
|
||||
grep -i Lamb $(find . -name '*json')
|
||||
```
|
||||
|
||||
さらにいくつかの例が [メイン リポジトリ](https://github.com/deepspeedai/DeepSpeed) にもあります。
|
||||
さらにいくつかの例が [メイン リポジトリ](https://github.com/microsoft/DeepSpeed) にもあります。
|
||||
|
||||
DeepSpeed を使用する場合は、常に DeepSpeed 構成ファイルを指定する必要がありますが、一部の構成パラメータには
|
||||
コマンドライン経由で設定します。微妙な違いについては、このガイドの残りの部分で説明します。
|
||||
@ -868,7 +868,7 @@ ZeRO-Infinity は、GPU と CPU メモリを NVMe メモリで拡張すること
|
||||
書き込みでは、読み取り最大 3.5 GB/秒、書き込み最大 3 GB/秒のピーク速度が得られます)。
|
||||
|
||||
最適な`aio`構成ブロックを見つけるには、ターゲット設定でベンチマークを実行する必要があります。
|
||||
[ここで説明](https://github.com/deepspeedai/DeepSpeed/issues/998)。
|
||||
[ここで説明](https://github.com/microsoft/DeepSpeed/issues/998)。
|
||||
|
||||
<a id='deepspeed-zero2-zero3-performance'></a>
|
||||
|
||||
@ -1934,7 +1934,7 @@ SW: Model with 2783M total params, 65M largest layer params.
|
||||
問題が解決しない場合にのみ、Deepspeed について言及し、必要な詳細をすべて提供してください。
|
||||
|
||||
- 問題が統合部分ではなく DeepSpeed コアにあることが明らかな場合は、問題を提出してください。
|
||||
[Deepspeed](https://github.com/deepspeedai/DeepSpeed/) を直接使用します。よくわからない場合でも、ご安心ください。
|
||||
[Deepspeed](https://github.com/microsoft/DeepSpeed/) を直接使用します。よくわからない場合でも、ご安心ください。
|
||||
どちらの問題トラッカーでも問題ありません。投稿されたらそれを判断し、次の場合は別の問題トラッカーにリダイレクトします。
|
||||
そうである必要がある。
|
||||
|
||||
@ -1994,7 +1994,7 @@ SW: Model with 2783M total params, 65M largest layer params.
|
||||
|
||||
### Notes
|
||||
|
||||
- DeepSpeed には pip でインストール可能な PyPI パッケージがありますが、ハードウェアに最も適合するように、また有効にする必要がある場合は、[ソース](https://github.com/deepspeedai/DeepSpeed#installation) からインストールすることを強くお勧めします。
|
||||
- DeepSpeed には pip でインストール可能な PyPI パッケージがありますが、ハードウェアに最も適合するように、また有効にする必要がある場合は、[ソース](https://github.com/microsoft/deepspeed#installation) からインストールすることを強くお勧めします。
|
||||
1 ビット Adam などの特定の機能は、pypi ディストリビューションでは利用できません。
|
||||
- 🤗 Transformers で DeepSpeed を使用するために [`Trainer`] を使用する必要はありません - 任意のモデルを使用できます
|
||||
後者は [DeepSpeed 統合手順](https://www.deepspeed.ai/getting-started/#writing-deepspeed-models) に従って調整する必要があります。
|
||||
@ -2239,7 +2239,7 @@ RUN_SLOW=1 pytest tests/deepspeed
|
||||
|
||||
## Main DeepSpeed Resources
|
||||
|
||||
- [プロジェクトの github](https://github.com/deepspeedai/DeepSpeed)
|
||||
- [プロジェクトの github](https://github.com/microsoft/deepspeed)
|
||||
- [使用方法ドキュメント](https://www.deepspeed.ai/getting-started/)
|
||||
- [API ドキュメント](https://deepspeed.readthedocs.io/en/latest/index.html)
|
||||
- [ブログ投稿](https://www.microsoft.com/en-us/research/search/?q=deepspeed)
|
||||
@ -2251,4 +2251,4 @@ RUN_SLOW=1 pytest tests/deepspeed
|
||||
- [ZeRO-Infinity: 極限スケールの深層学習のための GPU メモリの壁を打ち破る](https://arxiv.org/abs/2104.07857)
|
||||
|
||||
最後に、HuggingFace [`Trainer`] は DeepSpeed のみを統合していることを覚えておいてください。
|
||||
DeepSpeed の使用に関して問題や質問がある場合は、[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/issues) に問題を提出してください。
|
||||
DeepSpeed の使用に関して問題や質問がある場合は、[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues) に問題を提出してください。
|
||||
|
||||
@ -199,7 +199,7 @@ _python_、_numpy_、および _pytorch_ の RNG 状態は、そのチェック
|
||||
torchrun --nproc_per_node=2 trainer-program.py ...
|
||||
```
|
||||
|
||||
[`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/deepspeedai/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ:
|
||||
[`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/microsoft/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ:
|
||||
|
||||
```bash
|
||||
accelerate launch --num_processes 2 trainer-program.py ...
|
||||
@ -291,7 +291,7 @@ export CUDA_VISIBLE_DEVICES=1,0
|
||||
[`Trainer`] は、トレーニングを劇的に改善する可能性のあるライブラリをサポートするように拡張されました。
|
||||
時間とはるかに大きなモデルに適合します。
|
||||
|
||||
現在、サードパーティのソリューション [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) および [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) をサポートしています。論文 [ZeRO: メモリの最適化兆パラメータ モデルのトレーニングに向けて、Samyam Rajbhandari、Jeff Rasley、Olatunji Ruwase、Yuxiong He 著](https://arxiv.org/abs/1910.02054)。
|
||||
現在、サードパーティのソリューション [DeepSpeed](https://github.com/microsoft/DeepSpeed) および [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) をサポートしています。論文 [ZeRO: メモリの最適化兆パラメータ モデルのトレーニングに向けて、Samyam Rajbhandari、Jeff Rasley、Olatunji Ruwase、Yuxiong He 著](https://arxiv.org/abs/1910.02054)。
|
||||
|
||||
この提供されるサポートは、この記事の執筆時点では新しくて実験的なものです。 DeepSpeed と PyTorch FSDP のサポートはアクティブであり、それに関する問題は歓迎しますが、FairScale 統合は PyTorch メインに統合されているため、もうサポートしていません ([PyTorch FSDP 統合](#pytorch-fully-sharded-data-parallel))
|
||||
|
||||
@ -301,7 +301,7 @@ export CUDA_VISIBLE_DEVICES=1,0
|
||||
|
||||
この記事の執筆時点では、Deepspeed を使用するには、CUDA C++ コードをコンパイルする必要があります。
|
||||
|
||||
すべてのインストールの問題は、[Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues) の対応する GitHub の問題を通じて対処する必要がありますが、ビルド中に発生する可能性のある一般的な問題がいくつかあります。
|
||||
すべてのインストールの問題は、[Deepspeed](https://github.com/microsoft/DeepSpeed/issues) の対応する GitHub の問題を通じて対処する必要がありますが、ビルド中に発生する可能性のある一般的な問題がいくつかあります。
|
||||
CUDA 拡張機能を構築する必要がある PyTorch 拡張機能。
|
||||
|
||||
したがって、次の操作を実行中に CUDA 関連のビルドの問題が発生した場合は、次のとおりです。
|
||||
|
||||
@ -61,11 +61,6 @@ BLIP は、次のようなさまざまなマルチモーダル タスクを実
|
||||
[[autodoc]] BlipImageProcessor
|
||||
- preprocess
|
||||
|
||||
## BlipImageProcessorFast
|
||||
|
||||
[[autodoc]] BlipImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
||||
@ -133,11 +133,6 @@ CLIP を使い始めるのに役立つ公式 Hugging Face およびコミュニ
|
||||
[[autodoc]] CLIPImageProcessor
|
||||
- preprocess
|
||||
|
||||
## CLIPImageProcessorFast
|
||||
|
||||
[[autodoc]] CLIPImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## CLIPFeatureExtractor
|
||||
|
||||
[[autodoc]] CLIPFeatureExtractor
|
||||
|
||||
@ -64,11 +64,6 @@ ConvNeXT の使用を開始するのに役立つ公式 Hugging Face およびコ
|
||||
[[autodoc]] ConvNextImageProcessor
|
||||
- preprocess
|
||||
|
||||
## ConvNextImageProcessorFast
|
||||
|
||||
[[autodoc]] ConvNextImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
||||
@ -98,11 +98,6 @@ DeiT を始めるのに役立つ公式 Hugging Face およびコミュニティ
|
||||
[[autodoc]] DeiTImageProcessor
|
||||
- preprocess
|
||||
|
||||
## DeiTImageProcessorFast
|
||||
|
||||
[[autodoc]] DeiTImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
|
||||
@ -360,7 +360,7 @@ by [@anton-l](https://github.com/anton-l)。
|
||||
SageMakerは、より効率的な処理のためにTPとDPを組み合わせて使用します。
|
||||
|
||||
代替名:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)はこれを「テンソルスライシング」と呼びます。詳細は[DeepSpeedの特徴](https://www.deepspeed.ai/training/#model-parallelism)をご覧ください。
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)はこれを「テンソルスライシング」と呼びます。詳細は[DeepSpeedの特徴](https://www.deepspeed.ai/training/#model-parallelism)をご覧ください。
|
||||
|
||||
実装例:
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)には、モデル固有の内部実装があります。
|
||||
@ -384,7 +384,7 @@ DeepSpeedの[パイプラインチュートリアル](https://www.deepspeed.ai/t
|
||||
各次元には少なくとも2つのGPUが必要ですので、ここでは少なくとも4つのGPUが必要です。
|
||||
|
||||
実装例:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
@ -403,7 +403,7 @@ DeepSpeedの[パイプラインチュートリアル](https://www.deepspeed.ai/t
|
||||
各次元には少なくとも2つのGPUが必要ですので、ここでは少なくとも8つのGPUが必要です。
|
||||
|
||||
実装例:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeedには、さらに効率的なDPであるZeRO-DPと呼ばれるものも含まれています。
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeedには、さらに効率的なDPであるZeRO-DPと呼ばれるものも含まれています。
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
|
||||
@ -237,7 +237,8 @@ from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
training_args = TrainingArguments(per_device_train_batch_size=4, **default_args)
|
||||
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
|
||||
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if n in decay_parameters],
|
||||
|
||||
@ -28,7 +28,7 @@ GPU가 제한된 환경에서 ZeRO는 최적화 메모리와 계산을 GPU에서
|
||||
|
||||
## 설치[[installation]]
|
||||
|
||||
DeepSpeed는 PyPI 또는 Transformers에서 설치할 수 있습니다(자세한 설치 옵션은 DeepSpeed [설치 상세사항](https://www.deepspeed.ai/tutorials/advanced-install/) 또는 GitHub [README](https://github.com/deepspeedai/DeepSpeed#installation)를 참조하세요).
|
||||
DeepSpeed는 PyPI 또는 Transformers에서 설치할 수 있습니다(자세한 설치 옵션은 DeepSpeed [설치 상세사항](https://www.deepspeed.ai/tutorials/advanced-install/) 또는 GitHub [README](https://github.com/microsoft/deepspeed#installation)를 참조하세요).
|
||||
|
||||
<Tip>
|
||||
|
||||
@ -114,10 +114,10 @@ DeepSpeed는 트레이닝 실행 방법을 구성하는 모든 매개변수가
|
||||
|
||||
<Tip>
|
||||
|
||||
DeepSpeed 구성 옵션의 전체 목록은 [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/)에서 확인할 수 있습니다. 또한 [DeepSpeedExamples](https://github.com/deepspeedai/DeepSpeedExamples) 리포지토리 또는 기본 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 리포지토리에서 다양한 DeepSpeed 구성 예제에 대한 보다 실용적인 예제를 찾을 수 있습니다. 구체적인 예제를 빠르게 찾으려면 다음과 같이 하세요:
|
||||
DeepSpeed 구성 옵션의 전체 목록은 [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/)에서 확인할 수 있습니다. 또한 [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) 리포지토리 또는 기본 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 리포지토리에서 다양한 DeepSpeed 구성 예제에 대한 보다 실용적인 예제를 찾을 수 있습니다. 구체적인 예제를 빠르게 찾으려면 다음과 같이 하세요:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeedExamples
|
||||
git clone https://github.com/microsoft/DeepSpeedExamples
|
||||
cd DeepSpeedExamples
|
||||
find . -name '*json'
|
||||
# Lamb 옵티마이저 샘플 찾기
|
||||
@ -303,7 +303,7 @@ ZeRO-3로 대규모 모델을 초기화하고 매개변수에 액세스하는
|
||||
|
||||
[ZeRO-Infinity](https://hf.co/papers/2104.07857)를 사용하면 모델 상태를 CPU 및/또는 NVMe로 오프로드하여 더 많은 메모리를 절약할 수 있습니다. 스마트 파티셔닝 및 타일링 알고리즘을 통해 각 GPU는 오프로딩 중에 매우 적은 양의 데이터를 주고받을 수 있으므로 최신 NVMe는 훈련 프로세스에 사용할 수 있는 것보다 훨씬 더 큰 총 메모리 풀에 맞출 수 있습니다. ZeRO-Infinity에는 ZeRO-3가 필요합니다.
|
||||
|
||||
사용 가능한 CPU 및/또는 NVMe 메모리에 따라 [옵티마이저](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading)와 [매개변수](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) 중 하나만 오프로드하거나 아무것도 오프로드하지 않을 수 있습니다. 또한 일반 하드 드라이브나 솔리드 스테이트 드라이브에서도 작동하지만 속도가 현저히 느려지므로 `nvme_path`가 NVMe 장치를 가리키고 있는지 확인해야 합니다. 최신 NVMe를 사용하면 읽기 작업의 경우 최대 3.5GB/s, 쓰기 작업의 경우 최대 3GB/s의 전송 속도를 기대할 수 있습니다. 마지막으로, 트레이닝 설정에서 [벤치마크 실행하기](https://github.com/deepspeedai/DeepSpeed/issues/998)을 통해 최적의 'aio' 구성을 결정합니다.
|
||||
사용 가능한 CPU 및/또는 NVMe 메모리에 따라 [옵티마이저](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading)와 [매개변수](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) 중 하나만 오프로드하거나 아무것도 오프로드하지 않을 수 있습니다. 또한 일반 하드 드라이브나 솔리드 스테이트 드라이브에서도 작동하지만 속도가 현저히 느려지므로 `nvme_path`가 NVMe 장치를 가리키고 있는지 확인해야 합니다. 최신 NVMe를 사용하면 읽기 작업의 경우 최대 3.5GB/s, 쓰기 작업의 경우 최대 3GB/s의 전송 속도를 기대할 수 있습니다. 마지막으로, 트레이닝 설정에서 [벤치마크 실행하기](https://github.com/microsoft/DeepSpeed/issues/998)을 통해 최적의 'aio' 구성을 결정합니다.
|
||||
|
||||
아래 예제 ZeRO-3/Infinity 구성 파일은 대부분의 매개변수 값을 `auto`으로 설정하고 있지만, 수동으로 값을 추가할 수도 있습니다.
|
||||
|
||||
@ -1141,7 +1141,7 @@ rank1:
|
||||
|
||||
## 트러블슈팅[[troubleshoot]]
|
||||
|
||||
문제가 발생하면 DeepSpeed가 문제의 원인이 아닌 경우가 많으므로(아주 명백하고 예외적으로 DeepSpeed 모듈을 볼 수 있는 경우가 아니라면) DeepSpeed가 문제의 원인인지 고려해야 합니다! 첫 번째 단계는 DeepSpeed 없이 설정을 다시 시도하고 문제가 지속되면 문제를 신고하는 것입니다. 문제가 핵심적인 DeepSpeed 문제이고 transformers와 관련이 없는 경우, [DeepSpeed 리포지토리](https://github.com/deepspeedai/DeepSpeed)에서 이슈를 개설하세요.
|
||||
문제가 발생하면 DeepSpeed가 문제의 원인이 아닌 경우가 많으므로(아주 명백하고 예외적으로 DeepSpeed 모듈을 볼 수 있는 경우가 아니라면) DeepSpeed가 문제의 원인인지 고려해야 합니다! 첫 번째 단계는 DeepSpeed 없이 설정을 다시 시도하고 문제가 지속되면 문제를 신고하는 것입니다. 문제가 핵심적인 DeepSpeed 문제이고 transformers와 관련이 없는 경우, [DeepSpeed 리포지토리](https://github.com/microsoft/DeepSpeed)에서 이슈를 개설하세요.
|
||||
|
||||
transformers와 관련된 이슈를 개설할 때에는 다음 정보를 제공해 주세요:
|
||||
|
||||
@ -1211,7 +1211,7 @@ NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(
|
||||
|
||||
## 리소스[[resources]]
|
||||
|
||||
DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/deepspeedai/DeepSpeed)를 참조하세요.
|
||||
DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/microsoft/deepspeed)를 참조하세요.
|
||||
|
||||
다음 문서도 ZeRO에 대해 자세히 알아볼 수 있는 훌륭한 자료입니다:
|
||||
|
||||
|
||||
@ -386,7 +386,7 @@ DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/)에
|
||||
각 차원마다 적어도 2개의 GPU가 필요하므로 최소한 4개의 GPU가 필요합니다.
|
||||
|
||||
구현:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed)
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
@ -405,7 +405,7 @@ DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/)에
|
||||
각 차원마다 적어도 2개의 GPU가 필요하므로 최소한 8개의 GPU가 필요합니다.
|
||||
|
||||
구현:
|
||||
- [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) - DeepSpeed는 더욱 효율적인 DP인 ZeRO-DP라고도 부릅니다.
|
||||
- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed는 더욱 효율적인 DP인 ZeRO-DP라고도 부릅니다.
|
||||
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
|
||||
- [Varuna](https://github.com/microsoft/varuna)
|
||||
- [SageMaker](https://arxiv.org/abs/2111.05972)
|
||||
|
||||
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# DeepSpeed集成
|
||||
|
||||
[DeepSpeed](https://github.com/deepspeedai/DeepSpeed)实现了[ZeRO论文](https://arxiv.org/abs/1910.02054)中描述的所有内容。目前,它提供对以下功能的全面支持:
|
||||
[DeepSpeed](https://github.com/microsoft/DeepSpeed)实现了[ZeRO论文](https://arxiv.org/abs/1910.02054)中描述的所有内容。目前,它提供对以下功能的全面支持:
|
||||
|
||||
1. 优化器状态分区(ZeRO stage 1)
|
||||
2. 梯度分区(ZeRO stage 2)
|
||||
@ -31,7 +31,7 @@ DeepSpeed ZeRO-2主要用于训练,因为它的特性对推理没有用处。
|
||||
|
||||
DeepSpeed ZeRO-3也可以用于推理,因为它允许将单个GPU无法加载的大模型加载到多个GPU上。
|
||||
|
||||
🤗 Transformers通过以下两种方式集成了[DeepSpeed](https://github.com/deepspeedai/DeepSpeed):
|
||||
🤗 Transformers通过以下两种方式集成了[DeepSpeed](https://github.com/microsoft/DeepSpeed):
|
||||
|
||||
1. 通过[`Trainer`]集成核心的DeepSpeed功能。这是一种“为您完成一切”式的集成 - 您只需提供自定义配置文件或使用我们的模板配置文件。本文档的大部分内容都集中在这个功能上。
|
||||
2. 如果您不使用[`Trainer`]并希望在自己的Trainer中集成DeepSpeed,那么像`from_pretrained`和`from_config`这样的核心功能函数将包括ZeRO stage 3及以上的DeepSpeed的基础部分,如`zero.Init`。要利用此功能,请阅读有关[非Trainer DeepSpeed集成](#nontrainer-deepspeed-integration)的文档。
|
||||
@ -72,7 +72,7 @@ pip install deepspeed
|
||||
pip install transformers[deepspeed]
|
||||
```
|
||||
|
||||
或在 [DeepSpeed 的 GitHub 页面](https://github.com/deepspeedai/DeepSpeed#installation) 和
|
||||
或在 [DeepSpeed 的 GitHub 页面](https://github.com/microsoft/deepspeed#installation) 和
|
||||
[高级安装](https://www.deepspeed.ai/tutorials/advanced-install/) 中查找更多详细信息。
|
||||
|
||||
如果构建过程中仍然遇到问题,请首先确保阅读 [CUDA 扩展安装注意事项](trainer#cuda-extension-installation-notes)。
|
||||
@ -83,7 +83,7 @@ pip install transformers[deepspeed]
|
||||
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeed/
|
||||
git clone https://github.com/microsoft/DeepSpeed/
|
||||
cd DeepSpeed
|
||||
rm -rf build
|
||||
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
|
||||
@ -105,7 +105,7 @@ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capa
|
||||
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeed/
|
||||
git clone https://github.com/microsoft/DeepSpeed/
|
||||
cd DeepSpeed
|
||||
rm -rf build
|
||||
TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
|
||||
@ -142,7 +142,7 @@ _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24
|
||||
|
||||
您也可以完全省略 `TORCH_CUDA_ARCH_LIST`,然后构建程序将自动查询构建所在的 GPU 的架构。这可能与目标机器上的 GPU 不匹配,因此最好明确指定所需的架构。
|
||||
|
||||
如果尝试了所有建议的方法仍然遇到构建问题,请继续在 [Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues)的 GitHub Issue 上提交问题。
|
||||
如果尝试了所有建议的方法仍然遇到构建问题,请继续在 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues)的 GitHub Issue 上提交问题。
|
||||
|
||||
|
||||
<a id='deepspeed-multi-gpu'></a>
|
||||
@ -471,10 +471,10 @@ deepspeed examples/pytorch/translation/run_translation.py ...
|
||||
|
||||
有关可以在 DeepSpeed 配置文件中使用的完整配置选项的详细指南,请参阅[以下文档](https://www.deepspeed.ai/docs/config-json/)。
|
||||
|
||||
您可以在 [DeepSpeedExamples 仓库](https://github.com/deepspeedai/DeepSpeedExamples)中找到解决各种实际需求的数十个 DeepSpeed 配置示例。
|
||||
您可以在 [DeepSpeedExamples 仓库](https://github.com/microsoft/DeepSpeedExamples)中找到解决各种实际需求的数十个 DeepSpeed 配置示例。
|
||||
|
||||
```bash
|
||||
git clone https://github.com/deepspeedai/DeepSpeedExamples
|
||||
git clone https://github.com/microsoft/DeepSpeedExamples
|
||||
cd DeepSpeedExamples
|
||||
find . -name '*json'
|
||||
```
|
||||
@ -485,7 +485,7 @@ find . -name '*json'
|
||||
grep -i Lamb $(find . -name '*json')
|
||||
```
|
||||
|
||||
还可以在[主仓](https://github.com/deepspeedai/DeepSpeed)中找到更多示例。
|
||||
还可以在[主仓](https://github.com/microsoft/DeepSpeed)中找到更多示例。
|
||||
|
||||
在使用 DeepSpeed 时,您总是需要提供一个 DeepSpeed 配置文件,但是一些配置参数必须通过命令行进行配置。您将在本指南的剩余章节找到这些细微差别。
|
||||
|
||||
@ -797,7 +797,7 @@ ZeRO-Infinity 通过使用 NVMe 内存扩展 GPU 和 CPU 内存,从而允许
|
||||
|
||||
确保您的 `nvme_path` 实际上是一个 NVMe,因为它与普通硬盘或 SSD 一起工作,但速度会慢得多。快速可扩展的训练是根据现代 NVMe 传输速度设计的(截至本文撰写时,可以达到 ~3.5GB/s 读取,~3GB/s 写入的峰值速度)。
|
||||
|
||||
为了找出最佳的 `aio` 配置块,您必须在目标设置上运行一个基准测试,具体操作请参见[说明](https://github.com/deepspeedai/DeepSpeed/issues/998)。
|
||||
为了找出最佳的 `aio` 配置块,您必须在目标设置上运行一个基准测试,具体操作请参见[说明](https://github.com/microsoft/DeepSpeed/issues/998)。
|
||||
|
||||
|
||||
|
||||
@ -1789,7 +1789,7 @@ SW: Model with 2783M total params, 65M largest layer params.
|
||||
|
||||
因此,如果问题明显与DeepSpeed相关,例如您可以看到有一个异常并且可以看到DeepSpeed模块涉及其中,请先重新测试没有DeepSpeed的设置。只有当问题仍然存在时,才向Deepspeed提供所有必需的细节。
|
||||
|
||||
- 如果您明确问题是在Deepspeed核心中而不是集成部分,请直接向[Deepspeed](https://github.com/deepspeedai/DeepSpeed/)提交问题。如果您不确定,请不要担心,无论使用哪个issue跟踪问题都可以,一旦您发布问题,我们会弄清楚并将其重定向到另一个issue跟踪(如果需要的话)。
|
||||
- 如果您明确问题是在Deepspeed核心中而不是集成部分,请直接向[Deepspeed](https://github.com/microsoft/DeepSpeed/)提交问题。如果您不确定,请不要担心,无论使用哪个issue跟踪问题都可以,一旦您发布问题,我们会弄清楚并将其重定向到另一个issue跟踪(如果需要的话)。
|
||||
|
||||
|
||||
|
||||
@ -2086,7 +2086,7 @@ RUN_SLOW=1 pytest tests/deepspeed
|
||||
|
||||
## 主要的DeepSpeed资源
|
||||
|
||||
- [项目GitHub](https://github.com/deepspeedai/DeepSpeed)
|
||||
- [项目GitHub](https://github.com/microsoft/deepspeed)
|
||||
- [使用文档](https://www.deepspeed.ai/getting-started/)
|
||||
- [API文档](https://deepspeed.readthedocs.io/en/latest/index.html)
|
||||
- [博客文章](https://www.microsoft.com/en-us/research/search/?q=deepspeed)
|
||||
@ -2097,4 +2097,4 @@ RUN_SLOW=1 pytest tests/deepspeed
|
||||
- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
|
||||
- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
|
||||
|
||||
最后,请记住,HuggingFace [`Trainer`]仅集成了DeepSpeed,因此如果您在使用DeepSpeed时遇到任何问题或疑问,请在[DeepSpeed GitHub](https://github.com/deepspeedai/DeepSpeed/issues)上提交一个issue。
|
||||
最后,请记住,HuggingFace [`Trainer`]仅集成了DeepSpeed,因此如果您在使用DeepSpeed时遇到任何问题或疑问,请在[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues)上提交一个issue。
|
||||
|
||||
@ -182,7 +182,7 @@ my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
|
||||
python -m torch.distributed.launch --nproc_per_node=2 trainer-program.py ...
|
||||
```
|
||||
|
||||
如果你安装了 [`accelerate`](https://github.com/huggingface/accelerate) 或 [`deepspeed`](https://github.com/deepspeedai/DeepSpeed),你还可以通过以下任一方法实现相同的效果:
|
||||
如果你安装了 [`accelerate`](https://github.com/huggingface/accelerate) 或 [`deepspeed`](https://github.com/microsoft/DeepSpeed),你还可以通过以下任一方法实现相同的效果:
|
||||
|
||||
|
||||
```bash
|
||||
@ -281,7 +281,7 @@ export CUDA_VISIBLE_DEVICES=1,0
|
||||
|
||||
[`Trainer`] 已经被扩展,以支持可能显著提高训练时间并适应更大模型的库。
|
||||
|
||||
目前,它支持第三方解决方案 [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) 和 [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html),它们实现了论文 [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054) 的部分内容。
|
||||
目前,它支持第三方解决方案 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html),它们实现了论文 [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054) 的部分内容。
|
||||
|
||||
截至撰写本文,此提供的支持是新的且实验性的。尽管我们欢迎围绕 DeepSpeed 和 PyTorch FSDP 的issues,但我们不再支持 FairScale 集成,因为它已经集成到了 PyTorch 主线(参见 [PyTorch FSDP 集成](#pytorch-fully-sharded-data-parallel))。
|
||||
|
||||
@ -293,7 +293,7 @@ export CUDA_VISIBLE_DEVICES=1,0
|
||||
|
||||
撰写时,Deepspeed 需要在使用之前编译 CUDA C++ 代码。
|
||||
|
||||
虽然所有安装问题都应通过 [Deepspeed](https://github.com/deepspeedai/DeepSpeed/issues) 的 GitHub Issues处理,但在构建依赖CUDA 扩展的任何 PyTorch 扩展时,可能会遇到一些常见问题。
|
||||
虽然所有安装问题都应通过 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues) 的 GitHub Issues处理,但在构建依赖CUDA 扩展的任何 PyTorch 扩展时,可能会遇到一些常见问题。
|
||||
|
||||
因此,如果在执行以下操作时遇到与 CUDA 相关的构建问题:
|
||||
|
||||
|
||||
@ -639,7 +639,7 @@ class DummyModel(DummyPreTrainedModel):
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu"]
|
||||
and attention_mask.device.type == "cuda"
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
|
||||
@ -639,7 +639,7 @@ class Multimodal1TextModel(Multimodal1TextPreTrainedModel):
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu"]
|
||||
and attention_mask.device.type == "cuda"
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
|
||||
@ -644,7 +644,7 @@ class MyNewModel2Model(MyNewModel2PreTrainedModel):
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu"]
|
||||
and attention_mask.device.type == "cuda"
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
|
||||
@ -452,9 +452,11 @@ class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
return model_inputs
|
||||
|
||||
def resize_token_embeddings(
|
||||
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
|
||||
self,
|
||||
new_num_tokens: Optional[int] = None,
|
||||
pad_to_multiple_of=None,
|
||||
) -> nn.Embedding:
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
||||
|
||||
# Update vocab size
|
||||
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
||||
|
||||
@ -561,7 +561,7 @@ class SuperModel(SuperPreTrainedModel):
|
||||
if (
|
||||
self.config._attn_implementation == "sdpa"
|
||||
and attention_mask is not None
|
||||
and attention_mask.device.type in ["cuda", "xpu"]
|
||||
and attention_mask.device.type == "cuda"
|
||||
and not output_attentions
|
||||
):
|
||||
# Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
|
||||
|
||||
@ -70,9 +70,11 @@ class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration):
|
||||
return (embeddings,) + vlm_outputs
|
||||
|
||||
def resize_token_embeddings(
|
||||
self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None, mean_resizing=True
|
||||
self,
|
||||
new_num_tokens: Optional[int] = None,
|
||||
pad_to_multiple_of=None,
|
||||
) -> nn.Embedding:
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
||||
|
||||
# Update vocab size
|
||||
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
||||
|
||||
@ -271,10 +271,6 @@ class DataTrainingArguments:
|
||||
)
|
||||
},
|
||||
)
|
||||
use_fast: Optional[bool] = field(
|
||||
default=True,
|
||||
metadata={"help": "Use a fast torchvision-base image processor if it is supported for a given model."},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -431,7 +427,6 @@ def main():
|
||||
size={"max_height": data_args.image_square_size, "max_width": data_args.image_square_size},
|
||||
do_pad=True,
|
||||
pad_size={"height": data_args.image_square_size, "width": data_args.image_square_size},
|
||||
use_fast=data_args.use_fast,
|
||||
**common_pretrained_args,
|
||||
)
|
||||
|
||||
|
||||
@ -256,12 +256,6 @@ def parse_args():
|
||||
default=1333,
|
||||
help="Image longest size will be resized to this value, then image will be padded to square.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--use_fast",
|
||||
type=bool,
|
||||
default=True,
|
||||
help="Use a fast torchvision-base image processor if it is supported for a given model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--cache_dir",
|
||||
type=str,
|
||||
@ -488,7 +482,6 @@ def main():
|
||||
size={"max_height": args.image_square_size, "max_width": args.image_square_size},
|
||||
do_pad=True,
|
||||
pad_size={"height": args.image_square_size, "width": args.image_square_size},
|
||||
use_fast=args.use_fast,
|
||||
**common_pretrained_args,
|
||||
)
|
||||
|
||||
|
||||
@ -680,7 +680,8 @@ def main():
|
||||
# Instantiate custom data collator
|
||||
data_collator = DataCollatorCTCWithPadding(processor=processor)
|
||||
|
||||
decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm], ["bias", "layernorm", "rmsnorm"])
|
||||
decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": [p for n, p in model.named_parameters() if n in decay_parameters],
|
||||
|
||||
@ -144,6 +144,7 @@ You can open any page of the documentation as a notebook in Colab (there is a bu
|
||||
| Notebook | Description | | |
|
||||
|:----------|:-------------|:-------------|------:|
|
||||
| [How to quantize a model with ONNX Runtime for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| Show how to apply static and dynamic quantization on a model using [ONNX Runtime](https://github.com/microsoft/onnxruntime) for any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_ort.ipynb)|
|
||||
| [How to quantize a model with Intel Neural Compressor for text classification](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| Show how to apply static, dynamic and aware training quantization on a model using [Intel Neural Compressor (INC)](https://github.com/intel/neural-compressor) for any GLUE task. | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_quantization_inc.ipynb)|
|
||||
| [How to fine-tune a model on text classification with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| Show how to preprocess the data and fine-tune a model on any GLUE task using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/text_classification_ort.ipynb)|
|
||||
| [How to fine-tune a model on summarization with ONNX Runtime](https://github.com/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| Show how to preprocess the data and fine-tune a model on XSUM using [ONNX Runtime](https://github.com/microsoft/onnxruntime). | [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)| [](https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/examples/summarization_ort.ipynb)|
|
||||
|
||||
|
||||
@@ -52,5 +52,3 @@ markers = [
    "bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests",
    "generate: marks tests that use the GenerationTesterMixin"
]
log_cli = 1
log_cli_level = "WARNING"
|
||||
@ -328,7 +328,6 @@ _import_structure = {
|
||||
"CTRLTokenizer",
|
||||
],
|
||||
"models.cvt": ["CvtConfig"],
|
||||
"models.dab_detr": ["DabDetrConfig"],
|
||||
"models.dac": ["DacConfig", "DacFeatureExtractor"],
|
||||
"models.data2vec": [
|
||||
"Data2VecAudioConfig",
|
||||
@ -400,7 +399,6 @@ _import_structure = {
|
||||
"models.deprecated.vit_hybrid": ["ViTHybridConfig"],
|
||||
"models.deprecated.xlm_prophetnet": ["XLMProphetNetConfig"],
|
||||
"models.depth_anything": ["DepthAnythingConfig"],
|
||||
"models.depth_pro": ["DepthProConfig"],
|
||||
"models.detr": ["DetrConfig"],
|
||||
"models.dialogpt": [],
|
||||
"models.diffllama": ["DiffLlamaConfig"],
|
||||
@ -478,11 +476,6 @@ _import_structure = {
|
||||
],
|
||||
"models.glm": ["GlmConfig"],
|
||||
"models.glpn": ["GLPNConfig"],
|
||||
"models.got_ocr2": [
|
||||
"GotOcr2Config",
|
||||
"GotOcr2Processor",
|
||||
"GotOcr2VisionConfig",
|
||||
],
|
||||
"models.gpt2": [
|
||||
"GPT2Config",
|
||||
"GPT2Tokenizer",
|
||||
@ -749,7 +742,6 @@ _import_structure = {
|
||||
"RoFormerTokenizer",
|
||||
],
|
||||
"models.rt_detr": ["RTDetrConfig", "RTDetrResNetConfig"],
|
||||
"models.rt_detr_v2": ["RTDetrV2Config"],
|
||||
"models.rwkv": ["RwkvConfig"],
|
||||
"models.sam": [
|
||||
"SamConfig",
|
||||
@ -897,7 +889,6 @@ _import_structure = {
|
||||
"models.yolos": ["YolosConfig"],
|
||||
"models.yoso": ["YosoConfig"],
|
||||
"models.zamba": ["ZambaConfig"],
|
||||
"models.zamba2": ["Zamba2Config"],
|
||||
"models.zoedepth": ["ZoeDepthConfig"],
|
||||
"onnx": [],
|
||||
"pipelines": [
|
||||
@ -1238,7 +1229,6 @@ else:
|
||||
_import_structure["models.deprecated.efficientformer"].append("EfficientFormerImageProcessor")
|
||||
_import_structure["models.deprecated.tvlt"].append("TvltImageProcessor")
|
||||
_import_structure["models.deprecated.vit_hybrid"].extend(["ViTHybridImageProcessor"])
|
||||
_import_structure["models.depth_pro"].extend(["DepthProImageProcessor", "DepthProImageProcessorFast"])
|
||||
_import_structure["models.detr"].extend(["DetrFeatureExtractor", "DetrImageProcessor"])
|
||||
_import_structure["models.donut"].extend(["DonutFeatureExtractor", "DonutImageProcessor"])
|
||||
_import_structure["models.dpt"].extend(["DPTFeatureExtractor", "DPTImageProcessor"])
|
||||
@ -1247,7 +1237,6 @@ else:
|
||||
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
|
||||
_import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
|
||||
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
|
||||
_import_structure["models.got_ocr2"].extend(["GotOcr2ImageProcessor"])
|
||||
_import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
|
||||
_import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
|
||||
_import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"])
|
||||
@ -1311,20 +1300,11 @@ except OptionalDependencyNotAvailable:
|
||||
]
|
||||
else:
|
||||
_import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"]
|
||||
_import_structure["models.blip"].append("BlipImageProcessorFast")
|
||||
_import_structure["models.clip"].append("CLIPImageProcessorFast")
|
||||
_import_structure["models.convnext"].append("ConvNextImageProcessorFast")
|
||||
_import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast")
|
||||
_import_structure["models.deit"].append("DeiTImageProcessorFast")
|
||||
_import_structure["models.depth_pro"].append("DepthProImageProcessorFast")
|
||||
_import_structure["models.detr"].append("DetrImageProcessorFast")
|
||||
_import_structure["models.llava"].append("LlavaImageProcessorFast")
|
||||
_import_structure["models.llava_next"].append("LlavaNextImageProcessorFast")
|
||||
_import_structure["models.llava_onevision"].append("LlavaOnevisionImageProcessorFast")
|
||||
_import_structure["models.pixtral"].append("PixtralImageProcessorFast")
|
||||
_import_structure["models.qwen2_vl"].append("Qwen2VLImageProcessorFast")
|
||||
_import_structure["models.rt_detr"].append("RTDetrImageProcessorFast")
|
||||
_import_structure["models.siglip"].append("SiglipImageProcessorFast")
|
||||
_import_structure["models.vit"].append("ViTImageProcessorFast")
|
||||
|
||||
try:
|
||||
@ -1911,13 +1891,6 @@ else:
|
||||
"CvtPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.dab_detr"].extend(
|
||||
[
|
||||
"DabDetrForObjectDetection",
|
||||
"DabDetrModel",
|
||||
"DabDetrPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.dac"].extend(
|
||||
[
|
||||
"DacModel",
|
||||
@ -2184,13 +2157,6 @@ else:
|
||||
"DepthAnythingPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.depth_pro"].extend(
|
||||
[
|
||||
"DepthProForDepthEstimation",
|
||||
"DepthProModel",
|
||||
"DepthProPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.detr"].extend(
|
||||
[
|
||||
"DetrForObjectDetection",
|
||||
@ -2459,12 +2425,6 @@ else:
|
||||
"GLPNPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.got_ocr2"].extend(
|
||||
[
|
||||
"GotOcr2ForConditionalGeneration",
|
||||
"GotOcr2PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.gpt2"].extend(
|
||||
[
|
||||
"GPT2DoubleHeadsModel",
|
||||
@ -3465,9 +3425,6 @@ else:
|
||||
"RTDetrResNetPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.rt_detr_v2"].extend(
|
||||
["RTDetrV2ForObjectDetection", "RTDetrV2Model", "RTDetrV2PreTrainedModel"]
|
||||
)
|
||||
_import_structure["models.rwkv"].extend(
|
||||
[
|
||||
"RwkvForCausalLM",
|
||||
@ -4032,14 +3989,6 @@ else:
|
||||
"ZambaPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.zamba2"].extend(
|
||||
[
|
||||
"Zamba2ForCausalLM",
|
||||
"Zamba2ForSequenceClassification",
|
||||
"Zamba2Model",
|
||||
"Zamba2PreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.zoedepth"].extend(
|
||||
[
|
||||
"ZoeDepthForDepthEstimation",
|
||||
@ -5417,9 +5366,6 @@ if TYPE_CHECKING:
|
||||
CTRLTokenizer,
|
||||
)
|
||||
from .models.cvt import CvtConfig
|
||||
from .models.dab_detr import (
|
||||
DabDetrConfig,
|
||||
)
|
||||
from .models.dac import (
|
||||
DacConfig,
|
||||
DacFeatureExtractor,
|
||||
@ -5508,7 +5454,6 @@ if TYPE_CHECKING:
|
||||
XLMProphetNetConfig,
|
||||
)
|
||||
from .models.depth_anything import DepthAnythingConfig
|
||||
from .models.depth_pro import DepthProConfig
|
||||
from .models.detr import DetrConfig
|
||||
from .models.diffllama import DiffLlamaConfig
|
||||
from .models.dinat import DinatConfig
|
||||
@ -5586,7 +5531,6 @@ if TYPE_CHECKING:
|
||||
)
|
||||
from .models.glm import GlmConfig
|
||||
from .models.glpn import GLPNConfig
|
||||
from .models.got_ocr2 import GotOcr2Config, GotOcr2Processor, GotOcr2VisionConfig
|
||||
from .models.gpt2 import (
|
||||
GPT2Config,
|
||||
GPT2Tokenizer,
|
||||
@ -5890,7 +5834,6 @@ if TYPE_CHECKING:
|
||||
RTDetrConfig,
|
||||
RTDetrResNetConfig,
|
||||
)
|
||||
from .models.rt_detr_v2 import RTDetrV2Config
|
||||
from .models.rwkv import RwkvConfig
|
||||
from .models.sam import (
|
||||
SamConfig,
|
||||
@ -6061,7 +6004,6 @@ if TYPE_CHECKING:
|
||||
from .models.yolos import YolosConfig
|
||||
from .models.yoso import YosoConfig
|
||||
from .models.zamba import ZambaConfig
|
||||
from .models.zamba2 import Zamba2Config
|
||||
from .models.zoedepth import ZoeDepthConfig
|
||||
|
||||
# Pipelines
|
||||
@ -6378,7 +6320,6 @@ if TYPE_CHECKING:
|
||||
from .models.deprecated.efficientformer import EfficientFormerImageProcessor
|
||||
from .models.deprecated.tvlt import TvltImageProcessor
|
||||
from .models.deprecated.vit_hybrid import ViTHybridImageProcessor
|
||||
from .models.depth_pro import DepthProImageProcessor, DepthProImageProcessorFast
|
||||
from .models.detr import DetrFeatureExtractor, DetrImageProcessor
|
||||
from .models.donut import DonutFeatureExtractor, DonutImageProcessor
|
||||
from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
|
||||
@ -6391,7 +6332,6 @@ if TYPE_CHECKING:
|
||||
)
|
||||
from .models.fuyu import FuyuImageProcessor, FuyuProcessor
|
||||
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
|
||||
from .models.got_ocr2 import GotOcr2ImageProcessor
|
||||
from .models.grounding_dino import GroundingDinoImageProcessor
|
||||
from .models.idefics import IdeficsImageProcessor
|
||||
from .models.idefics2 import Idefics2ImageProcessor
|
||||
@ -6467,20 +6407,11 @@ if TYPE_CHECKING:
|
||||
from .utils.dummy_torchvision_objects import *
|
||||
else:
|
||||
from .image_processing_utils_fast import BaseImageProcessorFast
|
||||
from .models.blip import BlipImageProcessorFast
|
||||
from .models.clip import CLIPImageProcessorFast
|
||||
from .models.convnext import ConvNextImageProcessorFast
|
||||
from .models.deformable_detr import DeformableDetrImageProcessorFast
|
||||
from .models.deit import DeiTImageProcessorFast
|
||||
from .models.depth_pro import DepthProImageProcessorFast
|
||||
from .models.detr import DetrImageProcessorFast
|
||||
from .models.llava import LlavaImageProcessorFast
|
||||
from .models.llava_next import LlavaNextImageProcessorFast
|
||||
from .models.llava_onevision import LlavaOnevisionImageProcessorFast
|
||||
from .models.pixtral import PixtralImageProcessorFast
|
||||
from .models.qwen2_vl import Qwen2VLImageProcessorFast
|
||||
from .models.rt_detr import RTDetrImageProcessorFast
|
||||
from .models.siglip import SiglipImageProcessorFast
|
||||
from .models.vit import ViTImageProcessorFast
|
||||
|
||||
try:
|
||||
@ -6971,11 +6902,6 @@ if TYPE_CHECKING:
|
||||
CvtModel,
|
||||
CvtPreTrainedModel,
|
||||
)
|
||||
from .models.dab_detr import (
|
||||
DabDetrForObjectDetection,
|
||||
DabDetrModel,
|
||||
DabDetrPreTrainedModel,
|
||||
)
|
||||
from .models.dac import (
|
||||
DacModel,
|
||||
DacPreTrainedModel,
|
||||
@ -7191,11 +7117,6 @@ if TYPE_CHECKING:
|
||||
DepthAnythingForDepthEstimation,
|
||||
DepthAnythingPreTrainedModel,
|
||||
)
|
||||
from .models.depth_pro import (
|
||||
DepthProForDepthEstimation,
|
||||
DepthProModel,
|
||||
DepthProPreTrainedModel,
|
||||
)
|
||||
from .models.detr import (
|
||||
DetrForObjectDetection,
|
||||
DetrForSegmentation,
|
||||
@ -7415,10 +7336,6 @@ if TYPE_CHECKING:
|
||||
GLPNModel,
|
||||
GLPNPreTrainedModel,
|
||||
)
|
||||
from .models.got_ocr2 import (
|
||||
GotOcr2ForConditionalGeneration,
|
||||
GotOcr2PreTrainedModel,
|
||||
)
|
||||
from .models.gpt2 import (
|
||||
GPT2DoubleHeadsModel,
|
||||
GPT2ForQuestionAnswering,
|
||||
@ -8194,7 +8111,6 @@ if TYPE_CHECKING:
|
||||
RTDetrResNetBackbone,
|
||||
RTDetrResNetPreTrainedModel,
|
||||
)
|
||||
from .models.rt_detr_v2 import RTDetrV2ForObjectDetection, RTDetrV2Model, RTDetrV2PreTrainedModel
|
||||
from .models.rwkv import (
|
||||
RwkvForCausalLM,
|
||||
RwkvModel,
|
||||
@ -8626,12 +8542,6 @@ if TYPE_CHECKING:
|
||||
ZambaModel,
|
||||
ZambaPreTrainedModel,
|
||||
)
|
||||
from .models.zamba2 import (
|
||||
Zamba2ForCausalLM,
|
||||
Zamba2ForSequenceClassification,
|
||||
Zamba2Model,
|
||||
Zamba2PreTrainedModel,
|
||||
)
|
||||
from .models.zoedepth import (
|
||||
ZoeDepthForDepthEstimation,
|
||||
ZoeDepthPreTrainedModel,
|
||||
|
||||
@@ -217,7 +217,6 @@ ACT2CLS = {
    "silu": nn.SiLU,
    "swish": nn.SiLU,
    "tanh": nn.Tanh,
    "prelu": nn.PReLU,
}
ACT2FN = ClassInstantier(ACT2CLS)
|
||||
|
||||
|
||||
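For context, the `ACT2FN` mapping built here is what model code indexes by the activation name from a config; `ClassInstantier` instantiates the class on lookup. A short usage sketch, assuming a recent transformers install:

```python
import torch
from transformers.activations import ACT2FN

act = ACT2FN["silu"]  # the lookup returns an instantiated nn.SiLU()
print(act(torch.tensor([-1.0, 0.0, 1.0])))
```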
@@ -146,7 +146,7 @@ def chroma_filter_bank(
    sampling_rate: int,
    tuning: float = 0.0,
    power: Optional[float] = 2.0,
    weighting_parameters: Optional[Tuple[float, float]] = (5.0, 2.0),
    weighting_parameters: Optional[Tuple[float]] = (5.0, 2),
    start_at_c_chroma: Optional[bool] = True,
):
    """
@@ -165,7 +165,7 @@ def chroma_filter_bank(
            Tuning deviation from A440 in fractions of a chroma bin.
        power (`float`, *optional*, defaults to 2.0):
            If 2.0, normalizes each column with its L2 norm. If 1.0, normalizes each column with its L1 norm.
        weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`):
        weighting_parameters (`Tuple[float]`, *optional*, defaults to `(5., 2.)`):
            If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and
            the second element being the Gaussian half-width.
        start_at_c_chroma (`bool`, *optional*, defaults to `True`):
|
||||
|
||||
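A hedged call sketch for the signature being edited above. The leading `num_frequency_bins`/`num_chroma` argument names are assumptions (they sit above the part of the signature shown in this hunk) and the values are illustrative only:

```python
from transformers.audio_utils import chroma_filter_bank

chroma_filters = chroma_filter_bank(
    num_frequency_bins=257,   # assumed parameter name, not visible in this hunk
    num_chroma=12,            # assumed parameter name, not visible in this hunk
    sampling_rate=16000,
    tuning=0.0,
    power=2.0,
    weighting_parameters=(5.0, 2.0),
    start_at_c_chroma=True,
)
print(chroma_filters.shape)  # one filter per chroma bin
```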
@@ -1,5 +1,6 @@
import copy
import importlib.metadata
import inspect
import json
import os
from dataclasses import dataclass
@@ -9,12 +10,7 @@ import torch
from packaging import version

from .configuration_utils import PretrainedConfig
from .utils import (
    is_hqq_available,
    is_optimum_quanto_available,
    is_torchdynamo_compiling,
    logging,
)
from .utils import is_hqq_available, is_optimum_quanto_available, logging
from .utils.deprecation import deprecate_kwarg
|
||||
|
||||
|
||||
@@ -24,15 +20,82 @@ if is_hqq_available():
logger = logging.get_logger(__name__)


class Cache(torch.nn.Module):
class Cache(torch.Tensor):
    """
    Base, abstract class for all caches. The actual data structure is specific to each subclass.
    """

    is_compileable = False
    @staticmethod
    def __new__(cls, *args, **kwargs):
        # We use a tensor wrapper to allow for torch script tracing when using the cache as an input in a forward method
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
wrapper_kwargs = {}
|
||||
init_signature = inspect.signature(cls.__init__)
|
||||
init_arguments = list(init_signature.parameters.keys())
|
||||
init_defaults = {
|
||||
k: v.default for k, v in init_signature.parameters.items() if v.default is not inspect.Parameter.empty
|
||||
}
|
||||
|
||||
for argument in ["dtype", "device"]:
|
||||
if argument in init_arguments:
|
||||
arg_idx = init_arguments.index(argument)
|
||||
if len(args) > arg_idx and args[arg_idx] is not None:
|
||||
wrapper_kwargs[argument] = args[arg_idx]
|
||||
elif kwargs.get(argument, None) is not None:
|
||||
wrapper_kwargs[argument] = kwargs[argument]
|
||||
elif init_defaults[argument] is not None:
|
||||
wrapper_kwargs[argument] = init_defaults[argument]
|
||||
|
||||
if "cache_config" in init_arguments:
|
||||
cache_config_idx = init_arguments.index("cache_config")
|
||||
if len(args) > cache_config_idx and args[cache_config_idx] is not None:
|
||||
wrapper_kwargs["device"] = args[cache_config_idx].device
|
||||
elif kwargs.get("cache_config", None) is not None:
|
||||
wrapper_kwargs["device"] = kwargs["cache_config"].device
|
||||
elif init_defaults["cache_config"] is not None:
|
||||
wrapper_kwargs["device"] = init_defaults["cache_config"].device
|
||||
|
||||
self = torch.Tensor._make_wrapper_subclass(cls, (), **wrapper_kwargs, requires_grad=False)
|
||||
# we create a dummy empty tensor for generic tensor flattening/unflattening
|
||||
self._empty_tensor = torch.tensor([], **wrapper_kwargs, requires_grad=False)
|
||||
return self
|
||||
|
||||
    @classmethod
    def __torch_dispatch__(cls, func, types, args, kwargs):
        assert (
            func.__name__ in cls.__dict__
        ), f"Class {cls.__name__} is a tensor wrapper and does not implement method {func.__name__}"
        return getattr(cls, func.__name__)(*args, **kwargs)

    def __repr__(self):
        return f"{self.__class__.__name__}()"

    def __bool__(self):
        # Many call sites check for an existing cache with `if past_key_values:`; keeping the wrapper truthy
        # makes those checks behave the same as `if past_key_values is not None:`.
        return self is not None  # always True for an instantiated cache
|
||||
|
||||
def to(self, *args, **kwargs):
|
||||
# originals
|
||||
wrapper_kwargs = {"dtype": getattr(self, "dtype", None), "device": getattr(self, "device", None)}
|
||||
|
||||
# overrides
|
||||
for arg in list(args) + list(kwargs.values()):
|
||||
if isinstance(arg, (torch.device, str, int)):
|
||||
wrapper_kwargs["device"] = arg
|
||||
elif isinstance(arg, torch.dtype):
|
||||
wrapper_kwargs["dtype"] = arg
|
||||
|
||||
# new wrapper
|
||||
new_self = torch.Tensor._make_wrapper_subclass(self.__class__, (), **wrapper_kwargs)
|
||||
new_self.__dict__ = {k: v for k, v in self.__dict__.items() if k not in ["device", "dtype"]}
|
||||
return new_self
|
||||
|
||||
def clone(self):
|
||||
wrapper_kwargs = {"dtype": getattr(self, "dtype", None), "device": getattr(self, "device", None)}
|
||||
new_self = torch.Tensor._make_wrapper_subclass(self.__class__, (), **wrapper_kwargs, requires_grad=False)
|
||||
new_self.__dict__ = copy.deepcopy(self.__dict__)
|
||||
return new_self
|
||||
|
||||
def update(
|
||||
self,
|
||||
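The `__new__`/`__torch_dispatch__` machinery above follows PyTorch's wrapper-subclass pattern. Below is a minimal, hypothetical sketch of that pattern for readers unfamiliar with `torch.Tensor._make_wrapper_subclass`; it is not the transformers implementation, only an illustration of why a cache object can masquerade as a tensor:

```python
import torch


class TensorWrapper(torch.Tensor):
    @staticmethod
    def __new__(cls, payload=None, device=None, dtype=None):
        # The wrapper holds no tensor data itself: shape () and requires_grad=False, as in the Cache above.
        self = torch.Tensor._make_wrapper_subclass(cls, (), device=device, dtype=dtype, requires_grad=False)
        self.payload = payload if payload is not None else []
        return self

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        # Refuse ops the subclass does not explicitly implement, mirroring the assert in Cache.
        raise NotImplementedError(f"{cls.__name__} does not implement {func}")

    def __repr__(self):
        return f"{self.__class__.__name__}(n_items={len(self.payload)})"


wrapper = TensorWrapper(device="cpu", dtype=torch.float32)
print(isinstance(wrapper, torch.Tensor), wrapper.device, wrapper.dtype)  # True cpu torch.float32
```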
@@ -306,7 +369,7 @@ class StaticCacheConfig(CacheConfig):

    cache_implementation = "static"

    def __init__(self, batch_size: int, max_cache_len: int, device="cpu"):
    def __init__(self, batch_size: int, max_cache_len: int, device: Union[str, torch.device] = torch.device("cpu")):
        self.batch_size = batch_size
        self.max_cache_len = max_cache_len
        self.device = device
|
||||
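A short usage sketch of the widened signature above (the import path is an assumption; values are illustrative):

```python
import torch
from transformers.cache_utils import StaticCacheConfig

cache_config = StaticCacheConfig(batch_size=2, max_cache_len=1024, device=torch.device("cuda:0"))
```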
@@ -363,6 +426,16 @@ class DynamicCache(Cache):
    ```
    """

    def __tensor_flatten__(self):
        return ["_empty_tensor"], {"_seen_tokens": self._seen_tokens}

    @staticmethod
    def __tensor_unflatten__(inner_tensors, meta, _, __):
        cache = DynamicCache()
        cache._seen_tokens = meta["_seen_tokens"]
        cache._empty_tensor = inner_tensors["_empty_tensor"]
        return cache

    @deprecate_kwarg("num_hidden_layers", version="4.47.0")
    def __init__(self, num_hidden_layers: Optional[int] = None) -> None:
        super().__init__()
|
||||
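A hedged illustration of how the new flatten/unflatten pair round-trips a cache. In practice these hooks are called by torch's tracing machinery rather than by user code; the direct calls below only demonstrate the contract defined in this diff:

```python
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
inner_names, metadata = cache.__tensor_flatten__()
inner_tensors = {name: getattr(cache, name) for name in inner_names}
rebuilt = DynamicCache.__tensor_unflatten__(inner_tensors, metadata, None, None)
assert rebuilt._seen_tokens == cache._seen_tokens
```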
@ -450,7 +523,7 @@ class DynamicCache(Cache):
|
||||
or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it
|
||||
or len(self.key_cache[layer_idx]) == 0 # the layer has no cache
|
||||
)
|
||||
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
|
||||
layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else torch.tensor(0)
|
||||
return layer_seq_length
|
||||
|
||||
def get_max_cache_shape(self) -> Optional[int]:
|
||||
@ -677,9 +750,6 @@ class QuantizedCache(DynamicCache):
|
||||
self.axis_key = cache_config.axis_key
|
||||
self.axis_value = cache_config.axis_value
|
||||
self.compute_dtype = cache_config.compute_dtype
|
||||
self.device = cache_config.device
|
||||
|
||||
super().__init__()
|
||||
|
||||
def update(
|
||||
self,
|
||||
@ -779,7 +849,7 @@ class QuantoQuantizedCache(QuantizedCache):
|
||||
raise ImportError(
|
||||
f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}."
|
||||
)
|
||||
from optimum.quanto import MaxOptimizer, qint2, qint4
|
||||
from optimum.quanto import MaxOptimizer, qint2, qint4 # type: ignore
|
||||
|
||||
if self.nbits not in [2, 4]:
|
||||
raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}")
|
||||
@ -798,7 +868,7 @@ class QuantoQuantizedCache(QuantizedCache):
|
||||
def _quantize(self, tensor, axis):
|
||||
# We have two different API since in optimum-quanto, we don't use AffineQuantizer anymore
|
||||
if is_optimum_quanto_available():
|
||||
from optimum.quanto import quantize_weight
|
||||
from optimum.quanto import quantize_weight # type: ignore
|
||||
|
||||
scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size)
|
||||
qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size)
|
||||
@ -1100,8 +1170,6 @@ class StaticCache(Cache):
|
||||
```
|
||||
"""
|
||||
|
||||
is_compileable = True
|
||||
|
||||
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
|
||||
@deprecate_kwarg("layer_device_map", version="4.52.0")
|
||||
def __init__(
|
||||
@ -1109,7 +1177,7 @@ class StaticCache(Cache):
|
||||
config: PretrainedConfig,
|
||||
batch_size: int = None,
|
||||
max_cache_len: int = None,
|
||||
device: torch.device = None,
|
||||
device: Union[torch.device, str] = torch.device("meta"),
|
||||
dtype: torch.dtype = torch.float32,
|
||||
max_batch_size: Optional[int] = None,
|
||||
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
|
||||
@ -1120,7 +1188,6 @@ class StaticCache(Cache):
|
||||
f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
|
||||
"v4.49. Use the more precisely named 'max_batch_size' argument instead."
|
||||
)
|
||||
|
||||
self.max_batch_size = batch_size or max_batch_size
|
||||
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
|
||||
|
||||
@ -1129,8 +1196,6 @@ class StaticCache(Cache):
|
||||
config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
|
||||
)
|
||||
|
||||
self.dtype = dtype
|
||||
self.device = torch.device(device) if device is not None else torch.device("meta")
|
||||
self.num_key_value_heads = (
|
||||
config.num_attention_heads
|
||||
if getattr(config, "num_key_value_heads", None) is None
|
||||
@@ -1148,18 +1213,10 @@
                layer_device = self.device
            new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
            new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=layer_device)
            # Notes:
            # 1. `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph
            # breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
            # it is not needed anyway)
            # 2. `torch.export()` requires mutations to be registered as buffers.
            if not is_torchdynamo_compiling():
                self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
                self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=layer_device))
                new_layer_key_cache = getattr(self, f"key_cache_{idx}")
                new_layer_value_cache = getattr(self, f"value_cache_{idx}")
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            # Note: `mark_static_address` is used to tag the cache as a fixed data pointer,
            # preventing compiled graph breaks when updating the cache.
            torch._dynamo.mark_static_address(new_layer_key_cache)
            torch._dynamo.mark_static_address(new_layer_value_cache)
            self.key_cache.append(new_layer_key_cache)
            self.value_cache.append(new_layer_value_cache)
|
||||
|
||||
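The change above pre-allocates the per-layer key/value buffers once and tags them with `mark_static_address`. A hedged, standalone sketch of why that matters: a stable data pointer lets CUDA-graph / `torch.compile` replays keep writing into the same memory instead of re-recording. Shapes are illustrative only:

```python
import torch

cache_shape = (1, 8, 128, 64)  # (batch, kv_heads, max_cache_len, head_dim) - assumed layout
key_cache = torch.zeros(cache_shape)
value_cache = torch.zeros(cache_shape)
torch._dynamo.mark_static_address(key_cache)
torch._dynamo.mark_static_address(value_cache)

# a decoding step then updates the caches in place at the current position
position = 3
key_cache[:, :, position : position + 1] = torch.randn(1, 8, 1, 64)
value_cache[:, :, position : position + 1] = torch.randn(1, 8, 1, 64)
```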
@ -1301,7 +1358,6 @@ class SlidingWindowCache(StaticCache):
|
||||
"""
|
||||
|
||||
is_sliding = True
|
||||
is_compileable = True
|
||||
|
||||
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
|
||||
def __init__(
|
||||
@ -1309,7 +1365,7 @@ class SlidingWindowCache(StaticCache):
|
||||
config: PretrainedConfig,
|
||||
batch_size: int = None,
|
||||
max_cache_len: int = None,
|
||||
device: torch.device = None,
|
||||
device: Union[torch.device, str] = torch.device("meta"),
|
||||
dtype: torch.dtype = torch.float32,
|
||||
max_batch_size: Optional[int] = None,
|
||||
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
|
||||
@ -1426,7 +1482,6 @@ class EncoderDecoderCache(Cache):
|
||||
super().__init__()
|
||||
self.self_attention_cache = self_attention_cache
|
||||
self.cross_attention_cache = cross_attention_cache
|
||||
self.is_compileable = getattr(self.self_attention_cache, "is_compileable", False)
|
||||
|
||||
self.is_updated = {}
|
||||
for layer_idx in range(len(cross_attention_cache.key_cache)):
|
||||
@ -1618,8 +1673,6 @@ class HybridCache(Cache):
|
||||
```
|
||||
"""
|
||||
|
||||
is_compileable = True
|
||||
|
||||
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
|
||||
@deprecate_kwarg("layer_device_map", version="4.52.0")
|
||||
def __init__(
|
||||
@ -1627,7 +1680,7 @@ class HybridCache(Cache):
|
||||
config: PretrainedConfig,
|
||||
batch_size: int = None,
|
||||
max_cache_len: int = None,
|
||||
device: Union[torch.device, str] = None,
|
||||
device: Union[torch.device, str] = torch.device("meta"),
|
||||
dtype: torch.dtype = torch.float32,
|
||||
max_batch_size: Optional[int] = None,
|
||||
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
|
||||
@ -1656,7 +1709,6 @@ class HybridCache(Cache):
|
||||
config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads
|
||||
)
|
||||
|
||||
self.device = torch.device(device) if device is not None else torch.device("meta")
|
||||
layer_switch = config.sliding_window_pattern if hasattr(config, "sliding_window_pattern") else 2 # 2 is for BC
|
||||
self.is_sliding = torch.tensor(
|
||||
[bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)], dtype=torch.bool
|
||||
@ -1789,7 +1841,7 @@ class HybridCache(Cache):
|
||||
return self.max_batch_size
|
||||
|
||||
|
||||
class MambaCache:
|
||||
class MambaCache(Cache):
|
||||
"""
|
||||
Cache for mamba model which does not have attention mechanism and key value states.
|
||||
|
||||
@ -1840,15 +1892,13 @@ class MambaCache:
|
||||
```
|
||||
"""
|
||||
|
||||
is_compileable = True
|
||||
|
||||
# TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
batch_size: int = None,
|
||||
dtype: torch.dtype = torch.float16,
|
||||
device: Optional[Union[torch.device, str]] = None,
|
||||
device: Union[torch.device, str] = torch.device("meta"),
|
||||
max_batch_size: Optional[int] = None,
|
||||
):
|
||||
if batch_size is not None:
|
||||
@ -1856,12 +1906,10 @@ class MambaCache:
|
||||
f"The 'batch_size' argument of {self.__class__.__name__} is deprecated and will be removed in "
|
||||
"v4.49. Use the more precisely named 'max_batch_size' argument instead."
|
||||
)
|
||||
self.dtype = dtype
|
||||
self.max_batch_size = batch_size or max_batch_size
|
||||
self.intermediate_size = config.intermediate_size
|
||||
self.ssm_state_size = config.state_size
|
||||
self.conv_kernel_size = config.conv_kernel
|
||||
self.device = torch.device(device) if device is not None else torch.device("meta")
|
||||
|
||||
self.conv_states: List[torch.Tensor] = []
|
||||
self.ssm_states: List[torch.Tensor] = []
|
||||
@ -1985,25 +2033,20 @@ class OffloadedStaticCache(StaticCache):
|
||||
```
|
||||
"""
|
||||
|
||||
is_compileable = True
|
||||
|
||||
@deprecate_kwarg("layer_device_map", version="4.52.0")
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
max_batch_size: int,
|
||||
max_cache_len: Optional[int],
|
||||
device: Union[str, torch.device],
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
device: Union[torch.device, str] = torch.device("meta"),
|
||||
dtype: torch.dtype = torch.float32,
|
||||
offload_device: Union[str, torch.device] = torch.device("cpu"),
|
||||
layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None,
|
||||
) -> None:
|
||||
super(Cache, self).__init__()
|
||||
self.max_batch_size = max_batch_size
|
||||
self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
|
||||
self.device = torch.device(device) if layer_device_map is None else torch.device(layer_device_map[0])
|
||||
self.offload_device = torch.device(offload_device)
|
||||
self.dtype = dtype if dtype is not None else torch.float32
|
||||
|
||||
# Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads
|
||||
head_dim = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads
|
||||
|
||||
@ -1,655 +0,0 @@
|
||||
# Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import re
|
||||
from argparse import ArgumentParser, Namespace
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
from ..utils import logging
|
||||
from . import BaseTransformersCLICommand
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
|
||||
CURRENT_YEAR = date.today().year
|
||||
TRANSFORMERS_PATH = Path(__file__).parent.parent
|
||||
REPO_PATH = TRANSFORMERS_PATH.parent.parent
|
||||
|
||||
|
||||
def add_import_structure_entry_init(content: str, fast_image_processor_name: str, model_name: str):
|
||||
"""
|
||||
Add an entry to the `_import_structure` dictionary in the `__init__.py` file of the transformers package.
|
||||
"""
|
||||
# Step 1: Find the block
|
||||
block_regex = re.compile(
|
||||
r"if not is_torchvision_available\(\):.*?else:\s*(\n(?P<indent>\s+)_import_structure\[.*?\].*?\n(?:\s*(?P=indent)_import_structure\[.*?\].*?\n)*)",
|
||||
re.DOTALL,
|
||||
)
|
||||
match = block_regex.search(content)
|
||||
|
||||
if not match:
|
||||
raise ValueError("Couldn't find the '_import_structure' block.")
|
||||
|
||||
# Capture the block content and indentation
|
||||
block_content = match.group(1)
|
||||
indent = match.group("indent")
|
||||
|
||||
# Step 2: Parse existing entries
|
||||
lines = block_content.strip().split("\n")
|
||||
entries = []
|
||||
|
||||
import_structure_header = indent + lines[0]
|
||||
entries = lines[1:]
|
||||
|
||||
# Add the new entry, maintaining alphabetical order
|
||||
new_entry = f'{indent}_import_structure["models.{model_name}"].append("{fast_image_processor_name}")'
|
||||
if new_entry not in entries:
|
||||
entries.append(new_entry)
|
||||
|
||||
entries.sort()
|
||||
entries = [import_structure_header] + entries
|
||||
|
||||
# Step 3: Reconstruct the block
|
||||
updated_block = "\n".join(entry for entry in entries)
|
||||
|
||||
# Replace the original block in the content
|
||||
updated_content = content[: match.start(1)] + "\n" + updated_block + "\n" + content[match.end(1) :]
|
||||
|
||||
return updated_content
|
||||
|
||||
|
||||
def add_import_statement_init(content: str, fast_image_processor_name: str, model_name: str):
|
||||
"""
|
||||
Add an import statement to the `__init__.py` file of the transformers package.
|
||||
"""
|
||||
# Step 1: Find the block
|
||||
block_regex = re.compile(
|
||||
r"if not is_torchvision_available\(\):\s+raise OptionalDependencyNotAvailable\(\)\s+except OptionalDependencyNotAvailable:\s+from \.utils\.dummy_torchvision_objects import \*\s+else:(?P<else_block>\s*(\n\s*from .+ import .*\n)+)(?=\s*try:\s+if not \(is_torchvision_available\(\) and is_timm_available\(\)\):)",
|
||||
re.DOTALL,
|
||||
)
|
||||
match = block_regex.search(content)
|
||||
|
||||
    if match:
        block_content = match.group("else_block")  # The captured import block
    else:
        raise ValueError("Couldn't find the import statement block.")
|
||||
|
||||
# Step 2: Parse existing entries
|
||||
lines = block_content.strip().split("\n")
|
||||
entries = []
|
||||
|
||||
indent = " " * (len(lines[1]) - len(lines[1].lstrip()))
|
||||
import_structure_header = indent + lines[0]
|
||||
entries = lines[1:]
|
||||
|
||||
# Add the new entry, maintaining alphabetical order
|
||||
new_entry = f"{indent}from .models.{model_name} import {fast_image_processor_name}"
|
||||
if new_entry not in entries:
|
||||
entries.append(new_entry)
|
||||
|
||||
entries.sort()
|
||||
entries = [import_structure_header] + entries
|
||||
|
||||
# Step 3: Reconstruct the block
|
||||
updated_block = "\n".join(entry for entry in entries)
|
||||
|
||||
# Replace the original block in the content
|
||||
updated_content = (
|
||||
content[: match.start("else_block")] + "\n" + updated_block + "\n\n" + content[match.end("else_block") :]
|
||||
)
|
||||
|
||||
return updated_content
|
||||
|
||||
|
||||
def add_fast_image_processor_to_main_init(fast_image_processor_name: str, model_name: str):
|
||||
"""
|
||||
Add the fast image processor to the main __init__.py file of the transformers package.
|
||||
"""
|
||||
with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# add _import_structure entry
|
||||
content = add_import_structure_entry_init(content, fast_image_processor_name, model_name)
|
||||
# add import statement
|
||||
content = add_import_statement_init(content, fast_image_processor_name, model_name)
|
||||
|
||||
# write the updated content
|
||||
with open(TRANSFORMERS_PATH / "__init__.py", "w", encoding="utf-8") as f:
|
||||
f.write(content)
|
||||
|
||||
|
||||
def add_fast_image_processor_to_model_init(
|
||||
fast_image_processing_module_file: str, fast_image_processor_name, model_name: str
|
||||
):
|
||||
"""
|
||||
Add the fast image processor to the __init__.py file of the model.
|
||||
"""
|
||||
with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "")
|
||||
|
||||
if "import *" in content:
|
||||
# we have an init file in the updated format
|
||||
# get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content
|
||||
# Step 1: Find the block
|
||||
block_regex = re.compile(
|
||||
r"if TYPE_CHECKING:\n(?P<if_block>.*?)(?=\s*else:)",
|
||||
re.DOTALL,
|
||||
)
|
||||
match = block_regex.search(content)
|
||||
|
||||
if not match:
|
||||
raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.")
|
||||
|
||||
block_content = match.group("if_block") # The captured import block
|
||||
|
||||
# Step 2: Parse existing entries
|
||||
entries = block_content.split("\n")
|
||||
indent = " " * (len(entries[0]) - len(entries[0].lstrip()))
|
||||
new_entry = f"{indent}from .{fast_image_processing_module_file} import *"
|
||||
if new_entry not in entries:
|
||||
entries.append(new_entry)
|
||||
entries.sort()
|
||||
updated_block = "\n".join(entry for entry in entries)
|
||||
|
||||
# Replace the original block in the content
|
||||
updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :]
|
||||
else:
|
||||
# we have an init file in the old format
|
||||
|
||||
# add "is_torchvision_available" import to from ...utils import (
|
||||
# Regex to match import statements from transformers.utils
|
||||
pattern = r"""
|
||||
from\s+\.\.\.utils\s+import\s+
|
||||
(?: # Non-capturing group for either:
|
||||
([\w, ]+) # 1. Single-line imports (e.g., 'a, b')
|
||||
| # OR
|
||||
\((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)')
|
||||
)
|
||||
"""
|
||||
regex = re.compile(pattern, re.VERBOSE | re.DOTALL)
|
||||
|
||||
def replacement_function(match):
|
||||
# Extract existing imports
|
||||
imports = (match.group(1) or match.group(2)).split(",")
|
||||
imports = imports[:-1] if imports[-1] == "\n" else imports
|
||||
imports = [imp.strip() for imp in imports]
|
||||
|
||||
# Add the new import if not already present
|
||||
if "is_torchvision_available" not in imports:
|
||||
imports.append("is_torchvision_available")
|
||||
imports.sort()
|
||||
|
||||
# Convert to multi-line import in all cases
|
||||
updated_imports = "(\n " + ",\n ".join(imports) + ",\n)"
|
||||
|
||||
return f"from ...utils import {updated_imports}"
|
||||
|
||||
# Replace all matches in the file content
|
||||
updated_content = regex.sub(replacement_function, content)
|
||||
|
||||
vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n'
|
||||
|
||||
added_import_structure_block = (
|
||||
"try:\n if not is_torchvision_available():\n"
|
||||
" raise OptionalDependencyNotAvailable()\n"
|
||||
"except OptionalDependencyNotAvailable:\n"
|
||||
" pass\n"
|
||||
"else:\n"
|
||||
f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n'
|
||||
)
|
||||
|
||||
if vision_import_structure_block not in updated_content:
|
||||
raise ValueError("Couldn't find the 'vision _import_structure block' block.")
|
||||
|
||||
if added_import_structure_block not in updated_content:
|
||||
updated_content = updated_content.replace(
|
||||
vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block
|
||||
)
|
||||
|
||||
vision_import_statement_block = (
|
||||
f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n"
|
||||
)
|
||||
|
||||
added_import_statement_block = (
|
||||
" try:\n if not is_torchvision_available():\n"
|
||||
" raise OptionalDependencyNotAvailable()\n"
|
||||
" except OptionalDependencyNotAvailable:\n"
|
||||
" pass\n"
|
||||
" else:\n"
|
||||
f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n"
|
||||
)
|
||||
|
||||
if vision_import_statement_block not in updated_content:
|
||||
raise ValueError("Couldn't find the 'vision _import_structure block' block.")
|
||||
|
||||
if added_import_statement_block not in updated_content:
|
||||
updated_content = updated_content.replace(
|
||||
vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block
|
||||
)
|
||||
|
||||
# write the updated content
|
||||
with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f:
|
||||
f.write(updated_content)
|
||||
|
||||
|
||||
def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str):
|
||||
"""
|
||||
Add the fast image processor to the auto module.
|
||||
"""
|
||||
with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# get all lines containing the image processor name
|
||||
updated_content = content.replace(
|
||||
f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")'
|
||||
)
|
||||
|
||||
# write the updated content
|
||||
with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f:
|
||||
f.write(updated_content)
|
||||
|
||||
|
||||
def add_fast_image_processor_to_dummy(fast_image_processor_name: str):
|
||||
"""
|
||||
Add the fast image processor to the dummy torchvision objects file.
|
||||
"""
|
||||
dummy_torchvision_objects_file = TRANSFORMERS_PATH / "utils" / "dummy_torchvision_objects.py"
|
||||
with open(dummy_torchvision_objects_file, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# regex to find objects starting with "class " and ending with "ImageProcessorFast", including "ImageProcessorFast" in the match
|
||||
image_processor_names = re.findall(r"class (\w*ImageProcessorFast)", content)
|
||||
image_processor_names.append(fast_image_processor_name)
|
||||
image_processor_names.sort()
|
||||
index_new = image_processor_names.index(fast_image_processor_name)
|
||||
|
||||
new_dummy_object = (
|
||||
f"class {fast_image_processor_name}(metaclass=DummyObject):\n"
|
||||
' _backends = ["torchvision"]\n\n'
|
||||
" def __init__(self, *args, **kwargs):\n"
|
||||
' requires_backends(self, ["torchvision"])\n'
|
||||
)
|
||||
if new_dummy_object not in content:
|
||||
if index_new != len(image_processor_names) - 1:
|
||||
# add the dummy object just before the next ImageProcessorFast
|
||||
first_line = f"class {image_processor_names[index_new+1]}(metaclass=DummyObject):"
|
||||
updated_content = content.replace(first_line, new_dummy_object + "\n\n" + first_line)
|
||||
else:
|
||||
# add the dummy object at the very end
|
||||
updated_content = content + "\n\n" + new_dummy_object
|
||||
|
||||
# write the updated content
|
||||
with open(dummy_torchvision_objects_file, "w", encoding="utf-8") as f:
|
||||
f.write(updated_content)
|
||||
|
||||
|
||||
def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str):
|
||||
"""
|
||||
Add the fast image processor to the model's doc file.
|
||||
"""
|
||||
doc_source = REPO_PATH / "docs" / "source"
|
||||
# find the doc files
|
||||
doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md"))
|
||||
if not doc_files:
|
||||
# try again with "-"
|
||||
doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md"))
|
||||
if not doc_files:
|
||||
raise ValueError(f"No doc files found for {model_name}")
|
||||
|
||||
base_doc_string = (
|
||||
f"## {fast_image_processor_name[:-4]}\n\n" f"[[autodoc]] {fast_image_processor_name[:-4]}\n" " - preprocess"
|
||||
)
|
||||
fast_doc_string = (
|
||||
f"## {fast_image_processor_name}\n\n" f"[[autodoc]] {fast_image_processor_name}\n" " - preprocess"
|
||||
)
|
||||
|
||||
for doc_file in doc_files:
|
||||
with open(doc_file, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
if fast_doc_string not in content:
|
||||
# add the fast image processor to the doc
|
||||
updated_content = content.replace(
|
||||
base_doc_string,
|
||||
base_doc_string + "\n\n" + fast_doc_string,
|
||||
)
|
||||
|
||||
# write the updated content
|
||||
with open(doc_file, "w", encoding="utf-8") as f:
|
||||
f.write(updated_content)
|
||||
|
||||
|
||||
def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str):
|
||||
"""
|
||||
Add the fast image processor to the image processing tests.
|
||||
"""
|
||||
tests_path = REPO_PATH / "tests" / "models" / model_name
|
||||
test_file = tests_path / f"test_image_processing_{model_name}.py"
|
||||
if not os.path.exists(test_file):
|
||||
logger.warning(f"No test file found for {model_name}. Skipping.")
|
||||
return
|
||||
|
||||
with open(test_file, "r", encoding="utf-8") as f:
|
||||
content = f.read()
|
||||
|
||||
# add is_torchvision_available import to the imports
|
||||
# Regex to match import statements from transformers.utils
|
||||
pattern = r"""
|
||||
from\s+transformers\.utils\s+import\s+
|
||||
(?: # Non-capturing group for either:
|
||||
([\w, ]+) # 1. Single-line imports (e.g., 'a, b')
|
||||
| # OR
|
||||
\((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)')
|
||||
)
|
||||
"""
|
||||
regex = re.compile(pattern, re.VERBOSE | re.DOTALL)
|
||||
|
||||
def replacement_function(match):
|
||||
# Extract existing imports
|
||||
existing_imports = (match.group(1) or match.group(2)).split(",")
|
||||
existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports
|
||||
existing_imports = [imp.strip() for imp in existing_imports]
|
||||
|
||||
# Add the new import if not already present
|
||||
if "is_torchvision_available" not in existing_imports:
|
||||
existing_imports.append("is_torchvision_available")
|
||||
existing_imports.sort()
|
||||
|
||||
# Rebuild the import statement
|
||||
if match.group(1): # Single-line import
|
||||
updated_imports = ", ".join(existing_imports)
|
||||
else: # Multi-line import
|
||||
updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)"
|
||||
|
||||
return f"from transformers.utils import {updated_imports}"
|
||||
|
||||
# Replace all matches in the file content
|
||||
updated_content = regex.sub(replacement_function, content)
|
||||
|
||||
# add the fast image processor to the imports
|
||||
base_import_string = f" from transformers import {fast_image_processor_name[:-4]}"
|
||||
fast_import_string = (
|
||||
" if is_torchvision_available():\n" f" from transformers import {fast_image_processor_name}"
|
||||
)
|
||||
if fast_import_string not in updated_content:
|
||||
updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string)
|
||||
|
||||
# get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = "
|
||||
image_processing_class_line = re.search(r" image_processing_class = .*", updated_content)
|
||||
if not image_processing_class_line:
|
||||
logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. Skipping.")
|
||||
return
|
||||
|
||||
fast_image_processing_class_line = (
|
||||
f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None"
|
||||
)
|
||||
if " fast_image_processing_class = " not in updated_content:
|
||||
updated_content = updated_content.replace(
|
||||
image_processing_class_line.group(0),
|
||||
image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line,
|
||||
)
|
||||
|
||||
# write the updated content
|
||||
with open(test_file, "w", encoding="utf-8") as f:
|
||||
f.write(updated_content)
|
||||
|
||||
|
||||
def get_fast_image_processing_content_header(content: str) -> str:
|
||||
"""
|
||||
Get the header of the slow image processor file.
|
||||
"""
|
||||
# get all lines before and including the line containing """Image processor
|
||||
content_header = re.search(r"^(.*?\n)*?\"\"\"Image processor.*", content)
|
||||
content_header = content_header.group(0)
|
||||
content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header)
|
||||
content_header = content_header.replace("Image processor", "Fast Image processor")
|
||||
return content_header
|
||||
|
||||
|
||||
def write_default_fast_image_processor_file(
|
||||
fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str
|
||||
):
|
||||
"""
|
||||
Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file.
|
||||
"""
|
||||
imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n"
|
||||
content_header = get_fast_image_processing_content_header(content_base_file)
|
||||
content_base_file = (
|
||||
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
|
||||
" # To be implemented\n"
|
||||
" resample = None\n"
|
||||
" image_mean = None\n"
|
||||
" image_std = None\n"
|
||||
" size = None\n"
|
||||
" default_to_square = None\n"
|
||||
" crop_size = None\n"
|
||||
" do_resize = None\n"
|
||||
" do_center_crop = None\n"
|
||||
" do_rescale = None\n"
|
||||
" do_normalize = None\n"
|
||||
" do_convert_rgb = None\n\n\n"
|
||||
f'__all__ = ["{fast_image_processor_name}"]\n'
|
||||
)
|
||||
|
||||
content = content_header + imports + content_base_file
|
||||
|
||||
with open(fast_image_processing_module_file, "w", encoding="utf-8") as f:
|
||||
f.write(content)
|
||||
|
||||
|
||||
def add_fast_image_processor_file(
|
||||
fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str
|
||||
):
|
||||
"""
|
||||
Add the fast image processor file to the model's folder.
|
||||
"""
|
||||
# if the file already exists, do nothing
|
||||
if os.path.exists(fast_image_processing_module_file):
|
||||
print(f"{fast_image_processing_module_file} already exists. Skipping.")
|
||||
return
|
||||
|
||||
regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)"
|
||||
match = re.search(regex, content_base_file, re.DOTALL)
|
||||
if not match:
|
||||
print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}")
|
||||
print("Creating a new file with the default content.")
|
||||
return write_default_fast_image_processor_file(
|
||||
fast_image_processing_module_file, fast_image_processor_name, content_base_file
|
||||
)
|
||||
# Exclude the last unindented line
|
||||
slow_class_content = match.group(0).rstrip()
|
||||
# get default args:
|
||||
# find the __init__ block which start with def __init__ and ends with def
|
||||
match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL)
|
||||
if not match:
|
||||
print(
|
||||
f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}"
|
||||
)
|
||||
print("Creating a new file with the default content.")
|
||||
return write_default_fast_image_processor_file(
|
||||
fast_image_processing_module_file, fast_image_processor_name, content_base_file
|
||||
)
|
||||
init = match.group(0)
|
||||
init_signature_block = init.split(")")[0]
|
||||
arg_names = init_signature_block.split(":")
|
||||
arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names]
|
||||
# get the default values
|
||||
default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block)
|
||||
|
||||
# build default args dict
|
||||
default_args_dict = dict(zip(arg_names, default_args))
|
||||
pattern_default_size = r"size = size if size is not None else\s+(.*)"
|
||||
match_default_size = re.findall(pattern_default_size, init)
|
||||
default_args_dict["size"] = match_default_size[0] if match_default_size else None
|
||||
pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)"
|
||||
match_default_crop_size = re.findall(pattern_default_crop_size, init)
|
||||
default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None
|
||||
pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)"
|
||||
match_default_image_mean = re.findall(pattern_default_image_mean, init)
|
||||
default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None
|
||||
pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)"
|
||||
match_default_image_std = re.findall(pattern_default_image_std, init)
|
||||
default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None
|
||||
default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None
|
||||
|
||||
content_header = get_fast_image_processing_content_header(content_base_file)
|
||||
content_base_file = (
|
||||
f"@add_start_docstrings(\n"
|
||||
f' "Constructs a fast {fast_image_processor_name.replace("ImageProcessorFast", "")} image processor.",\n'
|
||||
f" BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,\n)\n"
|
||||
f"class {fast_image_processor_name}(BaseImageProcessorFast):\n"
|
||||
" # This generated class can be used as a starting point for the fast image processor.\n"
|
||||
" # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n"
|
||||
" # only the default values should be set in the class.\n"
|
||||
" # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n"
|
||||
" # In most cases, only the `_preprocess` method should be overridden.\n\n"
|
||||
" # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n"
|
||||
" # Default values should be checked against the slow image processor\n"
|
||||
" # None values left after checking can be removed\n"
|
||||
f' resample = {default_args_dict.get("resample")}\n'
|
||||
f' image_mean = {default_args_dict.get("image_mean")}\n'
|
||||
f' image_std = {default_args_dict.get("image_std")}\n'
|
||||
f' size = {default_args_dict.get("size")}\n'
|
||||
f' default_to_square = {default_args_dict.get("default_to_square")}\n'
|
||||
f' crop_size = {default_args_dict.get("crop_size")}\n'
|
||||
f' do_resize = {default_args_dict.get("do_resize")}\n'
|
||||
f' do_center_crop = {default_args_dict.get("do_center_crop")}\n'
|
||||
f' do_rescale = {default_args_dict.get("do_rescale")}\n'
|
||||
f' do_normalize = {default_args_dict.get("do_normalize")}\n'
|
||||
f' do_convert_rgb = {default_args_dict.get("do_convert_rgb")}\n\n\n'
|
||||
f'__all__ = ["{fast_image_processor_name}"]\n'
|
||||
)
|
||||
|
||||
imports = (
|
||||
"\n\nfrom ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast\n"
|
||||
)
|
||||
image_utils_imports = []
|
||||
if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"):
|
||||
image_utils_imports.append("PILImageResampling")
|
||||
if default_args_dict.get("image_mean") is not None and not any(
|
||||
char.isdigit() for char in default_args_dict.get("image_mean")
|
||||
):
|
||||
image_utils_imports.append(default_args_dict.get("image_mean"))
|
||||
if default_args_dict.get("image_std") is not None and not any(
|
||||
char.isdigit() for char in default_args_dict.get("image_std")
|
||||
):
|
||||
image_utils_imports.append(default_args_dict.get("image_std"))
|
||||
|
||||
if image_utils_imports:
|
||||
# sort imports
|
||||
image_utils_imports.sort()
|
||||
imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n"
|
||||
|
||||
imports += "from ...utils import add_start_docstrings\n"
|
||||
|
||||
content = content_header + imports + "\n\n" + content_base_file
|
||||
|
||||
with open(fast_image_processing_module_file, "w", encoding="utf-8") as f:
|
||||
f.write(content)
|
||||
|
||||
|
||||
def add_fast_image_processor(model_name: str):
|
||||
"""
|
||||
Add the necessary references to the fast image processor in the transformers package,
|
||||
and create the fast image processor file in the model's folder.
|
||||
"""
|
||||
model_module = TRANSFORMERS_PATH / "models" / model_name
|
||||
image_processing_module_file = list(model_module.glob("image_processing*.py"))
|
||||
if not image_processing_module_file:
|
||||
raise ValueError(f"No image processing module found in {model_module}")
|
||||
elif len(image_processing_module_file) > 1:
|
||||
for file_name in image_processing_module_file:
|
||||
if not str(file_name).endswith("_fast.py"):
|
||||
image_processing_module_file = str(file_name)
|
||||
break
|
||||
else:
|
||||
image_processing_module_file = str(image_processing_module_file[0])
|
||||
|
||||
with open(image_processing_module_file, "r", encoding="utf-8") as f:
|
||||
content_base_file = f.read()
|
||||
|
||||
# regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match
|
||||
image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file)
|
||||
if not image_processor_name:
|
||||
raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}")
|
||||
elif len(image_processor_name) > 1:
|
||||
raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}")
|
||||
|
||||
image_processor_name = image_processor_name[0]
|
||||
fast_image_processor_name = image_processor_name + "Fast"
|
||||
fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py")
|
||||
|
||||
print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}")
|
||||
|
||||
add_fast_image_processor_to_main_init(
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
add_fast_image_processor_to_model_init(
|
||||
fast_image_processing_module_file=fast_image_processing_module_file,
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
add_fast_image_processor_to_auto(
|
||||
image_processor_name=image_processor_name,
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
)
|
||||
|
||||
add_fast_image_processor_to_dummy(fast_image_processor_name=fast_image_processor_name)
|
||||
|
||||
add_fast_image_processor_to_doc(
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
add_fast_image_processor_to_tests(
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
model_name=model_name,
|
||||
)
|
||||
|
||||
add_fast_image_processor_file(
|
||||
fast_image_processing_module_file=fast_image_processing_module_file,
|
||||
fast_image_processor_name=fast_image_processor_name,
|
||||
content_base_file=content_base_file,
|
||||
)
|
||||
|
||||
|
||||
def add_new_model_like_command_factory(args: Namespace):
|
||||
return AddFastImageProcessorCommand(model_name=args.model_name)
|
||||
|
||||
|
||||
class AddFastImageProcessorCommand(BaseTransformersCLICommand):
|
||||
@staticmethod
|
||||
def register_subcommand(parser: ArgumentParser):
|
||||
add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor")
|
||||
add_fast_image_processor_parser.add_argument(
|
||||
"--model-name",
|
||||
type=str,
|
||||
required=True,
|
||||
help="The name of the folder containing the model's implementation.",
|
||||
)
|
||||
add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory)
|
||||
|
||||
def __init__(self, model_name: str, *args):
|
||||
self.model_name = model_name
|
||||
|
||||
def run(self):
|
||||
add_fast_image_processor(model_name=self.model_name)
|
||||
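For reference, a minimal driver sketch for the subcommand registered above, mirroring how the CLI `main()` wires and dispatches it. It assumes the `AddFastImageProcessorCommand` class above is importable; the model name "vit" is purely illustrative.

from argparse import ArgumentParser

# Hypothetical standalone driver: build a parser, register the subcommand, dispatch it.
parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli <command> [<args>]")
commands_parser = parser.add_subparsers(help="transformers-cli command helpers")
AddFastImageProcessorCommand.register_subcommand(commands_parser)

args = parser.parse_args(["add-fast-image-processor", "--model-name", "vit"])
command = args.func(args)  # the factory above returns AddFastImageProcessorCommand(model_name="vit")
command.run()              # creates image_processing_<model>_fast.py and wires the init/auto/doc/dummy/test references
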
@@ -15,7 +15,6 @@

from transformers import HfArgumentParser

from .add_fast_image_processor import AddFastImageProcessorCommand
from .add_new_model_like import AddNewModelLikeCommand
from .chat import ChatCommand
from .convert import ConvertCommand
@@ -41,7 +40,6 @@ def main():
    UserCommands.register_subcommand(commands_parser)
    AddNewModelLikeCommand.register_subcommand(commands_parser)
    LfsCommands.register_subcommand(commands_parser)
    AddFastImageProcessorCommand.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

@@ -249,7 +249,7 @@ def squad_convert_example_to_features(
        else:
            p_mask[-len(span["tokens"]) : -(len(truncated_query) + sequence_added_tokens)] = 0

        pad_token_indices = np.where(np.atleast_1d(span["input_ids"] == tokenizer.pad_token_id))
        pad_token_indices = np.where(span["input_ids"] == tokenizer.pad_token_id)
        special_token_indices = np.asarray(
            tokenizer.get_special_tokens_mask(span["input_ids"], already_has_special_tokens=True)
        ).nonzero()

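A small sketch of what the `np.atleast_1d` guard buys here (assumed rationale): when `span["input_ids"]` is a plain Python list, the `==` comparison collapses to a single bool instead of an elementwise array, and `np.where` on a 0-d input relies on deprecated NumPy behaviour.

import numpy as np

input_ids = [101, 2023, 102]        # plain Python list, so == is not elementwise
pad_token_id = 0

mask = input_ids == pad_token_id    # a single bool (False), not an array of bools
safe = np.atleast_1d(mask)          # promoted to shape (1,), so np.where stays well-defined
print(np.where(safe))               # (array([], dtype=int64),)

input_ids_arr = np.array(input_ids) # the elementwise (ndarray) case is unchanged by the guard
print(np.where(np.atleast_1d(input_ids_arr == pad_token_id)))
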
@@ -785,7 +785,8 @@ class GenerationConfig(PushToHubMixin):
            for arg_name in ("cache_implementation", "cache_config", "return_legacy_cache"):
                if getattr(self, arg_name) is not None:
                    logger.warning_once(
                        no_cache_warning.format(cache_arg=arg_name, cache_arg_value=getattr(self, arg_name))
                        no_cache_warning.format(cache_arg=arg_name, cache_arg_value=getattr(self, arg_name)),
                        UserWarning,
                    )

        # 6. check watermarking arguments
@@ -1578,7 +1579,7 @@ class SynthIDTextWatermarkingConfig(BaseWatermarkingConfig):


@dataclass
class CompileConfig:
class CompileConfig(object):
    """
    Class that holds arguments relative to `torch.compile` behavior, when using automatic compilation in `generate`.
    See [`torch.compile`](https://pytorch.org/docs/stable/generated/torch.compile.html) for more details on the arguments.
@@ -1619,9 +1620,7 @@ class CompileConfig:
    backend: Union[str, Callable] = "inductor"
    mode: str = "reduce-overhead"
    options: Optional[dict] = None
    # Used to flag our `generate` call to compile on e.g. CPU. Often not optimal, but useful for testing purposes.
    _compile_all_devices = None

    def to_dict(self) -> Dict[str, Any]:
        """Serializes this instance to a Python dictionary."""
        return copy.deepcopy({key: value for key, value in self.__dict__.items() if key != "_compile_all_devices"})
        return copy.deepcopy(self.__dict__)

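A hedged usage sketch for the config above; the assertion reflects the `to_dict` variant that filters the private flag, and it assumes `GenerationConfig` accepts a `compile_config` argument as on this branch.

from transformers.generation.configuration_utils import CompileConfig, GenerationConfig

# Configure how `generate` compiles the decoding forward pass.
compile_config = CompileConfig(backend="inductor", mode="reduce-overhead", options=None)
generation_config = GenerationConfig(cache_implementation="static", compile_config=compile_config)

# The private, test-only `_compile_all_devices` flag never leaks into serialized configs.
assert "_compile_all_devices" not in compile_config.to_dict()

# Later, e.g.: model.generate(**inputs, generation_config=generation_config)
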
@@ -245,26 +245,26 @@ class StopStringCriteria(StoppingCriteria):
        vocab = tokenizer.get_vocab()
        token_list, token_indices = tuple(vocab.keys()), tuple(vocab.values())
        self.embedding_vec, self.max_valid_positions, self.max_valid_end_lens = self.clean_and_embed_tokens_with_cache(
            token_list, token_indices, tokenizer
            token_list, token_indices, self.stop_strings, tokenizer
        )

        self.maximum_token_len = max([len(stop_string) for stop_string in self.stop_strings])
        self.num_stop_strings = len(self.stop_strings)
        self.target_lens = torch.tensor([len(stop_string) for stop_string in stop_strings], dtype=torch.int32)

    def clean_and_embed_tokens_with_cache(self, token_list, token_indices, tokenizer):
    def clean_and_embed_tokens_with_cache(self, token_list, token_indices, stop_strings, tokenizer):
        # We don't use the tokenizer in the cache key, because I don't trust it to have well-behaved equality
        if (token_list, token_indices, self.stop_strings) in STOP_STRING_EMBEDDING_CACHE:
        if (token_list, token_indices, stop_strings) in STOP_STRING_EMBEDDING_CACHE:
            embedding_vec, max_valid_positions, max_valid_end_lens = STOP_STRING_EMBEDDING_CACHE[
                (token_list, token_indices, self.stop_strings)
            ]
            STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, self.stop_strings))
            STOP_STRING_EMBEDDING_CACHE.move_to_end((token_list, token_indices, stop_strings))
        else:
            clean_token_list, clean_token_indices = self.clean_tokenizer_vocab(tokenizer)
            embedding_vec, max_valid_positions, max_valid_end_lens = self._stop_string_create_embedding_vec(
                clean_token_list, clean_token_indices, self.stop_strings
                clean_token_list, clean_token_indices, stop_strings
            )
            STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, self.stop_strings)] = (
            STOP_STRING_EMBEDDING_CACHE[(token_list, token_indices, stop_strings)] = (
                embedding_vec,
                max_valid_positions,
                max_valid_end_lens,
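A usage sketch of the criteria whose cache key is reworked above (the checkpoint name is illustrative): two instances built from the same vocabulary and stop strings hit the same STOP_STRING_EMBEDDING_CACHE entry instead of recomputing the embedding vector.

from transformers import AutoTokenizer
from transformers.generation.stopping_criteria import StoppingCriteriaList, StopStringCriteria

tokenizer = AutoTokenizer.from_pretrained("gpt2")

# First build populates the cache keyed on (vocab tokens, vocab indices, stop strings).
criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings=["###", "\n\n"])])

# Rebuilding with the same tokenizer vocab and stop strings reuses the cached embedding vector.
criteria_again = StopStringCriteria(tokenizer=tokenizer, stop_strings=["###", "\n\n"])
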
@@ -357,9 +357,7 @@ class StopStringCriteria(StoppingCriteria):
        )
        max_valid_end_lens = max(valid_end_lens)
        vec_size = len(stop_strings) * (max_valid_positions + max_valid_end_lens) + 1
        # We use +2 instead of +1 so we can have a dummy entry at the end. We will clamp all token values
        # over the max to this, ensuring they do not contribute to stop string matching.
        gather_vec = np.full((max(token_indices) + 2, vec_size), dtype=np.int32, fill_value=-1)
        gather_vec = np.full((len(token_list), vec_size), dtype=np.int32, fill_value=-1)

        for i, stop_string in enumerate(stop_strings):
            positions = token_valid_positions[stop_string]
@@ -397,9 +395,6 @@ class StopStringCriteria(StoppingCriteria):
        # Flip input_ids because we're only matching strings at the end of the generated sequence
        flipped_ids = torch.flip(input_ids, (1,))

        # Clip out-of-vocab values to the dummy value at the end of the embedding vector
        flipped_ids = torch.clamp(flipped_ids, max=self.embedding_vec.size(0) - 1)

        # Size of the vector of positions a single token can match
        max_valid_positions = self.max_valid_positions

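A toy illustration of the clamp-to-dummy-row trick referenced in the comments above (sizes are made up): the gather table gets one extra all -1 row, and any token id past the real vocabulary is clamped onto it so it can never contribute to a stop-string match.

import torch

vocab_rows = 5                                            # e.g. max(token_indices) + 1 real rows
embedding_vec = torch.full((vocab_rows + 1, 3), -1, dtype=torch.int32)   # extra dummy row of -1s
embedding_vec[:vocab_rows] = torch.arange(vocab_rows * 3, dtype=torch.int32).reshape(vocab_rows, 3)

token_ids = torch.tensor([[0, 2, 9]])                     # 9 is out of vocab
token_ids = torch.clamp(token_ids, max=embedding_vec.size(0) - 1)        # 9 -> dummy row index 5
gathered = embedding_vec[token_ids]                       # the dummy row yields only -1s, so no match
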
@@ -381,13 +381,9 @@ class GenerationMixin:
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
        # (we can't check exception 3 while compiling)
        # Exception 4: if `inputs_embeds` are passed, slice them through `cache_position` to keep only the unprocessed tokens and
        # generate the first token for each sequence. Later, use the generated input ids for continuation.
        if past_key_values is not None:
            model_inputs["past_key_values"] = past_key_values
            if inputs_embeds is not None and input_ids.shape[1] == 0:  # Exception 4
                inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :]
            elif (
            if (
                inputs_embeds is not None  # Exception 1
                or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1])  # Exception 3
            ):
@@ -397,9 +393,9 @@ class GenerationMixin:

        # 3. Prepare base model inputs
        input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt.
        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if not self.config.is_encoder_decoder:
            if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
            if inputs_embeds is not None and cache_position[0] == 0:
                model_inputs[input_ids_key] = None
                model_inputs["inputs_embeds"] = inputs_embeds
            else:
@@ -410,28 +406,23 @@ class GenerationMixin:
                model_inputs[input_ids_key] = input_ids.clone(memory_format=torch.contiguous_format)

        # 4. Create missing `position_ids` on the fly
        attention_mask = (
            kwargs.pop("decoder_attention_mask", None) if self.config.is_encoder_decoder else attention_mask
        )
        attention_mask_key = "decoder_attention_mask" if self.config.is_encoder_decoder else "attention_mask"
        position_ids_key = "decoder_position_ids" if self.config.is_encoder_decoder else "position_ids"
        if (
            attention_mask is not None
            and kwargs.get(position_ids_key) is None
            and position_ids_key in set(inspect.signature(self.forward).parameters.keys())
            and kwargs.get("position_ids") is None
            and "position_ids" in set(inspect.signature(self.forward).parameters.keys())
        ):
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            kwargs[position_ids_key] = position_ids  # placed in kwargs for further processing (see below)
            kwargs["position_ids"] = position_ids  # placed in kwargs for further processing (see below)

        # 5. Slice model inputs if it's an input that should have the same length as `input_ids`
        for model_input_name in ["position_ids", "token_type_ids", "decoder_position_ids"]:
        for model_input_name in ["position_ids", "token_type_ids"]:
            model_input = kwargs.get(model_input_name)
            if model_input is not None:
                if past_key_values is not None:
                    current_input_length = (
                        model_inputs["inputs_embeds"].shape[1]
                        if model_inputs.get("inputs_embeds") is not None
                        if model_inputs["inputs_embeds"] is not None
                        else model_inputs[input_ids_key].shape[1]
                    )
                    model_input = model_input[:, -current_input_length:]
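A minimal sketch of the on-the-fly position_ids computation shown above, for a left-padded batch (padding marked with 0 in the attention mask):

import torch

attention_mask = torch.tensor([[0, 0, 1, 1, 1],
                               [1, 1, 1, 1, 1]])
position_ids = attention_mask.long().cumsum(-1) - 1   # running count of real tokens, minus one
position_ids.masked_fill_(attention_mask == 0, 1)     # padded slots get a harmless dummy value
print(position_ids)
# tensor([[1, 1, 0, 1, 2],
#         [0, 1, 2, 3, 4]])
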
@@ -478,7 +469,7 @@ class GenerationMixin:
            past_key_values=past_key_values,
        )
        if attention_mask is not None:
            model_inputs[attention_mask_key] = attention_mask
            model_inputs["attention_mask"] = attention_mask

        # 7. Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
        for key, value in kwargs.items():
@@ -740,6 +731,7 @@ class GenerationMixin:
                key != "cache_position"
                and dict_to_expand[key] is not None
                and isinstance(dict_to_expand[key], torch.Tensor)
                and not isinstance(dict_to_expand[key], Cache)
            ):
                dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
        return dict_to_expand
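The expansion above is a plain repeat_interleave along the batch dimension (with Cache objects now skipped); a minimal sketch with expand_size=2:

import torch

attention_mask = torch.tensor([[1, 1, 0],
                               [1, 1, 1]])
expanded = attention_mask.repeat_interleave(2, dim=0)
# tensor([[1, 1, 0],
#         [1, 1, 0],
#         [1, 1, 1],
#         [1, 1, 1]])
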
@@ -3186,11 +3178,9 @@ class GenerationMixin:
        model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)

        model_forward = self.__call__
        if isinstance(model_kwargs.get("past_key_values"), Cache):
            is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
            if is_compileable and (
                self.device.type == "cuda" or generation_config.compile_config._compile_all_devices
            ):
        if isinstance(model_kwargs.get("past_key_values"), StaticCache):
            if self.device.type == "cuda":
                logger.warning_once("Using `torch.compile`.")
                os.environ["TOKENIZERS_PARALLELISM"] = "0"
                model_forward = self.get_compiled_call(generation_config.compile_config)

@@ -4530,13 +4520,13 @@ def _split(data, full_batch_size: int, num_hidden_layers: int, split_size: int =
    """
    if data is None:
        return [None] * (full_batch_size // split_size)
    if isinstance(data, torch.Tensor):
        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
    # New cache format
    elif isinstance(data, DynamicCache) or (
        isinstance(data, EncoderDecoderCache) and isinstance(data.self_attention_cache, DynamicCache)
    ):
        return data.batch_split(full_batch_size, split_size, num_hidden_layers)
    if isinstance(data, torch.Tensor):
        return [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
    elif isinstance(data, tuple):
        # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
        if isinstance(data[0], tuple):
@@ -4643,13 +4633,13 @@ def stack_model_outputs(model_outputs: List[ModelOutput], config: PretrainedConf
    """
    if any(data is None for data in data):
        return None
    if isinstance(data[0], torch.Tensor):
        return torch.cat(data, dim=0)
    # New cache format
    elif isinstance(data[0], DynamicCache):
    if isinstance(data[0], DynamicCache):
        return DynamicCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
    elif isinstance(data[0], EncoderDecoderCache):
        return EncoderDecoderCache.from_batch_splits(data, num_hidden_layers=num_hidden_layers)
    elif isinstance(data[0], torch.Tensor):
        return torch.cat(data, dim=0)
    elif isinstance(data[0], tuple):
        # If the elements of the tuple are also tuples (e.g., past_key_values in our earlier example)
        if isinstance(data[0][0], tuple):

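A round-trip sketch of the tensor branches above: _split chops the batch dimension into split_size chunks, and stack_model_outputs concatenates them back.

import torch

data = torch.arange(8).reshape(4, 2)
full_batch_size, split_size = 4, 2
chunks = [data[i : i + split_size] for i in range(0, full_batch_size, split_size)]
assert torch.equal(torch.cat(chunks, dim=0), data)
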
Some files were not shown because too many files have changed in this diff.