Compare commits


107 Commits

Author SHA1 Message Date
afde312935 trigger 2025-08-26 17:40:04 +02:00
514ed51107 build 2025-08-26 17:25:23 +02:00
890d24da12 update 2025-08-26 17:23:26 +02:00
78f32c3917 [pipeline] Add Keypoint Matching pipeline (#39970)
* feat: keypoint-matcher pipeline

* docs: added keypoint-matcher pipeline in docs

* fix: added missing statements for repo consistency

* docs: updated SuperGlue, LightGlue and EfficientLoFTR docs

* Apply suggestions from code review

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* test: fixed run_pipeline_test

* update pipeline typing and docs

* update tests

* update docs snippets

* Fix import error

* fix: pipeline init

* pt framework

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
2025-08-26 15:26:57 +01:00
6451294f6f [RoPE] explicit factor > implicit factor in YaRN (#40320)
explicit factor > implicit factor
2025-08-26 14:58:28 +01:00
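A minimal sketch of the precedence named in the commit title above, assuming the usual `rope_scaling` keys (`factor`, `original_max_position_embeddings`); the helper name is illustrative, not the code from #40320:

```python
# Illustrative only: prefer an explicitly configured YaRN factor over one
# inferred from the context-length ratio.
def resolve_yarn_factor(config) -> float:
    scaling = getattr(config, "rope_scaling", None) or {}
    explicit = scaling.get("factor")  # explicitly set in the rope_scaling dict
    original = scaling.get("original_max_position_embeddings")
    if explicit is not None:
        return float(explicit)
    if original:  # implicit fallback: new context length / original context length
        return config.max_position_embeddings / original
    return 1.0
```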
5a8ba87ecf [fast_image_processor] fix image normalization for resize (#40436) 2025-08-26 13:49:51 +00:00
0ce6709e70 deci gguf support (#38669)
* deci gguf support

* make style

* tests for deci

* try except removed

* style

* try except removed
2025-08-26 13:43:17 +00:00
263d06fedc Fix extra template loading (#40455)
* Fix extra template loading

* Reformat

* Trigger tests
2025-08-26 14:01:01 +01:00
58cebc848b flash_paged: s_aux may not exist (#40434)
Some implementations (e.g.,
https://huggingface.co/kernels-community/vllm-flash-attn3) support an
`s_aux` arg for attention sinks, but others
(https://huggingface.co/kernels-community/flash-attn) do not. If `s_aux`
is present in the kwargs, we forward it; otherwise we don't.

The user will still get an error if they use a model like gpt-oss-20b
with an implementation that does not support `s_aux`, but models that
don't use it won't error out. For example, [this is currently
failing](399cd5c04b/examples/pytorch/continuous_batching.py (L16))
because we are sending `s_aux: None` in the dict.
2025-08-26 13:15:59 +02:00
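A sketch of the conditional forwarding the commit above describes (not the exact diff from #40434); the wrapper name is made up for illustration:

```python
# Only forward `s_aux` when the caller actually provided one, so kernels
# without attention-sink support never see the keyword at all.
def call_attention(attn_fn, q, k, v, **kwargs):
    extra = {}
    s_aux = kwargs.get("s_aux")
    if s_aux is not None:
        extra["s_aux"] = s_aux
    return attn_fn(q, k, v, **extra)
```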
34108a2230 Continuous batching refactor (#40426)
* Rework of the CB example

* Further rework of CB example

* Refactor PA cache, slice on tokens, add debug prints -- WIP

* Slice cache -- WIP

* Added a mechanism to check batched outputs in CB script

* Less logging, debug flag for slice, !better reset! -- WIP

* QOL and safety margins

* Refactor and style

* Better saving of cb example

* Fix

* Fixes and QOL

* More information about metrics

* Further logging

* Style

* Licenses

* Removed some comments

* Add a slice input flag

* Fix in example

* Added back some open-telemetry deps

* Removed some aux function

* Added FA2 option to example script

* Fixed math (all of it)

* Added a simple example

* Renamed core to classes

* Made allocation of attention mask optional

* Style
2025-08-26 13:01:42 +02:00
49e168ff08 🚨 Remove Contrastive Search decoding strategy (#40428)
* delete go brrr

* fix tests

* review
2025-08-26 12:31:46 +02:00
b8184b7ce9 Make cache_config not mandatory (#40316)
* Relaxed assumptions on cache_config

* Review compliance

* Style

* Styyyle

* Removed default and added args

* Rebase mishap fix

* Propagate args to TorchExportableModuleForDecoderOnlyLM

* Fix the test I wanted fixed in this PR

* Added some AMD expectation related to cache tests
2025-08-26 12:06:17 +02:00
32fcc24667 rename get_cuda_warm_up_factor to get_accelerator_warm_up_factor (#40363)
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
2025-08-26 09:56:35 +00:00
f690a2a1e0 [video processors] decode only sampled videos -> less RAM and faster processing (#39600)
* draft update two models for now

* batch update all VLMs first

* update some more image processors

* update

* fix a few tests

* just make CI green for now

* fix copies

* update once more

* update

* unskip the test

* fix these two

* fix torchcodec audio loading

* maybe

* yay, i fixed torchcodec installation and now can actually test it

* fix copies deepseek

* make sure the metadata is returned when users request it

* add docs

* update

* fixup

* Update src/transformers/audio_utils.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* Update src/transformers/models/glm4v/video_processing_glm4v.py

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>

* update

* what if we set some metadata attr to `None`

* fix CI

* fix one test

* fix 4 channel test

* fix glm timestamps

* rebase gone wrong

* raise warning once

* fixup

* typo

* fix copies

* fix smolvlm test

* this is why torch's official benchmark was faster, set threads to `0`

* Apply style fixes

---------

Co-authored-by: Pavel Iakubovskii <qubvel@gmail.com>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
2025-08-26 11:38:02 +02:00
64ae6e6b1d fix qwen25-vl grad acc (#40333)
* fix qwen25-vl grad acc

* fix Qwen2_5_VLForConditionalGeneration for accepts_loss_kwargs

* fix ci

* fix ci

* fix typo

* fix CI
2025-08-26 09:30:06 +00:00
6d2bb1e04d [Trainer] accelerate contextparallel support in trainer (#40205)
* initial context_parallel_size support in trainer

* For context parallelism, use AVG instead of SUM to avoid over-accounting tokens

* use parallelism_config.cp_enabled

* add parallelism_config to trainer state

* warn when auto-enabling FSDP

* fix some reviews

* WIP: somewhat matching loss

* Feat: add back nested_gather

* Feat: cleanup

* Fix: raise on non-sdpa attn

* remove context_parallel_size from TrainingArguments

* if we have parallelism_config, we defer to get_state_dict from accelerate

* fix from review

* Feat: add parallelism config support

* Chore: revert some unwanted formatting changes

* Fix: check None

* Check none 2

* Fix: remove duplicate import

* Update src/transformers/trainer.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update src/transformers/training_args.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Fin

* require accelerate 1.10.1 and higher

---------

Co-authored-by: S1ro1 <matej.sirovatka@gmail.com>
Co-authored-by: Matej Sirovatka <54212263+S1ro1@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-08-26 09:28:48 +00:00
63caaea1fb Refactor ViT-like models (#39816)
* refactor vit

* fix

* fixup

* turn off FX tests

* AST

* deit

* dinov2

* dinov2_with_registers

* dpt

* depth anything (nit)

* depth pro (nit)

* ijepa

* ijepa (modular)

* prompt_depth_anything (nit)

* vilt (nit)

* zoedepth (nit)

* videomae

* vit_mae

* vit_msn

* vivit

* yolos

* eomt

* vitpose

* update auto backbone

* disable `fx` and export tests (dinov2, dpt, ijepa, vit, vitpose)

* fix kwargs for backbone

* fix

* convnext

* fixup

* update convnext layernorm

* fix-copies layer_norm

* convnextv2

* explicit output_hidden_states for models with backbones

* explicit hidden states collection for dinov2

* tests fixed

* fix DPT as well

* fix dinov2 with registers

* add comment
2025-08-26 11:14:06 +02:00
922e65b3fc Fix non FA2 tests after FA2 installed in CI docker image (#40430)
* up

* up

* up

* up

* up

* up

* up

* up

* up

* up

* up

* up

* up

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-26 10:36:50 +02:00
e68146fbe7 Fix collated reports model name entry (#40441) 2025-08-25 20:36:01 +00:00
8ce633cc75 InternVL MI325 test expectations (#40387)
* Adjust ROCm expectations

* MI355

---------

Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>
2025-08-25 22:00:35 +02:00
7637d298b3 Fix collated reports uploading (#40440) 2025-08-25 21:49:59 +02:00
fa59cf9c9f Fix https://github.com/huggingface/transformers/issues/40292 (#40439)
* Fix https://github.com/huggingface/transformers/issues/40292

* Trigger tests

---------

Co-authored-by: Matt <rocketknight1@gmail.com>
2025-08-25 20:12:57 +01:00
f0e87b436d Fix collated reports model directory traversal (#40437)
Fix model dir traversal
2025-08-25 18:01:58 +00:00
ef406902bf Gemma3 text fixes: Add expectations for MI325 (#40384)
* Add expectations for MI325

* Ruff

* Adjust CUDA expectations as well

* Another attempt for CUDA expectations
2025-08-25 19:57:50 +02:00
c81723d31b 🌐 [i18n-KO] Translated models.md to Korean (#39518)
* docs: ko: models.md

* feat: nmt draft

* fix: manual edits

* Resolved _toctree.yaml conflict during merge from main

* Apply suggestions from code review

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>
Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>

* Apply suggestions from code review

* fix: update toctree

* Update docs/source/ko/_toctree.yml

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>
Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-25 09:17:08 -07:00
6b5eab70e4 Remove working-dir from collated reports job (#40435) 2025-08-25 18:14:35 +02:00
1763ef2951 [docs] remove last references to transformers TF classes/methods (#40429)
* halfway through tasks

* complete

* Update utils/check_docstrings.py
2025-08-25 16:30:59 +01:00
eac4f00bdf Fix typo and improve GPU kernel check error message in MXFP4 quantization (#40349) (#40408)
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-08-25 15:21:55 +00:00
d8f2edcc46 Add tokenizer_kwargs argument to the text generation pipeline (#40364)
* Add `tokenizer_kwargs`  arg to text generation pipeline.

* chore: re-run CI

* Rename `tokenizer_kwargs` to `tokenizer_encode_kwargs` for text generation pipeline

* Fix `tokenizer_encode_kwargs` doc string.

* Fix note related to `tokenizer_kwargs` in text generation pipeline

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-08-25 15:21:19 +00:00
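Based only on the commit messages above, usage would look roughly like the following; the accepted keys depend on the tokenizer and are assumptions here:

```python
from transformers import pipeline

pipe = pipeline("text-generation", model="gpt2")
# Hypothetical usage of the new argument: pass encode-time options to the tokenizer.
out = pipe(
    "Summarize the following text: ...",
    max_new_tokens=32,
    tokenizer_encode_kwargs={"truncation": True, "max_length": 512},
)
print(out[0]["generated_text"])
```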
1a35d07f56 Update collated reports working directory and --path (#40433) 2025-08-25 15:18:26 +00:00
399cd5c04b Fix modular for modernbert-decoder (#40431)
* fix the modular

* CI
2025-08-25 16:50:49 +02:00
ea8d9c8f06 🚨 Remove DoLa decoding strategy (#40082)
* remove dola generation strategy

* add fast test
2025-08-25 16:33:27 +02:00
6bf6f8490c [Mxfp4] Add a way to save with a quantization method (#40176)
* add a test

* tempdir

* fix import issue

* wow I am tired

* properly init

* i am not super familiar with quantizer api :|

* set to TRUE for now

* full support

* push current changes

* will clean this later but the imports are a shitshow here

* this correctly saves the block and scales but forward seems broken

* quantize was not correct

* fix storage

* why were biases even included

* finally!

* style

* fix style

* remove print

* lazy import

* up

* not sure what happens this works now?

* holy molly it was not so far

* okay this seems to work!

* workings!!!

* allow save_pretrained to create PR

* Apply suggestions from code review

* fixup

* add deqyabtze fakse as wek

* working new

* fix

* rm swizzle and unswizzle during saving

* rm print

* Update src/transformers/modeling_utils.py

* fix

* style

---------

Co-authored-by: Marc Sun <marc@huggingface.co>
2025-08-25 16:27:19 +02:00
04c2bae3a8 Fix label smoothing incompatibility with multi-label classification (#40296)
* Fix label smoothing incompatibility with multi-label classification (#40258)

* Improve label smoothing multi-label check based on reviewer feedback

- Move check from LabelSmoother to Trainer.__init__() for better architecture
- Use model.config.problem_type instead of tensor inference for robustness
- Warn and disable smoothing instead of raising error for better UX
- Update test to verify warning behavior
2025-08-25 14:23:31 +00:00
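A minimal sketch of the guard described in the bullets above, assuming it keys off `model.config.problem_type` as stated; the helper name is illustrative, not the merged code:

```python
import warnings

# Disable label smoothing for multi-label classification instead of letting it
# fail later (the PR notes place the real check in Trainer.__init__).
def maybe_disable_label_smoothing(trainer, model):
    problem_type = getattr(getattr(model, "config", None), "problem_type", None)
    if trainer.label_smoother is not None and problem_type == "multi_label_classification":
        warnings.warn("Label smoothing is incompatible with multi-label classification; disabling it.")
        trainer.label_smoother = None
```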
3b5b9f6518 Fix processing tests (#40379)
* fix tests

* skip failing test in generation as well

* grounding dino was overwritten

* one more overwritten code

* clear comment
2025-08-25 14:50:54 +02:00
a0a37b3250 Gpt oss optim (#40304)
* enable fast index selecting

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* update model

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix gpt-oss tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix check tensor

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
2025-08-25 14:36:33 +02:00
d73181b3fc Fix UnboundLocalError in WER metric computation (#40402)
Renamed the `wer` metric variable to `wer_metric` to avoid a naming conflict
with the local variable assignment in the `compute_metrics` function (sketched below).

Co-authored-by: pranam-gf <pranam@goodfin.com>
2025-08-25 12:02:22 +00:00
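The shadowing bug is the classic pattern below; a sketch assuming the `evaluate` library's WER metric:

```python
import evaluate

wer_metric = evaluate.load("wer")  # renamed from `wer` so the local below no longer shadows it

def compute_metrics(pred_str, label_str):
    # With both names equal to `wer`, the assignment below made `wer` local to the
    # function, so the right-hand reference raised UnboundLocalError.
    wer = 100 * wer_metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}
```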
11e12a715a Fix typo: 'seperator' to 'separator' in variable names (#40389)
Fixed 4 instances of the typo "seperator" → "separator" in variable names:
- 2 instances in src/transformers/models/shieldgemma2/convert_shieldgemma2_weights_orbax_to_hf.py
- 2 instances in src/transformers/models/gemma3/convert_gemma3_weights_orbax_to_hf.py

These typos were in variable names used for parsing path components in weight conversion scripts.

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-authored-by: Claude <noreply@anthropic.com>
2025-08-25 11:56:30 +00:00
40299134a8 Fix CI (hunyuan moe does not support fullgraph) (#40423)
fix flag
2025-08-25 12:01:28 +02:00
a2b37bfd58 Fix typo: 'casual' -> 'causal' in code and documentation (#40371) (#40407) 2025-08-25 09:32:15 +00:00
0031c044f8 [docs] flax/jax purge (#40372)
flax/jax purge
2025-08-25 10:25:00 +01:00
14b89fed24 fix to accept cumulative_seqlens from TransformersKwargs in FA (#40194)
* fix the typings that did not match the FA function signature

cumulative_seqlens_q/k -> cu_seq_lens_q/k:
- in the FlashAttentionKwargs in modeling_flash_attention_utils
- in the TransformersKwargs in generic
- in the PagedAttentionArgs in continuous_batching

It is **backward compatible (BC)**, because they are created in `ContinuousBatchProcessor.setup_static_tensors:L762`, used in `ContinuousBatchingManager._model_forward:L1233`, and destroyed with `ContinuousBatchProcessor`

* format changes by ruff

* Update src/transformers/integrations/flash_paged.py

unused function arg in `PagedAttentionCache.update`

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>

* revert continuous_batching signature, which is more meaningful

---------

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
2025-08-25 11:00:13 +02:00
ba095d387d 🧹 🧹 🧹 Get set decoder cleanup (#39509)
* simplify common get/set

* remove some noise

* change some 5 years old modeling utils

* update examples

* fix copies

* revert some changes

* fixes, gah

* format

* move to Mixin

* remove smolvlm specific require grad

* skip

* force defaults

* remodularise some stuff

* remodularise more stuff

* add safety for audio models

* style

* have a correct fallback, you daft donkey

* remove this argh

* change heuristic for audio models

* fixup

* revert

* this works

* this should be explicit

* fix Nth ESM exception

* tryout decoder

* this as well

* revert again

* 🧠

* aaah ESM has two modelings aaah

* broom broom

* format

* wrong copies

* copies

* modular cleanups

* format

* modularities

* wrong mergefix

* seriously

* align with new model

* new model
2025-08-25 10:57:56 +02:00
2c55c7fc94 Reactivate a lot of tests skipped for no reason anymore (#40378)
* reactivate all the tests

* some tests still failing
2025-08-25 10:44:43 +02:00
4f9b4e62bc Run FA2 tests in CI (#40397)
up

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-23 12:30:18 +02:00
28ca27cb2b HF papers in doc (#40381)
* HF papers

* clean

* Update src/transformers/models/gemma3n/configuration_gemma3n.py

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* style

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-22 15:07:08 -07:00
7d88f57fc6 Update README_zh-hans.md (#40380)
Fix a typo.
2025-08-22 18:22:26 +00:00
29ddcacea3 Rework the Cache documentation (#40373)
* start working the doc

* remove gemma2

* review
2025-08-22 17:06:28 +02:00
dab66f15a1 Chat Template Doc Fixes (#40173)
* draft commit

* draft commit

* Fixup chat_extras too

* Update conversations.md

* Update the toctree and titles

* Update the writing guide!

* Use @zucchini-nlp's suggestion

* Update docs/source/en/conversations.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/conversations.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/conversations.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-22 15:48:33 +01:00
0a21e870c7 Bug Fix: Dynamically set return_lse flag in FlexAttention (#40352)
* bug fix - return_lse dynamically set

* addressed compatibility with return type - flex_attention_forward

* rename variables

* revert changes to commits
2025-08-22 13:49:26 +00:00
894b2d84b6 Add GptOssForTokenClassification for GPT-OSS models (#40190)
* Add GptOssForTokenClassification for GPT-OSS models

* After run make fixup
2025-08-22 15:14:46 +02:00
56d68c6706 Adding ByteDance Seed Seed-OSS (#40272)
add seed oss
2025-08-22 14:54:28 +02:00
8a6908c10d fix(example): align parameter names with the latest function definition for gdino (#40369) 2025-08-22 12:27:58 +00:00
7db228a92a [configuration] allow to overwrite kwargs from subconfigs (#40241)
allow to overwrite kwargs from subconfigs
2025-08-22 13:31:25 +02:00
19ffe0219d [processor] move commonalities to mixin (#40339)
* move commonalities to mixin

* revert - unrelated

* fix copies

* fix style

* comments
2025-08-22 13:04:43 +02:00
d8f6d3790a ⚠️⚠️ Use dtype instead of torch_dtype everywhere! (#39782)
* update everywhere

* style

* pipelines

* switch it everywhere in tests

* switch it everywhere in docs

* switch in converters everywhere

* update in examples

* update in model docstrings

* style

* warnings

* style

* Update configuration_utils.py

* fix

* Update configuration_utils.py

* fixes and add first test

* add pipeline tests

* Update test_pipelines_common.py

* add config test

* Update test_modeling_common.py

* add new ones

* post rebase

* add new

* post rebase adds
2025-08-22 12:34:16 +02:00
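The file diffs later in this comparison show exactly what the rename above means in user code; in short:

```python
import torch
from transformers import AutoModelForCausalLM

# Old spelling: torch_dtype=torch.bfloat16 (the commits above add warnings, so it
# presumably still works). New spelling, as used throughout the diffs below:
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto"
)
```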
9c25820978 [pipelines] add support to skip_special_tokens in the main text generation pipelines (#40356)
* add support to skip_special_tokens in pipelines

* add test

* rm redundant
2025-08-22 10:12:46 +00:00
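Judging from the PR title alone, usage would look something like this; where exactly the flag is accepted is an assumption:

```python
from transformers import pipeline

pipe = pipeline("text-generation", model="gpt2")
# Hypothetical call: drop special tokens (eos/pad/...) from the decoded text.
out = pipe("Hello there", max_new_tokens=16, skip_special_tokens=True)
print(out[0]["generated_text"])
```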
5c40e7a225 Change multimodal data links to HF hub (#40309)
change multimodal data links to HF hub
2025-08-22 11:50:04 +02:00
e018b77c89 wav2vec2 fixes (#40341)
* Changed datasets to avoid a datasets error

* Changed back split to test
2025-08-22 11:32:29 +02:00
d7fe3111ff Fix idefics3 vision embeddings indices dtype (#40360)
fix idefics3 vision embeddings

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-22 11:10:45 +02:00
cf487cdf1f HunYuan opensource (#39606)
* merge opensource_hunyuan

* add head_dim

* fix assertion error

* fix seen_tokens

* ready_for_upstream (merge request !17)

Squash merge branch 'ready_for_upstream' into 'main'

* fix configuration type&docstring
* fix style

* ready_for_upstream (merge request !18)

Squash merge branch 'ready_for_upstream' into 'main'
* add doc
* fix testcode
* fix configuration type&docstring

* rename base model

* remove assert

* update

* remove tiktoken

* update

* fix moe and code style (#3)

* update

* fix format

* update

* revert makefile

* fix moe config

* fix numel()

* remove prepare_inputs_for_generation

* fix kv_seq_len

* add docs/toctree

* remove unused parameter & add licence

* add licence

* remove unused parameter

* fix code

* dense modular

update import

fix

fix

use mistralmodel

fix qknorm

add sliding_window

make style

fix

dense done

hunyuan moe

fix import

fix modular

fixup

fixup

* update model path

* fix mlp_bias

* fix modular

* Fix modeling (#5)

* fix attention

* use llamamodel

* fix code

* Fix qk (#6)

* fix qk_norm

* fix

* fix modular

* Fix moe (#7)

* fix some moe code

* fix einsum

* try top1

* use top1

* Fix rotary (#8)

* fix rotary

* fix modeling

* fix modular

* fix testcode

* remove A13B unit test

* Fix moe v1 (#9)

fix moe & gate

* Fix gate norm (#10)

* add norm_topk_prob

* Fix testcase (#11)

* fix&skip test

* Fix testcase (#12)


* skip testcase

* Fix norm topk (#13)

* hardcode norm_topk_prob

* fix testcase

---------

Co-authored-by: pridejcyang <pridejcyang@tencent.com>
Co-authored-by: Mingji Han <mingjihan@tencent.com>
2025-08-22 07:59:58 +00:00
8365f70e92 DOCS: Clarification on the use of label_names as an argument to TrainingArguments (#40353)
* Update trainer.md

* Update trainer.md

Removed the detail about label_names argument usage from the tip/warning section

* Update training_args.py

Added the label_names usage clarification in the docstring

* Update trainer.md

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-08-21 17:19:04 -07:00
7c1169e21f [4/N]more docs to device agnostic (#40355)
* more docs to device agnostic

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* more

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* 1

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* 2

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* Update vitpose.md

* Update camembert.md

* Update camembert.md

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
2025-08-21 10:22:26 -07:00
9568b506ed [generate] handle support for cache classes when num enc layers != num dec layers (#40277)
* handle support for cache classes when num enc layers != num dec layers

* handle overwrites

* one more corner case

* Update src/transformers/generation/utils.py

* Update src/transformers/generation/utils.py

* Apply suggestions from code review

* handle corner case :o
2025-08-21 17:35:18 +01:00
7f38068ae0 Qwen2.5-VL test fixes for ROCm (#40308) 2025-08-21 18:13:07 +02:00
cb1df4d26a [FA] Fix some model tests (#40350)
* fix

* cleanup, revert aimv2 fa changes

* fix aria

* i searched a long time but the cross dependency is for the recent models so...

* this was something... evolla

* fix modernbert decoder + make fa test more robust

* nit
2025-08-21 18:08:21 +02:00
f46f29dd7c Remove more PyTorch 2.2 compatible code (#40337)
Signed-off-by: cyy <cyyever@outlook.com>
2025-08-21 15:19:53 +00:00
128f42d370 [detection] use consistent dtype for Conditional and DAB DETR positional embeddings (#40300)
fix: use consistent dtype for sine positional embeddings
2025-08-21 15:49:56 +01:00
2121d09239 [serve] add cors warnings (#40112)
* add cors warnings

* Update src/transformers/commands/serving.py

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>

* Update src/transformers/commands/serving.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* Apply suggestions from code review

* make fixup

---------

Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2025-08-21 14:32:36 +01:00
b40b834ab1 Clean up XCodec and other codecs (#40348)
* Clean up xcodec addition.

* Clean up config.

* Switch to fixtures test.

* Small stuff.

* Polish XCodec and standardize across codecs.

* Update src/transformers/models/xcodec/modeling_xcodec.py

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>

* Format and fix test.

* Update tol.

---------

Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com>
2025-08-21 15:32:00 +02:00
75aa7c7252 [ModernBert] Prevent the attention mask from being None in ModernBertForSequenceClassification (#35991)
* [ModernBert] Prevent the attention mask from being None in ModernBertForSequenceClassification

* fix the modular conversion
2025-08-21 15:16:03 +02:00
04b751f07d Fix attention vizualizer (#40285)
* make visualizer rely on create causal mask

* format

* fixup

* fixup

* read token

* read token, duh

* what is up with that token

* small tests?

* adjust

* try with flush

* normalize for ANSI

* buffer shenanigans
2025-08-21 13:13:35 +00:00
1e1db12304 (small) fix conditional for input_ids and input_embeds in marian (#40045)
* (small) fix conditional for input_ids and input_embeds in marian

* address comment
2025-08-21 15:13:14 +02:00
7f2f53424e Update test_spm_converter_bytefallback_warning (#40284)
fff

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-21 14:09:28 +02:00
11a49dd9e3 T5 test and target device fixes (#40313)
* Fix cache setup related issues

* Fix target-device-related issues

* Ruff

* Address review comments
2025-08-21 14:07:29 +02:00
c4513a9fe6 Fix links in Glm4vMoe configuration classes to point to the correct H… (#40310)
* Fix links in Glm4vMoe configuration classes to point to the correct Hugging Face model repository

* run fixup to update links in Glm4vMoe configuration classes to point to the correct Hugging Face model repository
2025-08-21 11:42:53 +00:00
c7e6f9a485 Fix an infinite loop bug in recursive search of relative imports (#40326)
Fix bug in recursive search of relative imports
2025-08-21 11:39:43 +00:00
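The usual fix for this class of bug is a visited set; a generic sketch, not the transformers implementation:

```python
import re
from pathlib import Path

# Cycle guard: module A importing B importing A previously recursed forever.
def find_relative_imports(module_path: Path, visited=None) -> set:
    visited = set() if visited is None else visited
    module_path = module_path.resolve()
    if module_path in visited:
        return set()
    visited.add(module_path)
    found = set()
    # only handles single-dot imports such as `from .foo import bar`, enough for the sketch
    for match in re.finditer(r"^from\s+\.(\w+)\s+import", module_path.read_text(), re.MULTILINE):
        child = module_path.parent / f"{match.group(1)}.py"
        if child.exists():
            found.add(child)
            found |= find_relative_imports(child, visited)
    return found
```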
e95441bdb5 add type hints (#40319)
* add basic type hints to import module

* run make fixup

* remove optional

* fixes

---------

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2025-08-21 12:19:59 +01:00
5c88d8fbcc Fix: Only call Trainer.align_special_tokens if model has "config" attribute (#40322)
* Only call Trainer.align_special_tokens if model has "config" attribute

* Add efficient test for training a model without model.config

* Reformat
2025-08-21 12:06:42 +01:00
c031f6f994 [docs] remove TF references from /en/model_doc (#40344)
* models up to F

* models up to M

* all models
2025-08-21 11:53:21 +01:00
7b060e5eb7 Add missing arguments to class constructors (#40068)
* Add missing arguments

Signed-off-by: cyy <cyyever@outlook.com>

* Fix typos

Signed-off-by: cyy <cyyever@outlook.com>

* More fixes

Signed-off-by: cyy <cyyever@outlook.com>

---------

Signed-off-by: cyy <cyyever@outlook.com>
2025-08-21 10:22:38 +00:00
6ad7f29461 Fix deprecation warning version (#40343)
fix
2025-08-21 12:18:23 +02:00
adf84aec21 Add DeepseekV3ForSequenceClassification for Deepseek V3 models (#40200)
* Add Sequence Classification Support for Deepseek v3 model DeepseekV3ForSequenceClassification

* After run make fixup
2025-08-21 12:01:33 +02:00
1e2e28f3c8 Change Qwen2RMSNorm to RMSNorm from PyTorch (#40066)
* Unify Qwen2RMSNorm definitions and use RMSNorm from PyTorch

Signed-off-by: cyy <cyyever@outlook.com>

* subclass RMSNorm

Signed-off-by: cyy <cyyever@outlook.com>

---------

Signed-off-by: cyy <cyyever@outlook.com>
2025-08-21 11:58:35 +02:00
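A sketch of what "subclass RMSNorm" plausibly looks like, assuming `torch.nn.RMSNorm` (available in recent PyTorch) matches the eps/weight semantics of the hand-written module:

```python
import torch
from torch import nn

# Keep the model-specific class name but delegate the math to PyTorch's RMSNorm.
class Qwen2RMSNorm(nn.RMSNorm):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__(hidden_size, eps=eps, elementwise_affine=True)

norm = Qwen2RMSNorm(8)
print(norm(torch.randn(2, 4, 8)).shape)  # torch.Size([2, 4, 8])
```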
022af24fcc Fix qwen-omni processor text only mode (#40336)
* Fix qwen-omni processor text only mode

* remove try except

---------

Co-authored-by: yuekaiz <yuekaiz@mgmt1-login.cm.cluster>
2025-08-21 11:57:32 +02:00
c99ed492c7 [docs] remove flax references from /en/model_doc (#40311)
* 1st commit

* all models up to D

* all models up to G

* all models up to M

* all remaining models
2025-08-21 10:52:54 +01:00
c2e3cc24e0 Fix chunked attention mask with left-padding (#40324)
* add fix

* add test

* raise proper warning for older versions

* fix

* fix and add 2nd test

* fix for flex and torch 2.5
2025-08-21 10:52:49 +02:00
242bb2cafc One cache class to rule them all (#40276)
* remove all classes

* fix generate

* start replacing everywhere

* finish removing everywhere

* typo

* typo

* fix

* typo

* remove num_layers=1

* CI

* fix all docstrings

* review

* style
2025-08-20 19:36:11 +02:00
1054494dd6 Update notification service amd_daily_ci_workflows definition (#40314) 2025-08-20 17:49:46 +02:00
139cd91713 Fix: Apply get_placeholder_mask in Ovis2 (#40280)
* Refactor special image mask

* Refactor get_placeholder_mask method

* Revert "Refactor special image mask"

This reverts commit 9eb1828ae930329656d6f323a510c5e6033e1f85.

* Fix

* Revert "Refactor get_placeholder_mask method"

This reverts commit 07aad6484bb08d6351d5b605e9db574d28edcd15.
2025-08-20 17:12:10 +02:00
5d906740d2 Update CI with nightly torch workflow file (#40306)
* fix nightly ci

* Apply suggestions from code review

Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: ivarflakstad <69173633+ivarflakstad@users.noreply.github.com>
2025-08-20 16:59:00 +02:00
4977ec2ae8 [GPT OSS] Refactor the tests as it was not properly checking the outputs (#40288)
* it was long overdue!

* use the official kernel

* more permissive

* update the kernel as well

* mmm should it be this?

* up pu

* fixup

* Update test_modeling_gpt_oss.py

* style

* start with 20b
2025-08-20 16:47:41 +02:00
3b7230124b No more natten (#40287)
get rid of natten

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-20 16:10:15 +02:00
2df0c323cb byebye torch 2.1 (#40317)
* Bump minimum torch version to 2.2

* Remove is_torch_greater_or_equal_than_2_2

* update versions table

* Deprecate is_torch_sdpa_available (except for backward compat), remove require_torch_sdpa
2025-08-20 15:03:46 +01:00
c50f140be2 Add back _tp_plan attribute (#39944)
* Update modeling_utils.py

* make sure we update with the module's plan

* use public api

* oups

* update

* fix failing test

* Update src/transformers/integrations/tensor_parallel.py

* Update src/transformers/integrations/tensor_parallel.py

* fix

* make the API more friendly!

* fix tests

* fix styling

---------

Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2025-08-20 15:29:55 +02:00
a97213d131 Qwen2.5-Omni test fixes (#40307)
Updated expectations, and mp tests
2025-08-20 14:48:30 +02:00
ca543f822f Add support for Florence-2 (#38188)
* init

* add modular

* fixup

* update configuration

* add processing file

* update auto files

* update

* update modular

* green setup_and_quality ci

* it works

* fix some tests

* commit florence2

* update test

* make test cases done - 16 left

* style

* fix few test cases

* fix some tests

* fix init test

* update florence2 vision style

* hope is green

* fix init test

* fix init

* update modular

* refactor vision module

* fix: channel attention use dynamic scale

* update modular

* update

* update attention mask

* update

* fix naming

* Update src/transformers/models/florence2/processing_florence2.py

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

* spatial block works

* more beautiful

* more more beautiful

* merge main

* merge main and fixup

* fix typing hint

* update modeling

* fix eager matches sdpa

* fix style

* fix compile test - all green

* remove florence2 language

* remove Florence2LanguageModel things

* fix style

* update florence2 model

* override prepare encoder_decoder for generation

* add weight conversion script

* rewrite channel attention to use sdpa

* eliminate 1 transpose op

* support fa2

* fix quality check

* chore: reformat `test_modeling_florence2.py`

* some refactor for processor

* some refactor for processor

* update naming convention and remove BC

* make it pass the test

* fix: correct Embedding Cosine

* update comments and docstring

* support input_embeds

* support input embeds ideally

* fix style

* fix style

* fix style again :D

* add test processor

* refactor processor and add test for processor

* reformat test processor

* make fixup

* fix schema check

* remove image_token

* ensure image token in tokenizer and fix integration tests

* fix processor test

* add more integration tests for large model and rename test_processor to test_processing

* test_assisted_decoding_sample should pass

* update doc and make model work with image text to text pipeline

* docs: add sdpa bagde

* resolve cyril's comments

* fix import torch error

* add helper get_placeholder_mask

* inherit from llava

* florence2 may not _supports_attention_backend because of bart ...

* move florence2 model card to multimodal

* let base model always return_dict

* fix style

* tiny update doc

* set   _checkpoint_conversion_mapping = {}

* fix code quality

* support flex and compile graph and move external func to internal func

* remove condition because it always true

* remove window funcs

* move post processor config out

* fix ci

* new intro to trigger test

* remove `kernel_size` argument

---------

Co-authored-by: ducviet00-h2 <viet.d.hoang@h2corporation.jp>
Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2025-08-20 14:28:06 +02:00
959239debc Remove unnecessary contiguous calls for modern torch (#40315) 2025-08-20 12:24:14 +00:00
7d2aa5d6e6 🚨 [Flash Attention] Fix sliding window size (#40163)
* swa fix

* add comment, make fix symmetrical

* modify fa inference test to force swa correctness check

* fixup comment
2025-08-20 14:23:14 +02:00
3128db6927 chore: fix typo in find_executable_batch_size to match new 0.9 ratio (#40206) 2025-08-20 12:18:06 +00:00
ca0aaa8c74 [fix] Pass adamw optimizer parameters to StableAdamW (#40184)
* fix: pass adamw optimizer parameters to StableAdamW

* add test for stable_adamw initialization with trainer arguments

* address copilot suggestion

* fix: update weight_decay handling in stable_adamw kwargs

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-08-20 11:52:23 +00:00
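A hedged sketch of what "pass adamw optimizer parameters" means, assuming the StableAdamW implementation accepts the usual Adam-style keywords; the keyword names mirror `TrainingArguments` fields and are assumptions:

```python
# Forward the Trainer's Adam hyperparameters instead of relying on the optimizer's
# defaults. `stable_adamw_cls` stands in for whichever StableAdamW implementation
# the Trainer resolves.
def build_stable_adamw(params, args, stable_adamw_cls):
    return stable_adamw_cls(
        params,
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        eps=args.adam_epsilon,
        weight_decay=args.weight_decay,
    )
```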
a01f38b364 Fix GOT-OCR2 and Cohere2Vision image processor patches calculation (#40312)
fix got-ocr patches calculation

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-20 13:13:58 +02:00
a5f0b505a0 Remove OTel SDK dependencies (#40305) 2025-08-20 12:31:44 +02:00
d0f1a6ec36 Clean up X-Codec. (#40271)
* Clean up xcodec addition.

* Clean up config.

* Switch to fixtures test.

* Small stuff.
2025-08-20 12:16:28 +02:00
da9452a592 [docs] delete more TF/Flax docs (#40289)
* delete some TF docs

* update documentation checks to ignore tf/flax

* a few more removals

* nit

* Update utils/check_repo.py

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>

---------

Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2025-08-20 10:44:14 +01:00
a4e1fee44d [FA] Fix dtype in varlen with position ids (#40295)
fix
2025-08-20 11:15:55 +02:00
126bc03b4e Allow to be able to run torch.compile tests with fullgraph=True (#40164)
* fix

* address comment

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-08-20 10:42:33 +02:00
1271 changed files with 23461 additions and 21945 deletions

View File

@@ -4,11 +4,8 @@ on:
workflow_call:
push:
branches:
- build_nightly_ci_docker_image*
- fix_nightly_ci_docker_build
concurrency:
group: docker-images-builds
cancel-in-progress: false
jobs:
latest-with-torch-nightly-docker:
@@ -36,32 +33,32 @@ jobs:
build-args: |
REF=main
PYTORCH=pre
push: true
push: false
tags: huggingface/transformers-all-latest-torch-nightly-gpu
nightly-torch-deepspeed-docker:
name: "Nightly PyTorch + DeepSpeed"
runs-on:
group: aws-g4dn-2xlarge-cache
steps:
-
name: Set up Docker Buildx
uses: docker/setup-buildx-action@v2
-
name: Check out code
uses: actions/checkout@v4
-
name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
-
name: Build and push
uses: docker/build-push-action@v3
with:
context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
build-args: |
REF=main
push: true
tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu
# nightly-torch-deepspeed-docker:
# name: "Nightly PyTorch + DeepSpeed"
# runs-on:
# group: aws-g4dn-2xlarge-cache
# steps:
# -
# name: Set up Docker Buildx
# uses: docker/setup-buildx-action@v2
# -
# name: Check out code
# uses: actions/checkout@v4
# -
# name: Login to DockerHub
# uses: docker/login-action@v2
# with:
# username: ${{ secrets.DOCKERHUB_USERNAME }}
# password: ${{ secrets.DOCKERHUB_PASSWORD }}
# -
# name: Build and push
# uses: docker/build-push-action@v3
# with:
# context: ./docker/transformers-pytorch-deepspeed-nightly-gpu
# build-args: |
# REF=main
# push: true
# tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu

View File

@@ -21,6 +21,9 @@ on:
report_repo_id:
required: true
type: string
commit_sha:
required: false
type: string
env:
@@ -87,7 +90,7 @@ jobs:
- name: Update clone
working-directory: /transformers
if: ${{ env.process == 'true' }}
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Get target commit
working-directory: /transformers/utils

View File

@@ -35,7 +35,7 @@ jobs:
run: |
pip install huggingface_hub
python3 utils/collated_reports.py \
--path /transformers/reports/ \
--path . \
--machine-type ${{ inputs.machine_type }} \
--commit-hash ${{ env.CI_SHA }} \
--job ${{ inputs.job }} \

View File

@@ -18,6 +18,9 @@ on:
docker:
required: true
type: string
commit_sha:
required: false
type: string
report_name_prefix:
required: false
default: run_models_gpu
@@ -70,7 +73,7 @@ jobs:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@@ -120,106 +123,23 @@ jobs:
- name: Run all tests on GPU
working-directory: /transformers
if: ${{ always() }}
run: |
python3 -m pip uninstall -y natten
python3 -m pip uninstall -y ninja && python3 -m pip install ninja && python3 -m pip install flash-attn --no-build-isolation
# START=0 END=400 python3 run.py
#
# - name: "Upload"
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: summary_short_1
# path: /transformers/summary_short.txt
#
# - name: Run all tests on GPU
# working-directory: /transformers
# if: ${{ always() }}
# run: |
# START=400 END=800 python3 run.py
#
# - name: "Upload"
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: summary_short_2
# path: /transformers/summary_short.txt
#
# - name: Run all tests on GPU
# working-directory: /transformers
# if: ${{ always() }}
# run: |
# START=800 END=1200 python3 run.py
#
# - name: "Upload"
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: summary_short_3
# path: /transformers/summary_short.txt
#
# - name: Run all tests on GPU
# working-directory: /transformers
# if: ${{ always() }}
# run: |
# START=1200 END=1600 python3 run.py
#
# - name: "Upload"
# if: ${{ always() }}
# uses: actions/upload-artifact@v4
# with:
# name: summary_short_4
# path: /transformers/summary_short.txt
run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
- name: Run all tests on GPU
working-directory: /transformers
if: ${{ always() }}
run: |
START=1600 END=2000 python3 run.py
- name: Failure short reports
if: ${{ failure() }}
continue-on-error: true
run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt
- name: "Upload"
- name: Run test
shell: bash
run: |
mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports
echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt
echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports"
- name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: summary_short_5
path: /transformers/summary_short.txt
- name: Run all tests on GPU
working-directory: /transformers
if: ${{ always() }}
run: |
START=2000 END=2400 python3 run.py
- name: "Upload"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: summary_short_6
path: /transformers/summary_short.txt
- name: Run all tests on GPU
working-directory: /transformers
if: ${{ always() }}
run: |
START=2400 END=2800 python3 run.py
- name: "Upload"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: summary_short_7
path: /transformers/summary_short.txt
- name: Run all tests on GPU
working-directory: /transformers
if: ${{ always() }}
run: |
START=2800 END=3000 python3 run.py
- name: "Upload"
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: summary_short_8
path: /transformers/summary_short.txt
name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports

View File

@@ -1,43 +1,55 @@
name: Self-hosted runner (nightly-ci)
name: Nvidia CI with nightly torch
on:
repository_dispatch:
schedule:
- cron: "17 2 * * *"
# triggered when the daily scheduled Nvidia CI is completed.
# This way, we can compare the results more easily.
workflow_run:
workflows: ["Nvidia CI"]
branches: ["main"]
types: [completed]
push:
branches:
- run_nightly_ci*
- fix_nightly_ci_docker_build
# Used for `push` to easily modify the target workflow runs to compare against
env:
prev_workflow_run_id: "17153334249"
other_workflow_run_id: "17226437955"
jobs:
build_nightly_ci_images:
name: Build Nightly CI Docker Images
if: (github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_nightly_ci'))
uses: ./.github/workflows/build-nightly-ci-docker-images.yml
secrets: inherit
# build_nightly_torch_ci_images:
# name: Build CI Docker Images with nightly torch
# uses: ./.github/workflows/build-nightly-ci-docker-images.yml
# secrets: inherit
setup:
name: Setup
runs-on: ubuntu-22.04
steps:
- name: Setup
run: |
mkdir "setup_values"
echo "${{ inputs.prev_workflow_run_id || env.prev_workflow_run_id }}" > "setup_values/prev_workflow_run_id.txt"
echo "${{ inputs.other_workflow_run_id || env.other_workflow_run_id }}" > "setup_values/other_workflow_run_id.txt"
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
name: setup_values
path: setup_values
model-ci:
name: Model CI
needs: [build_nightly_ci_images]
# needs: build_nightly_torch_ci_images
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
slack_report_channel: "#transformers-ci-dummy"
docker: huggingface/transformers-all-latest-torch-nightly-gpu
ci_event: Nightly CI
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
needs: [build_nightly_ci_images]
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-past-future"
runner: ci
# test deepspeed nightly build with the latest release torch
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Nightly CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci_with_torch_nightly
commit_sha: ${{ github.event.workflow_run.head_sha || github.sha }}
secrets: inherit

View File

@ -1,5 +1,4 @@
name: Self-hosted runner (scheduled)
name: Nvidia CI
on:
repository_dispatch:
@@ -7,7 +6,7 @@ on:
- cron: "17 2 * * *"
push:
branches:
- check_fa2
- run_nvidia_ci*
workflow_dispatch:
inputs:
prev_workflow_run_id:
@@ -50,8 +49,70 @@ jobs:
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_models_gpu
slack_report_channel: "#transformers-ci-dummy"
slack_report_channel: "#transformers-ci-daily-models"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
torch-pipeline:
name: Torch pipeline CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_pipelines_torch_gpu
slack_report_channel: "#transformers-ci-daily-pipeline-torch"
docker: huggingface/transformers-pytorch-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
example-ci:
name: Example CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_examples_gpu
slack_report_channel: "#transformers-ci-daily-examples"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
trainer-fsdp-ci:
name: Trainer/FSDP CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_trainer_and_fsdp_gpu
slack_report_channel: "#transformers-ci-daily-training"
docker: huggingface/transformers-all-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
deepspeed-ci:
name: DeepSpeed CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_torch_cuda_extensions_gpu
slack_report_channel: "#transformers-ci-daily-training"
docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
ci_event: Daily CI
working-directory-prefix: /workspace
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit
quantization-ci:
name: Quantization CI
uses: ./.github/workflows/self-scheduled.yml
with:
job: run_quantization_torch_gpu
slack_report_channel: "#transformers-ci-daily-quantization"
docker: huggingface/transformers-quantization-latest-gpu
ci_event: Daily CI
report_repo_id: hf-internal-testing/transformers_daily_ci
commit_sha: ${{ github.sha }}
secrets: inherit

View File

@@ -1,4 +1,4 @@
name: Self-hosted runner (scheduled)
name: Nvidia CI (job definitions)
# Note that each job's dependencies go into a corresponding docker file.
#
@@ -28,6 +28,9 @@ on:
report_repo_id:
required: true
type: string
commit_sha:
required: false
type: string
env:
@@ -46,11 +49,11 @@ env:
jobs:
setup:
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
name: Setup
if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job)
strategy:
matrix:
machine_type: [aws-g5-4xlarge-cache]
machine_type: [aws-g5-4xlarge-cache, aws-g5-12xlarge-cache]
runs-on:
group: '${{ matrix.machine_type }}'
container:
@@ -110,7 +113,7 @@ jobs:
strategy:
fail-fast: false
matrix:
machine_type: [single-gpu]
machine_type: [single-gpu, multi-gpu]
slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
uses: ./.github/workflows/model_jobs.yml
with:
@@ -119,6 +122,7 @@ jobs:
slice_id: ${{ matrix.slice_id }}
runner_map: ${{ needs.setup.outputs.runner_map }}
docker: ${{ inputs.docker }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
secrets: inherit
run_trainer_and_fsdp_gpu:
@@ -137,6 +141,7 @@ jobs:
slice_id: ${{ matrix.slice_id }}
runner_map: ${{ needs.setup.outputs.runner_map }}
docker: ${{ inputs.docker }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
report_name_prefix: run_trainer_and_fsdp_gpu
secrets: inherit
@@ -155,7 +160,7 @@ jobs:
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@@ -223,7 +228,7 @@ jobs:
steps:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@@ -292,7 +297,7 @@ jobs:
steps:
- name: Update clone
working-directory: ${{ inputs.working-directory-prefix }}/transformers
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: ${{ inputs.working-directory-prefix }}/transformers
@@ -400,7 +405,7 @@ jobs:
- name: Update clone
working-directory: /transformers
run: git fetch && git checkout ${{ github.sha }}
run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
working-directory: /transformers
@@ -452,3 +457,88 @@ jobs:
with:
name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports
path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports
run_extract_warnings:
# Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic.
if: ${{ always() && inputs.job == 'run_models_gpu' }}
name: Extract warnings in CI artifacts
runs-on: ubuntu-22.04
needs: [setup, run_models_gpu]
steps:
- name: Checkout transformers
uses: actions/checkout@v4
with:
fetch-depth: 2
ref: ${{ inputs.commit_sha || github.sha }}
- name: Install transformers
run: pip install transformers
- name: Show installed libraries and their versions
run: pip freeze
- name: Create output directory
run: mkdir warnings_in_ci
- uses: actions/download-artifact@v4
with:
path: warnings_in_ci
- name: Show artifacts
run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
working-directory: warnings_in_ci
- name: Extract warnings in CI artifacts
run: |
python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
- name: Upload artifact
if: ${{ always() }}
uses: actions/upload-artifact@v4
with:
name: warnings_in_ci
path: warnings_in_ci/selected_warnings.json
send_results:
name: Slack Report
needs: [
setup,
run_models_gpu,
run_trainer_and_fsdp_gpu,
run_pipelines_torch_gpu,
run_examples_gpu,
run_torch_cuda_extensions_gpu,
run_quantization_torch_gpu,
run_extract_warnings
]
if: ${{ always() }}
uses: ./.github/workflows/slack-report.yml
with:
job: ${{ inputs.job }}
# This would be `skipped` if `setup` is skipped.
setup_status: ${{ needs.setup.result }}
slack_report_channel: ${{ inputs.slack_report_channel }}
# This would be an empty string if `setup` is skipped.
folder_slices: ${{ needs.setup.outputs.folder_slices }}
quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
commit_sha: ${{ inputs.commit_sha || github.sha }}
secrets: inherit
check_new_failures:
if: ${{ always() && inputs.ci_event == 'Daily CI' && needs.send_results.result == 'success' }}
name: Check new failures
needs: send_results
uses: ./.github/workflows/check_failed_tests.yml
with:
docker: ${{ inputs.docker }}
start_sha: ${{ inputs.commit_sha || github.sha }}
job: ${{ inputs.job }}
slack_report_channel: ${{ inputs.slack_report_channel }}
ci_event: ${{ inputs.ci_event }}
report_repo_id: ${{ inputs.report_repo_id }}
secrets: inherit

View File

@@ -24,6 +24,10 @@ on:
report_repo_id:
required: true
type: string
commit_sha:
required: false
type: string
env:
TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@@ -41,6 +45,10 @@ jobs:
echo "Setup status: ${{ inputs.setup_status }}"
- uses: actions/checkout@v4
with:
fetch-depth: 2
ref: ${{ inputs.commit_sha || github.sha }}
- uses: actions/download-artifact@v4
- name: Prepare some setup values
@@ -67,7 +75,7 @@ jobs:
SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: ${{ inputs.ci_event }}
CI_SHA: ${{ github.sha }}
CI_SHA: ${{ inputs.commit_sha || github.sha }}
CI_TEST_JOB: ${{ inputs.job }}
SETUP_STATUS: ${{ inputs.setup_status }}
REPORT_REPO_ID: ${{ inputs.report_repo_id }}

View File

@@ -68,8 +68,7 @@ already reported** (use the search bar on GitHub under Issues). Your issue should
Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
* Your **OS type and version** and **Python**, **PyTorch** and
**TensorFlow** versions when applicable.
* Your **OS type and version** and **Python**, and **PyTorch** versions when applicable.
* A short, self-contained, code snippet that allows us to reproduce the bug in
less than 30s.
* The *full* traceback if an exception is raised.
@@ -165,8 +164,7 @@ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main
mode with the `-e` flag.
Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
(PyTorch, TensorFlow and/or Flax) then do:
failure with this command. If that's the case make sure to install Pytorch then do:
```bash
pip install -e ".[quality]"

View File

@@ -147,7 +147,7 @@ chat = [
{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
]
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
response = pipeline(chat, max_new_tokens=512)
print(response[0]["generated_text"][-1]["content"])
```

View File

@@ -106,12 +106,12 @@ def run_benchmark(
logger.info("downloading weights")
# This is to avoid counting download in model load time measurement
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16)
gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
logger.info("loading model")
start = perf_counter()
model = AutoModelForCausalLM.from_pretrained(
model_id, torch_dtype=torch.float16, generation_config=gen_config
model_id, dtype=torch.float16, generation_config=gen_config
).eval()
model.to(device)
torch.cuda.synchronize()
@@ -252,7 +252,7 @@ def run_benchmark(
logger.info("compiling model")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, generation_config=gen_config)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.float16, generation_config=gen_config)
model.to(device)
model = torch.compile(model, mode="max-autotune", fullgraph=True)

View File

@@ -19,7 +19,7 @@ backend:
model: meta-llama/Llama-2-7b-hf
cache_implementation: static
torch_compile: true
torch_dtype: float16
dtype: float16
torch_compile_config:
backend: inductor
mode: reduce-overhead

View File

@@ -23,7 +23,12 @@ from os.path import abspath, dirname, join
import _pytest
import pytest
from transformers.testing_utils import HfDoctestModule, HfDocTestParser, is_torch_available
from transformers.testing_utils import (
HfDoctestModule,
HfDocTestParser,
is_torch_available,
patch_torch_compile_force_graph,
)
NOT_DEVICE_TESTS = {
@@ -136,3 +141,7 @@ if is_torch_available():
# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
# We set it to `False` for CI. See https://github.com/pytorch/pytorch/issues/157274#issuecomment-3090791615
torch.backends.cudnn.allow_tf32 = False
# patch `torch.compile`: if `TORCH_COMPILE_FORCE_FULLGRAPH=1` (or values considered as true, e.g. yes, y, etc.),
# the patched version will always run with `fullgraph=True`.
patch_torch_compile_force_graph()

View File

@ -32,7 +32,10 @@ RUN python3 -m pip uninstall -y flax jax
RUN python3 -m pip install --no-cache-dir -U timm
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git || echo "Don't install detectron2 with nightly torch"
RUN python3 -m pip install --no-cache-dir pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
@ -51,15 +54,14 @@ RUN python3 -m pip install --no-cache-dir bitsandbytes
# Some tests require quanto
RUN python3 -m pip install --no-cache-dir quanto
# After using A10 as CI runner, let's run FA2 tests
RUN [ "$PYTORCH" != "pre" ] && python3 -m pip uninstall -y ninja && python3 -m pip install --no-cache-dir ninja && python3 -m pip install flash-attn --no-cache-dir --no-build-isolation || echo "Don't install FA2 with nightly torch"
# TODO (ydshieh): check this again
# `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
# (`deformable_detr`, `rwkv`, `mra`)
RUN python3 -m pip uninstall -y ninja
# For `dinat` model
# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
# pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'`
RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels
# For `nougat` tokenizer
RUN python3 -m pip install --no-cache-dir python-Levenshtein

View File

@ -304,7 +304,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype=torch.bfloat16, device_map="auto")
```python
messages = [

View File

@ -25,7 +25,7 @@ chat = [
import torch
from transformers import pipeline
pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
pipe = pipeline("text-generation", "meta-llama/Meta-Llama-3-8B-Instruct", dtype=torch.bfloat16, device_map="auto")
response = pipe(chat, max_new_tokens=512)
print(response[0]['generated_text'][-1]['content'])
```
@ -126,7 +126,7 @@ chat = [
]
# 1: Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
# 2: Apply the chat template
@ -164,7 +164,7 @@ print("Decoded output:\n", decoded_output)
### Memory considerations
By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] load the model in "float32" precision. This means it needs 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in "bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx or newer), you can load the model in "bfloat16" precision, using the "torch_dtype" argument as we did above.
By default, Hugging Face classes like [`TextGenerationPipeline`] or [`AutoModelForCausalLM`] load the model in "float32" precision. This means it needs 4 bytes (32 bits) per parameter, so an "8B" model with 8 billion parameters will need ~32GB of memory. However, this can be wasteful! Most modern language models are trained in "bfloat16" precision, which uses only 2 bytes per parameter. If your hardware supports it (Nvidia 30xx/Axxx or newer), you can load the model in "bfloat16" precision, using the "dtype" argument as we did above.
It is also possible to go below 16 bits using "quantization", a method that lossily compresses the model weights. This allows each parameter to be squeezed down to 8 bits, 4 bits, or even less. Note that, especially at 4 bits, the quality of the model's output may be negatively affected, but this is often a trade-off worth making to fit a larger and more capable chat model in memory. Let's see how we can apply this with the `bitsandbytes` library:
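The original `bitsandbytes` snippet is elided from this view; the sketch below shows one way to do it, assuming 4-bit quantization via `BitsAndBytesConfig` (the model name is simply the one used elsewhere in this guide).
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantize the weights to 4-bit on load; compute still runs in bfloat16
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    device_map="auto",
    quantization_config=quantization_config,
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
```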

View File

@ -73,7 +73,7 @@ model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="aut
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@ -114,7 +114,7 @@ bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
> Almost all models are trained in bfloat16 these days, and there is no reason to run a model in full float32 precision if [your GPU supports bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). Float32 precision will not give better inference results than the precision that was used to train the model.
If you are unsure of the format in which the model weights are stored on the Hub, you can always look at the checkpoint configuration under `"torch_dtype"`, for example [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to load the model in the same precision type as written in the configuration, using `from_pretrained(..., torch_dtype=...)`, unless the original type is float32, in which case `float16` or `bfloat16` can be used for inference.
If you are unsure of the format in which the model weights are stored on the Hub, you can always look at the checkpoint configuration under `"dtype"`, for example [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to load the model in the same precision type as written in the configuration, using `from_pretrained(..., dtype=...)`, unless the original type is float32, in which case `float16` or `bfloat16` can be used for inference.
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
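The body of `flush(...)` is not shown in this view; a minimal sketch of such a helper might look like the following (the exact implementation in the guide may differ).
```py
import gc
import torch

def flush():
    # Drop unreferenced Python objects, then release cached CUDA memory and reset the peak counter
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
```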
@ -389,7 +389,7 @@ long_prompt = 10 * system_prompt + prompt
We run our model again in bfloat16 precision.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -90,7 +90,7 @@ out = transcriber(...) # will fall back to using `my_parameter=1`
transcriber = pipeline(model="openai/whisper-large-v2", device=0)
```
If the model is too large for a single GPU and you are using PyTorch, you can set `torch_dtype='float16'` to enable FP16-precision inference. This usually doesn't cause significant performance drops, but make sure to evaluate it on your models!
If the model is too large for a single GPU and you are using PyTorch, you can set `dtype='float16'` to enable FP16-precision inference. This usually doesn't cause significant performance drops, but make sure to evaluate it on your models!
Alternatively, you can set `device_map="auto"` to automatically determine how to load and store the model weights. Using the `device_map` argument requires the 🤗 [Accelerate](https://huggingface.co/docs/accelerate) library:
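The follow-up snippet is elided here; a minimal sketch, assuming 🤗 Accelerate is installed (`pip install accelerate`), looks like this (the audio file name is illustrative):
```py
from transformers import pipeline

# device_map="auto" lets Accelerate place the model weights across the available devices
transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
result = transcriber("audio.mp3")  # illustrative input file
```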
@ -273,7 +273,7 @@ pip install pytesseract
import torch
from transformers import pipeline
pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
pipe = pipeline(model="facebook/opt-1.3b", dtype=torch.bfloat16, device_map="auto")
output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
```

View File

@ -81,13 +81,13 @@
- local: conversations
title: Chat basics
- local: chat_templating
title: Templates
title: Chat templates
- local: chat_templating_multimodal
title: Multimodal templates
- local: chat_templating_writing
title: Template writing
title: Multimodal chat templates
- local: chat_extras
title: Tools and RAG
title: Tool use
- local: chat_templating_writing
title: Writing a chat template
title: Chat with models
- sections:
- local: serving
@ -529,6 +529,12 @@
title: Helium
- local: model_doc/herbert
title: HerBERT
- local: model_doc/hgnet_v2
title: HGNet-V2
- local: model_doc/hunyuan_v1_dense
title: HunYuanDenseV1
- local: model_doc/hunyuan_v1_moe
title: HunYuanMoEV1
- local: model_doc/ibert
title: I-BERT
- local: model_doc/jamba
@ -669,6 +675,8 @@
title: RoFormer
- local: model_doc/rwkv
title: RWKV
- local: model_doc/seed_oss
title: Seed-Oss
- local: model_doc/splinter
title: Splinter
- local: model_doc/squeezebert
@ -693,8 +701,6 @@
title: UL2
- local: model_doc/umt5
title: UMT5
- local: model_doc/xcodec
title: X-CODEC
- local: model_doc/xmod
title: X-MOD
- local: model_doc/xglm
@ -945,6 +951,8 @@
title: WavLM
- local: model_doc/whisper
title: Whisper
- local: model_doc/xcodec
title: X-Codec
- local: model_doc/xls_r
title: XLS-R
- local: model_doc/xlsr_wav2vec2
@ -1005,6 +1013,8 @@
title: Evolla
- local: model_doc/flava
title: FLAVA
- local: model_doc/florence2
title: Florence2
- local: model_doc/gemma3
title: Gemma3
- local: model_doc/gemma3n

View File

@ -100,19 +100,18 @@ pipeline("This is the best meal I've ever had")
Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:
the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework)
- the supported PyTorch model class with `pt_model`
- a default model which should come from a specific revision (branch, or commit hash) where the model works as expected with `default`
- the expected input with `type`
```py
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
PIPELINE_REGISTRY.register_pipeline(
"new-task",
pipeline_class=MyPipeline,
pt_model=AutoModelForSequenceClassification,
tf_model=TFAutoModelForSequenceClassification,
default={"pt": ("user/awesome-model", "branch-name")},
type="text",
)
@ -128,7 +127,7 @@ It's faster to upload your pipeline code to the Hub because it doesn't require a
Add your pipeline code to the Hub in a Python file.
For example, a custom pipeline for sentence pair classification might look like the following code below. The implementation works for PyTorch and TensorFlow models.
For example, a custom pipeline for sentence pair classification might look like the code below.
```py
import numpy as np
@ -168,13 +167,12 @@ Save the code in a file named `pair_classification.py`, and import and register
```py
from pair_classification import PairClassificationPipeline
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
from transformers import AutoModelForSequenceClassification
PIPELINE_REGISTRY.register_pipeline(
"pair-classification",
pipeline_class=PairClassificationPipeline,
pt_model=AutoModelForSequenceClassification,
tf_model=TFAutoModelForSequenceClassification,
)
```
@ -187,9 +185,6 @@ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5f
"pt": [
"AutoModelForSequenceClassification"
],
"tf": [
"TFAutoModelForSequenceClassification"
],
}
},
```
@ -219,11 +214,11 @@ Add your pipeline code as a new module to the [pipelines](https://github.com/hug
Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.
The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models.
The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48). This is important for testing future compatibility with new models.
You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead.
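For illustration only, the idea behind `ANY` can be sketched as a small stand-in class (this is not the actual helper used in the test suite):
```py
class ANY:
    """Compares equal to any value of the given type(s)."""

    def __init__(self, *types):
        self.types = types

    def __eq__(self, other):
        return isinstance(other, self.types)

# Random test models produce arbitrary labels and scores, so only the types are asserted
output = [{"label": "LABEL_0", "score": 0.37}]
assert output == [{"label": ANY(str), "score": ANY(float)}]
```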
Finally, you should also implement the following tests.
1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.
1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.

View File

@ -104,7 +104,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infe
device = f"{infer_device()}:0"
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
past_key_values = DynamicCache()
@ -150,7 +150,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, infe
device = f"{infer_device()}:0"
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map=device)
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(model_id)
messages = [{"role": "user", "content": "You are a helpful assistant."}]
@ -176,7 +176,7 @@ import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
# `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache

View File

@ -14,64 +14,64 @@ rendered properly in your Markdown viewer.
-->
# Tools and RAG
# Tool use
The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument types - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases.
Chat models are commonly trained with support for "function-calling" or "tool-use". Tools are functions supplied by the user, which the model can choose to call as part of its response. For example, models could have access to a calculator tool to perform arithmetic without having to do it internally.
This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG).
This guide will demonstrate how to define tools, how to pass them to a chat model, and how to handle the model's output when it calls a tool.
## Tools
## Passing tools
Tools are functions a large language model (LLM) can call to perform specific tasks. It is a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases.
When a model supports tool-use, pass functions to the `tools` argument of [`~PreTrainedTokenizerBase.apply_chat_template`].
The tools are passed as either a [JSON schema](https://json-schema.org/learn) or Python functions. If you pass Python functions,
the arguments, argument types, and function docstring are parsed in order to generate the JSON schema automatically.
Follow the rules below when creating a tool.
Although passing Python functions is very convenient, the parser can only handle [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings)
docstrings. Refer to the examples below for how to format a tool-ready function.
1. The function should have a descriptive name.
2. The function arguments must have a type hint in the function header (don't include in the `Args` block).
3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring.
4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them.
An example tool to get temperature and wind speed is shown below.
```py
def get_current_temperature(location: str, unit: str) -> float:
def get_current_temperature(location: str, unit: str):
"""
Get the current temperature at a location.
Args:
location: The location to get the temperature for, in the format "City, Country"
unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
Returns:
The current temperature at the specified location in the specified units, as a float.
"""
return 22. # A real function should probably actually get the temperature!
def get_current_wind_speed(location: str) -> float:
def get_current_wind_speed(location: str):
"""
Get the current wind speed in km/h at a given location.
Args:
location: The location to get the temperature for, in the format "City, Country"
Returns:
The current wind speed at the given location in km/h, as a float.
location: The location to get the wind speed for, in the format "City, Country"
"""
return 6. # A real function should probably actually get the wind speed!
tools = [get_current_temperature, get_current_wind_speed]
```
You can optionally add a `Returns:` block to the docstring and a return type to the function header, but most models won't use this information. The parser will also ignore the actual code inside the function!
What really matters is the function name, argument names, argument types, and docstring describing the function's purpose
and the purpose of its arguments. These create the "signature" the model will use to decide whether to call the tool.
## Tool-calling Example
Load a model and tokenizer that support tool-use, like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B). You can also consider a larger model like [Command-R](./model_doc/cohere) or [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
tokenizer = AutoTokenizer.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B")
model = AutoModelForCausalLM.from_pretrained( "NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
checkpoint = "NousResearch/Hermes-2-Pro-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, dtype="auto", device_map="auto")
```
Create a chat message.
Create a chat history.
```py
messages = [
@ -80,12 +80,11 @@ messages = [
]
```
Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation.
Next, pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Tokenize the chat and generate a response.
```py
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
inputs = {k: v for k, v in inputs.items()}
outputs = model.generate(**inputs, max_new_tokens=128)
outputs = model.generate(**inputs.to(model.device), max_new_tokens=128)
print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
```
@ -95,60 +94,52 @@ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
</tool_call><|im_end|>
```
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user`.
A model **cannot actually call the tool itself**. It requests a tool call, and it's your job to handle the call and append it and the result to the chat history.
Hold the call in the `tool_calls` key of an `assistant` message. This is the recommended API, and should be supported by the chat template of most tool-using models.
> [!WARNING]
> The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
> Although `tool_calls` is similar to the OpenAI API, the OpenAI API uses a JSON string as its `tool_calls` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
<hfoptions id="tool-call">
<hfoption id="Llama">
```py
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
```
Allow the assistant to read the function outputs and chat with the user.
Append the tool response to the chat history with the `tool` role.
```py
messages.append({"role": "tool", "content": "22"}) # Note that the returned content is always a string!
```
Finally, allow the model to read the tool response and reply to the user.
```py
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
inputs = {k: v for k, v in inputs.items()}
out = model.generate(**inputs, max_new_tokens=128)
out = model.generate(**inputs.to(model.device), max_new_tokens=128)
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
```
```txt
The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
The temperature in Paris, France right now is 22°C.<|im_end|>
```
</hfoption>
<hfoption id="Mistral/Mixtral">
> [!WARNING]
> Although the key in the assistant message is called `tool_calls`, in most cases, models only emit a single tool call at a time. Some older models emit multiple tool calls at the same time, but this is a
> significantly more complex process, as you need to handle multiple tool responses at once and disambiguate them, often using tool call IDs. Please refer to the model card to see exactly what format a model expects for tool calls.
For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary.
```py
tool_call_id = "9Ae3bDc2F"
tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
```
## JSON schemas
```py
inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
inputs = {k: v for k, v in inputs.items()}
out = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
```
Another way to define tools is by passing a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
</hfoption>
</hfoptions>
You can also manually call the low-level functions that convert Python functions to JSON schemas, and then check or edit the generated schemas. This is usually not necessary, but is useful for understanding the underlying mechanics. It's particularly important
for chat template authors who need to access the JSON schema to render the tool definitions.
## Schema
[`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. An LLM never sees the code inside the function. In other words, an LLM doesn't care how the function works technically; it only cares about the function **definition** and **arguments**.
The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed earlier above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually convert a schema for more visibility or debugging.
The [`~PreTrainedTokenizerBase.apply_chat_template`] method uses the [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) function to convert Python functions to a JSON schema.
```py
from transformers.utils import get_json_schema
@ -191,12 +182,7 @@ print(schema)
}
```
You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions.
> [!WARNING]
> Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions, for example ones with nested arguments.
The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`].
We won't go into the details of JSON schema itself here, since it's already [very well documented](https://json-schema.org/) elsewhere. We will, however, mention that you can pass JSON schema dicts to the `tools` argument of [`~PreTrainedTokenizerBase.apply_chat_template`] instead of Python functions:
```py
# A simple function that takes no arguments
@ -238,62 +224,4 @@ model_input = tokenizer.apply_chat_template(
messages,
tools = [current_time, multiply]
)
```
## RAG
Retrieval-augmented generation (RAG) models enhance a model's existing knowledge by allowing it to search documents for additional information before responding to a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys.
> [!TIP]
> The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates.
Create a list of documents to pass to the model.
```py
documents = [
{
"title": "The Moon: Our Age-Old Foe",
"text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
},
{
"title": "The Sun: Our Age-Old Friend",
"text": "Although often underappreciated, the sun provides several notable benefits..."
}
]
```
Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response.
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
device = model.device # Get the device the model is loaded on
# Define conversation input
conversation = [
{"role": "user", "content": "What has Man always dreamed of?"}
]
input_ids = tokenizer.apply_chat_template(
conversation=conversation,
documents=documents,
chat_template="rag",
tokenize=True,
add_generation_prompt=True,
return_tensors="pt").to(device)
# Generate a response
generated_tokens = model.generate(
input_ids,
max_new_tokens=100,
do_sample=True,
temperature=0.3,
)
# Decode and print the generated text along with generation prompt
generated_text = tokenizer.decode(generated_tokens[0])
print(generated_text)
```
```

View File

@ -14,11 +14,19 @@ rendered properly in your Markdown viewer.
-->
# Templates
# Chat templates
The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format.
The [chat basics](./conversations) guide covers how to store chat histories and generate text from chat models using [`TextGenerationPipeline`].
In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they're trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model.
This guide is intended for more advanced users, and covers the underlying classes and methods, as well as the key concepts for understanding what's actually going on when you chat with a model.
The critical insight needed to understand chat models is this: All causal LMs, whether chat-trained or not, continue a sequence of tokens. When causal LMs are trained, the training usually begins with "pre-training" on a huge corpus of text, which creates a "base" model.
These base models are then often "fine-tuned" for chat, which means training them on data that is formatted as a sequence of messages. The chat is still just a sequence of tokens, though! The list of `role` and `content` dictionaries that you pass
to a chat model gets converted to a token sequence, often with control tokens like `<|user|>` or `<|assistant|>` or `<|end_of_message|>`, which allow the model to see the chat structure.
There are many possible chat formats, and different models may use different formats or control tokens, even if they were fine-tuned from the same base model!
Don't panic, though - you don't need to memorize every possible chat format in order to use chat models. Chat models come with **chat templates**, which indicate how they expect chats to be formatted.
You can access these with the [`apply_chat_template`] method. Let's see two examples. Both of these models are fine-tuned from the same `Mistral-7B` base model:
<hfoptions id="template">
<hfoption id="Mistral">
@ -61,20 +69,24 @@ tokenizer.apply_chat_template(chat, tokenize=False)
</hfoption>
</hfoptions>
This guide explores [`apply_chat_template`] and chat templates in more detail.
Mistral-7B-Instruct uses `[INST]` and `[/INST]` tokens to indicate the start and end of user messages, while Zephyr-7B uses `<|user|>` and `<|assistant|>` tokens to indicate speaker roles. This is why chat templates are important - with the wrong control tokens, these models would have drastically worse performance.
## apply_chat_template
## Using `apply_chat_template`
Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually between you and the system), and the `content` key contains your message. For the system, the `content` is a high-level description of how the model should behave and respond when you're chatting with it.
The input to `apply_chat_template` should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker, and the `content` key contains the message. The common roles are:
Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to indicate the start of a message.
- `user` for messages from the user
- `assistant` for messages from the model
- `system` for directives on how the model should act (usually placed at the beginning of the chat)
[`apply_chat_template`] takes this list and returns a formatted sequence. Set `tokenize=True` if you want to tokenize the sequence.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", dtype=torch.bfloat16)
messages = [
{"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
@ -83,6 +95,7 @@ messages = [
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
print(tokenizer.decode(tokenized_chat[0]))
```
```md
<|system|>
You are a friendly chatbot who always responds in the style of a pirate</s>
@ -91,7 +104,7 @@ How many helicopters can a human eat in one sitting?</s>
<|assistant|>
```
Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
Pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
```py
outputs = model.generate(tokenized_chat, max_new_tokens=128)
@ -106,10 +119,17 @@ How many helicopters can a human eat in one sitting?</s>
Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
```
### add_generation_prompt
The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a user's message.
> [!WARNING]
> Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` if you tokenize later to avoid duplicating these tokens.
> This isn't an issue if you use `apply_chat_template(tokenize=True)`, which means it's usually the safer option!
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don't have any special tokens before the system response. In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
### add_generation_prompt
You may have noticed the [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) argument in the above examples.
This argument adds tokens to the end of the chat that indicate the start of an `assistant` response. Remember: Beneath all the chat abstractions, chat models are still just language models that continue a sequence of tokens!
If you include tokens that tell it that it's now in an `assistant` response, it will correctly write a response, but if you don't include these tokens, the model may get confused and do something strange, like **continuing** the user's message instead of replying to it!
Let's see an example to understand what `add_generation_prompt` is actually doing. First, let's format a chat without `add_generation_prompt`:
```py
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
@ -124,11 +144,32 @@ Nice to meet you!<|im_end|>
Can I ask a question?<|im_end|>
```
Now, let's format the same chat with `add_generation_prompt=True`:
```py
tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
tokenized_chat
```
```md
<|im_start|>user
Hi there!<|im_end|>
<|im_start|>assistant
Nice to meet you!<|im_end|>
<|im_start|>user
Can I ask a question?<|im_end|>
<|im_start|>assistant
```
When `add_generation_prompt=True`, `<|im_start|>assistant` is added at the end to indicate the start of an `assistant` message. This lets the model know an `assistant` response is next.
Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don't have any special tokens before the `assistant` response. In these cases, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
### continue_final_message
The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued instead of starting a new one. It removes end of sequence tokens so that the model continues generation from the final message.
This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy for instruction following when you know how to start its replies.
This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy of instruction following when you know how to start its replies.
```py
chat = [
@ -143,52 +184,12 @@ model.generate(**formatted_chat)
> [!WARNING]
> You shouldn't use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the assistant role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don't support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) to the pipeline.
[`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the `assistant` role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don't support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) argument to the pipeline.
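A minimal sketch of overriding that behavior is shown below; the model choice is illustrative, and `continue_final_message` is forwarded by the pipeline to `apply_chat_template`.
```py
from transformers import pipeline

pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta", dtype="auto", device_map="auto")
chat = [
    {"role": "user", "content": "Write a haiku about the sea."},
    {"role": "assistant", "content": "Salt wind over waves"},  # partial reply to be continued
]
# Explicitly continue the prefilled assistant message instead of starting a new one
out = pipe(chat, max_new_tokens=40, continue_final_message=True)
print(out[0]["generated_text"][-1]["content"])
```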
## Multiple templates
A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG.
When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can't find one, it raises an error.
For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`.
To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. For example, if you're using a RAG template then set `chat_template="rag"`, as in the sketch below.
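A minimal sketch of selecting a named template (the checkpoint is only an example of a model that ships a `rag` template; check the tokenizer config of your own model):
```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")

# When a model ships several templates, `chat_template` is a dict keyed by template name
if isinstance(tokenizer.chat_template, dict):
    print(list(tokenizer.chat_template))  # e.g. ['default', 'tool_use', 'rag']

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What has Man always dreamed of?"}],
    documents=[{"title": "The Moon", "text": "Man has always dreamed of destroying the moon."}],
    chat_template="rag",
    tokenize=False,
    add_generation_prompt=True,
)
```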
It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template.
## Template selection
It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you're training the model further, performance is best if the chat tokens are kept constant.
But if you're training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexible enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn't add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
```py
{%- for message in messages %}
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
{%- endfor %}
```
Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with.
```py
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
```
The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you're using your model with the [`TextGenerationPipeline`].
```py
<|im_start|>system
You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
<|im_start|>user
How are you?<|im_end|>
<|im_start|>assistant
I'm doing great!<|im_end|>
```
## Model training
Training a model with a chat template is a good way to ensure a chat template matches the tokens a model is trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training.
Training a model with a chat template is a good way to ensure the template matches the tokens the model was trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren't helpful during training.
An example of preprocessing a dataset with a chat template is shown below.
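The original preprocessing snippet is elided from this view; the sketch below shows the idea under assumed dataset and column names (`HuggingFaceH4/no_robots` with a `messages` column, not necessarily what the guide itself uses).
```py
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
dataset = load_dataset("HuggingFaceH4/no_robots", split="train")

def format_chat(example):
    # add_generation_prompt=False: no trailing assistant header is needed during training
    example["formatted_chat"] = tokenizer.apply_chat_template(
        example["messages"], tokenize=False, add_generation_prompt=False
    )
    return example

dataset = dataset.map(format_chat)
print(dataset[0]["formatted_chat"])
```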
@ -219,11 +220,3 @@ The sun.</s>
```
After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column.
Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding additional special tokens is often incorrect or duplicated, hurting model performance. When you format text with `apply_chat_template(tokenize=False)`, make sure you set `add_special_tokens=False` as well to avoid duplicating them.
```py
apply_chat_template(messages, tokenize=False, add_special_tokens=False)
```
This isn't an issue if `apply_chat_template(tokenize=True)`.

View File

@ -14,22 +14,21 @@ rendered properly in your Markdown viewer.
-->
# Multimodal templates
# Multimodal chat templates
Multimodal model chat templates expect a similar [template](./chat_templating) as text-only models. It needs `messages` that includes a dictionary of the `role` and `content`.
Multimodal chat models accept inputs like images, audio or video, in addition to text. The `content` key in a multimodal chat history is a list containing multiple items of different types. This is unlike text-only chat models whose `content` key is a single string.
Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text.
This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template
In the same way the [Tokenizer](./fast_tokenizer) class handles chat templates and tokenization for text-only models,
the [Processor](./processors) class handles preprocessing, tokenization and chat templates for multimodal models. Their [`~ProcessorMixin.apply_chat_template`] methods are almost identical.
This guide will show you how to chat with multimodal models with the high-level [`ImageTextToTextPipeline`] and at a lower level using the [`~ProcessorMixin.apply_chat_template`] and [`~GenerationMixin.generate`] methods.
## ImageTextToTextPipeline
[`ImageTextToTextPipeline`] is a high-level image and text generation class with a “chat mode”. Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
Start by building a chat history with the following two roles.
- `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models.
- `user` is where you enter your first message to the model.
Add image and text blocks to the `content` key in the chat history.
```py
messages = [
@ -47,39 +46,35 @@ messages = [
]
```
Create a [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.
> [!TIP]
> The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible.
Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Setting the data type to [auto](./models#model-data-type) also helps save memory and improve speed.
```python
import torch
from transformers import pipeline
pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
pipeline(text=messages, max_new_tokens=50, return_full_text=False)
[{'input_text': [{'role': 'system',
'content': [{'type': 'text',
'text': 'You are a friendly chatbot who always responds in the style of a pirate'}]},
{'role': 'user',
'content': [{'type': 'image',
'url': 'http://images.cocodataset.org/val2017/000000039769.jpg'},
{'type': 'text', 'text': 'What are these?'}]}],
'generated_text': 'The image shows two cats lying on a pink surface, which appears to be a cushion or a soft blanket. The cat on the left has a striped coat, typical of tabby cats, and is lying on its side with its head resting on the'}]
pipe = pipeline("image-text-to-text", model="Qwen/Qwen2.5-VL-3B-Instruct", device_map="auto", dtype="auto")
out = pipe(text=messages, max_new_tokens=128)
print(out[0]['generated_text'][-1]['content'])
```
## Image inputs
For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below.
```
Ahoy, me hearty! These be two feline friends, likely some tabby cats, taking a siesta on a cozy pink blanket. They're resting near remote controls, perhaps after watching some TV or just enjoying some quiet time together. Cats sure know how to find comfort and relaxation, don't they?
```
Aside from the gradual descent from pirate-speak into modern American English (it **is** only a 3B model, after all), this is correct!
## Using `apply_chat_template`
Like [text-only models](./chat_templating), use the [`~ProcessorMixin.apply_chat_template`] method to prepare the chat messages for multimodal models.
This method handles the tokenization and formatting of the chat messages, including images and other media types. The resulting inputs are passed to the model for generation.
- The content `"type"` can be an `"image"` or `"text"`.
- For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model.
```python
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
from transformers import AutoProcessor, AutoModelForImageTextToText
model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", device_map="auto", dtype="auto")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
messages = [
{
@ -96,14 +91,28 @@ messages = [
]
```
Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content and return the `input_ids` and `pixel_values`.
Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. Unlike text models, the output of `apply_chat_template`
contains a `pixel_values` key with the preprocessed image data, in addition to the tokenized text.
```py
processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
print(processed_chat.keys())
print(list(processed_chat.keys()))
```
These inputs are now ready to be used in [`~GenerationMixin.generate`].
```
['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw']
```
Pass these inputs to [`~GenerationMixin.generate`].
```python
out = model.generate(**processed_chat.to(model.device), max_new_tokens=128)
print(processor.decode(out[0]))
```
The decoded output contains the full conversation so far, including the user message and the placeholder tokens that contain the image information. You may need to trim the previous conversation from the output before displaying it to the user.
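One common way to trim it, assuming the `processed_chat` and `out` variables from the snippets above, is to slice off the prompt tokens before decoding.
```py
# Keep only the newly generated tokens, then decode them without special tokens
prompt_length = processed_chat["input_ids"].shape[1]
reply = processor.decode(out[0][prompt_length:], skip_special_tokens=True)
print(reply)
```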
## Video inputs
@ -263,28 +272,3 @@ print(processed_chat.keys())
</hfoption>
</hfoptions>
## Template configuration
You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details.
For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json).
```jinja
{% for message in messages %}
{% if loop.index0 == 0 %}{{ bos_token }}{% endif %}
{{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
{% if message['content'] is string %}
{{ message['content'] }}
{% else %}
{% for content in message['content'] %}
{% if content['type'] == 'image' %}
{{ '<|image|>' }}
{% elif content['type'] == 'text' %}
{{ content['text'] }}
{% endif %}
{% endfor %}
{% endif %}
{{ '<|eot_id|>' }}
{% endfor %}
{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
```

View File

@ -14,15 +14,10 @@ rendered properly in your Markdown viewer.
-->
# Template writing
# Writing a chat template
A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles.
A chat template is a [Jinja](https://jinja.palletsprojects.com/en/stable/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax.
1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.).
2. Print the message followed by an end-of-sequence (`EOS`) token.
3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model generates an assistant response.
An example template is shown below.
```jinja
{%- for message in messages %}
@ -34,55 +29,68 @@ An example template is shown below.
{%- endif %}
```
The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and includes template writing tips.
If you stare at this for a while, you should realize that this is actually very similar to Python, albeit with some strange
`{%-` syntax. The template iterates over a list of messages, and for each message, it prints the role and content of
the message, followed by an end-of-sequence token. If `add_generation_prompt=True`, it adds
the starting header for an assistant message to the end of the conversation.
## Create a template
Create a template by writing a Jinja template and then setting it as the chat template in the tokenizer. For example, the template below adds `[ASST]` and `[/ASST]` tags to the assistant messages.
```jinja
{%- for message in messages %}
{%- if message['role'] == 'user' %}
{{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}
{%- elif message['role'] == 'system' %}
{{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}
{%- elif message['role'] == 'assistant' %}
{{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }}
{%- endif %}
{%- endfor %}
```
Set the template in the tokenizer, and the next time you use [`~PreTrainedTokenizerBase.apply_chat_template`], the new template is used.
```py
template = tokenizer.chat_template
template = template.replace("SYS", "SYSTEM") # Change the system token
tokenizer.chat_template = template # Set the new template
```
The template is saved in the `tokenizer_config.json` file. Upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model.
```py
tokenizer.push_to_hub("model_name")
```
Load the written template as a string and assign it to the tokenizer's `chat_template` attribute. Once set, the template is used whenever you call [`~PreTrainedTokenizerBase.apply_chat_template`]. It is also saved
with the tokenizer whenever [`~PreTrainedTokenizer.save_pretrained`] or [`~PreTrainedTokenizer.push_to_hub`] is called. The template is saved in the `chat_template.jinja` file in the tokenizer directory. You can
edit this file directly to change the template, which is often easier than manipulating a template string.
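For example, a minimal sketch of that workflow (the checkpoint is a placeholder, and `chat_template.jinja` is assumed to contain the template written above):
```py
from pathlib import Path
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your/model")  # placeholder checkpoint
tokenizer.chat_template = Path("chat_template.jinja").read_text()  # use the custom template from now on

tokenizer.save_pretrained("my_model_dir")  # writes chat_template.jinja next to the tokenizer files
```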
## Template writing tips
The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see what template it's using. Try starting with simple models that don't call any tools or support RAG. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for more details about formatting and syntax.
The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see the template it's using. Try starting with simple models that don't call any tools or support RAG because tool-use models can have very complex templates. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/stable/templates/#synopsis) for more details about formatting and syntax.
This section curates some best practices for writing clean and efficient Jinja templates.
There are some tips and pitfalls specific to chat templates, though, and this section will cover some of them in more detail.
### Trimming whitespace
### Writing multimodal chat templates
Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because whitespace usage should be intentional. Add `-` to strip any whitespace before a block.
For multimodal templates, the `chat_template` attribute is set on the **processor**, not the tokenizer. The `content` key of a message is often a list of content dicts,
rather than just a single string. You may wish to check the type of each content item in the list, and handle it accordingly.
Generally, the template should not directly access image or video data. This is normally handled by the processor after template rendering has finished. Instead,
your template should emit a single special token like `<|image|>` or `<|video|>` when it encounters image or video content. The processor will
expand the single special token out into a sequence of image or video tokens later. The exact tokens to emit depend on the model you're working with. We strongly recommend loading an existing multimodal processor to see how it handles data.
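For example, a quick way to study an existing setup (any multimodal checkpoint works; Qwen2.5-VL is used here purely as an illustration):
```py
from transformers import AutoProcessor

# Print an existing multimodal template to see how it handles image and text content
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
print(processor.chat_template)
```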
The example template below handles mixed image and text content.
```jinja
{%- for message in messages %}
{%- if loop.index0 == 0 %}
{{- bos_token }}
{%- endif %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
{%- if message['content'] is string %}
{{- message['content'] }}
{%- else %}
{%- for content in message['content'] %}
{%- if content['type'] == 'image' %}
{{- '<|image|>' }}
{%- elif content['type'] == 'text' %}
{{- content['text'] }}
{%- endif %}
{%- endfor %}
{%- endif %}
{{- '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
```
The incorrect whitespace usage example below may introduce a newline and indentation in the output.
This multimodal template is very similar to the simpler template above, but it checks for `content` lists,
and iterates over them to render `<|image|>` tokens where necessary. This allows images to be inserted "into the flow"
of user text.
Not all models work this way - some may move all images to the end of the user message,
for example. The chat template should always match the format the model was trained with.
### Trimming whitespace
Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because adding extra whitespace that was not present during model training can harm performance. To remove the whitespace, add `-` to the Jinja line syntax. This allows you to write your template with Pythonic indentation and linebreaks, without accidentally printing an indentation in the rendered output.
The example template below doesn't use `-`, resulting in extra whitespace being printed in the output.
```jinja
{% for message in messages %}
@ -90,22 +98,28 @@ The incorrect whitespace usage example below may introduce a newline and indenta
{% endfor %}
```
### Special variables
We strongly recommend using `-` to ensure only the intended content is printed.
There are five special variables available inside a template. You can pass virtually any additional arguments to [`~PreTrainedTokenizerBase.apply_chat_template`] and it will be available inside the template as a variable. However, you should try to keep the number of variables to the five below to make it easier for users to use the chat model without writing custom code to handle model-specific arguments.
```jinja
{%- for message in messages %}
{{- message['role'] + message['content'] }}
{%- endfor %}
```
- `messages` contains the chat history as a list of message dicts.
- `tools` contains a list of tools in JSON schema format.
- `documents` contains a list of documents with the format `{"title": Title, "contents": "Contents"}` (designed for RAG models).
- `add_generation_prompt` is a boolean that determines whether to add an assistant header at the end of the conversation.
- `bos_token` and `eos_token` are special tokens extracted from a tokenizer's `special_tokens_map`.
### Special variables and callables
### Callable functions
There are two callable functions available inside a template.
The only constants in a template are the `messages` variable and the `add_generation_prompt` boolean. However, you have
access to **any other keyword arguments that are passed** to the [`~PreTrainedTokenizerBase.apply_chat_template`] method.
This provides flexibility and enables support for use-cases we may not have thought of while designing the spec. The most common additional variable is `tools`, which contains a list of tools in JSON schema format. Although you can use any variable name you like, we highly recommend sticking to convention and using `tools` for this purpose. This makes templates more compatible with the standard API.
You also have access to any tokens contained in `tokenizer.special_tokens_map`, which often includes special tokens like `bos_token` and `eos_token`. Access these directly by name, like `{{- bos_token }}`.
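As a minimal sketch of the keyword-argument mechanism (the toy template and checkpoint below are purely illustrative):
```py
# Sketch: any extra keyword argument passed to apply_chat_template becomes a template variable
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
tokenizer.chat_template = (
    "{%- for message in messages %}{{- message['role'] + ': ' + message['content'] + '\\n' }}{%- endfor %}"
    "{{- 'Today the user is ' + user_name + '.' }}"
)

chat = [{"role": "user", "content": "Hello!"}]
print(tokenizer.apply_chat_template(chat, tokenize=False, user_name="Alice"))
```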
There are two callable functions available to you. To call them, use `{{- function_name(argument) }}`.
- `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage.
- `strftime_now(format_str)` retrieves the current date and time in a specific format which could be useful to include in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python.
- `strftime_now(format_str)` retrieves the current date and time in a specific format, which is often required in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python.
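For example, a hedged sketch that exercises both callables (the template and checkpoint are illustrative, not any real model's template):
```py
# Sketch: strftime_now and raise_exception are available inside the template environment
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-1.7B-Instruct")
tokenizer.chat_template = (
    "{%- if messages | length == 0 %}{{- raise_exception('There are no messages!') }}{%- endif %}"
    "{{- 'Today is ' + strftime_now('%d %B %Y') + '. ' }}"
    "{%- for message in messages %}{{- message['content'] }}{%- endfor %}"
)
print(tokenizer.apply_chat_template([{"role": "user", "content": "Hi!"}], tokenize=False))
```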
### Compatibility with non-Python Jinja
@ -144,9 +158,11 @@ The following section lists elements of the standard API for writing templates f
### Tool definitions
Transformers chat template methods allow a user to pass tools as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. The `tools` variable in a template always takes a list of JSON schemas.
[Tools](./chat_extras) are passed as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. When a template accesses the `tools` variable, it is always a list of JSON schemas.
The specific tokens and tool descriptions should match the ones your model was trained with. Your model doesn't need to understand the JSON schema input because your template can translate the JSON schema into your model's format. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers, but the Command-R tool template accepts JSON schemas. The template internally converts types and renders the input tools as Python headers.
Even though a template always receives tools as a JSON schema, you may need to radically change this format when rendering them to match the format a model was trained with. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers. The template internally converts JSON schema types and renders the input tools as Python headers.
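To see the exact schema a template receives when a Python function is passed, you can sketch it with `get_json_schema` (the function below is a toy example, not part of any model's tooling):
```py
from transformers.utils import get_json_schema

def get_current_temperature(location: str) -> float:
    """
    Get the current temperature at a location.

    Args:
        location: The location to get the temperature for, in the format "City, Country"
    """
    return 22.0  # dummy value for illustration

print(get_json_schema(get_current_temperature))  # this dict is what the template sees in `tools`
```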
The example below shows how a tool is defined in JSON schema format.
```json
{
@ -172,7 +188,7 @@ The specific tokens and tool descriptions should match the ones your model was t
}
```
An example for handling tool definitions in a chat template is shown below. The specific tokens and tool descriptions should be changed to match the ones a model was trained with.
An example of handling tool definitions in a chat template is shown below. The specific tokens and layouts should be changed to match the ones the model was trained with.
```
{%- if tools %}
@ -188,7 +204,9 @@ An example for handling tool definitions in a chat template is shown below. The
### Tool calls
Tool calls, if present, is a list with the `"assistant"` role. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element.
In addition to rendering the tool definitions, you also need to render **tool calls** and **tool responses** in the template.
Tool calls are generally passed in the `tool_calls` key of an `"assistant"` message. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element.
```json
{
@ -208,7 +226,7 @@ Tool calls, if present, is a list with the `"assistant”` role. This is always
}
```
A common pattern for handling tool calls is shown below.
A common pattern for handling tool calls is shown below. You can use this as a starting point, but make sure your template actually matches the format the model was trained with!
```
{%- if message['role'] == 'assistant' and 'tool_calls' in message %}
@ -221,7 +239,7 @@ A common pattern for handling tool calls is shown below.
### Tool responses
Tool responses are a message dict with the `role`, `name` (name of the function) and `content` (result of the tool call) keys.
Tool responses are message dicts with the `tool` role. They are much simpler than tool calls, and usually only contain the `role`, `name` and `content` keys.
```json
{
@ -231,7 +249,7 @@ Tool responses are a message dict with the `role`, `name` (name of the function)
}
```
Not all the keys need to be used in the tool response. For example, if a model doesn't expect the function name to be included in the tool response, then you can just include the `role` and `content`.
Some templates may not even need the `name` key, in which case, you can write your template to only read the `content` key.
```
{%- if message['role'] == 'tool' %}
@ -241,11 +259,11 @@ Not all the keys need to be used in the tool response. For example, if a model d
## Contribute
Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`].
Once a template is ready, set it to the `chat_template` attribute in the tokenizer and test it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`].
Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template.
Even if you're not the model owner, it is still helpful to add a template for a model with an empty or incorrect chat template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template!
```py
tokenizer.chat_template = template
tokenizer.push_to_hub("model_name")
tokenizer.push_to_hub("amazing_company/cool_model", commit_message="Add chat template", create_pr=True)
```

View File

@ -17,7 +17,6 @@ This page regroups resources around 🤗 Transformers developed by the community
| Notebook | Description | Author | |
|:----------|:-------------|:-------------|------:|
| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
@ -42,7 +41,6 @@ This page regroups resources around 🤗 Transformers developed by the community
|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *google-bert/bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *FacebookAI/roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|

View File

@ -16,18 +16,15 @@ rendered properly in your Markdown viewer.
# Chat basics
Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Mixture-of-expert (MoE) models have names like "8x7B" or "141B-A35B" which means it's a 56B and 141B parameter model. You can try quantizing larger models to reduce memory requirements, otherwise you'll need ~2 bytes of memory per parameter.
Chat models are conversational models you can send a message to and receive a response. Most language models from mid-2023 onwards are chat models and may be referred to as "instruct" or "instruction-tuned" models. Models that do not support chat are often referred to as "base" or "pretrained" models.
Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models.
Larger and newer models are generally more capable, but models specialized in certain domains (medical, legal text, non-English languages, etc.) can often outperform these larger models. Try leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to help you identify the best model for your use case.
> [!TIP]
> Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)!
This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
This guide shows you how to quickly load chat models in Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
## chat CLI
After you've [installed Transformers](./installation), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
After you've [installed Transformers](./installation), you can chat with a model directly from the command line. The command below launches an interactive session with a model, with a few base commands listed at the start of the session.
```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -56,85 +53,54 @@ The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooli
[`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
To start, build a chat history with the following two roles.
- `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models.
- `user` is where you enter your first message to the model.
Chat models accept a list of messages (the chat history) as the input. Each message is a dictionary with `role` and `content` keys.
To start the chat, add a single `user` message. You can also optionally include a `system` message to give the model directions on how to behave.
```py
chat = [
{"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
{"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
{"role": "system", "content": "You are a helpful science assistant."},
{"role": "user", "content": "Hey, can you explain gravity to me?"}
]
```
Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.
Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available.
```py
import torch
from transformers import pipeline
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
pipeline = pipeline(task="text-generation", model="HuggingFaceTB/SmolLM2-1.7B-Instruct", dtype="auto", device_map="auto")
response = pipeline(chat, max_new_tokens=512)
print(response[0]["generated_text"][-1]["content"])
```
```txt
(sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!
If this works successfully, you should see a response from the model! If you want to continue the conversation,
you need to update the chat history with the model's response. You can do this either by appending the text
to `chat` (use the `assistant` role), or by reading `response[0]["generated_text"]`, which contains
the full chat history, including the most recent response.
So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
some wild stuff, like that Warhol guy's soup cans and all that jazz.
And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.
Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)
And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
pizzerias around the city. Just don't try to order a "robot-sized" slice, trust me, it won't end well. (laughs)
So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll
excuse me, I've got some oil changes to attend to. (winks)
```
Use the `append` method on `chat` to respond to the models message.
Once you have the model's response, you can continue the conversation by appending a new `user` message to the chat history.
```py
chat = response[0]["generated_text"]
chat.append(
{"role": "user", "content": "Wait, what's so wild about soup cans?"}
{"role": "user", "content": "Woah! But can it be reconciled with quantum mechanics?"}
)
response = pipeline(chat, max_new_tokens=512)
print(response[0]["generated_text"][-1]["content"])
```
```txt
(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
(sarcastically) Oh, yeah, real original, Andy.
By repeating this process, you can continue the conversation as long as you like, at least until the model runs out of context window
or you run out of memory.
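A minimal sketch of that loop, assuming the `pipeline` object and `chat` list from the snippets above (`input()` is just a stand-in for however you collect user messages):
```py
# Keep chatting until the user types "quit"; `chat` always holds the full history
while True:
    user_message = input("You: ")
    if user_message == "quit":
        break
    chat.append({"role": "user", "content": user_message})
    response = pipeline(chat, max_new_tokens=512)
    chat = response[0]["generated_text"]  # full history, including the new assistant reply
    print("Assistant:", chat[-1]["content"])
```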
But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
status quo, and Warhol was like, the king of that. He took the ordinary and made it extraordinary.
And, let me tell you, it was like, a real game-changer. I mean, who would've thought that a can of soup could be art? (laughs)
## Performance and memory usage
But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks)
But, hey, that's what makes art, art, right? (laughs)
```
## Performance
Transformers load models in full precision by default, and for a 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index).
Transformers loads models in full `float32` precision by default, and for an 8B model, this requires ~32GB of memory! Use the `torch_dtype="auto"` argument, which generally uses `bfloat16` for models that were trained with it, to reduce your memory usage.
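For example, a minimal sketch with the same Llama 3 checkpoint used below:
```py
from transformers import AutoModelForCausalLM

# Load in the checkpoint's saved precision (usually bfloat16) instead of the float32 default
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B-Instruct",
    torch_dtype="auto",  # newer versions of Transformers also accept dtype="auto"
    device_map="auto",
)
```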
> [!TIP]
> Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available.
Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. The example below quantizes a model to 8-bits.
To reduce memory usage even further, you can quantize the model to 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index). Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipeline's `model_kwargs` parameter. The example below quantizes a model to 8-bits.
```py
from transformers import pipeline, BitsAndBytesConfig
@ -143,19 +109,10 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
```
In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token.
In general, model size and performance are directly correlated. Larger models are slower in addition to requiring more memory because each active parameter must be read from memory for every generated token.
This is a bottleneck for LLM text generation and the main options for improving generation speed are to either quantize a model or use hardware with higher memory bandwidth. Adding more compute power doesn't meaningfully help.
The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types.
| Hardware | Memory bandwidth |
|---|---|
| consumer CPU | 20-100GB/sec |
| specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec |
| data center GPU (NVIDIA A100/H100) | 2-3TB/sec |
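As a rough back-of-the-envelope sketch of the relationship above (the numbers are illustrative, not measured benchmarks):
```py
# Upper bound on decoding speed when generation is memory-bandwidth bound
model_size_gb = 16          # e.g. an 8B model in bfloat16
bandwidth_gb_per_s = 2000   # e.g. a data center GPU at ~2TB/sec

print(bandwidth_gb_per_s / model_size_gb, "tokens/sec upper bound")  # 125.0 tokens/sec upper bound
```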
The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth.
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.
You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token at a time. This significantly alleviates the bandwidth bottleneck and improves generation speed.
> [!TIP]
> Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
Mixture-of-Expert (MoE) models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe), and [GPT-OSS](./model_doc/gpt-oss) have lots of parameters, but only "activate" a small fraction of them to generate each token. As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because more parameters become activated with each new speculated token.

View File

@ -260,7 +260,7 @@ with deepspeed.zero.Init():
The DeepSpeed config file needs `is_deepspeed_zero3_enabled: true` set up in [`TrainingArguments`] and it needs a ZeRO configuration enabled. The [`TrainingArguments`] object must be created **before** calling [`~PreTrainedModel.from_pretrained`].
> [!TIP]
> You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `torch_dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].
> You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].
```py
from transformers import AutoModel, Trainer, TrainingArguments

View File

@ -38,7 +38,7 @@ generation_config = GenerationConfig(
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="</s>", padding_side="right")
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
exported_program = convert_and_export_with_cache(model)
```

View File

@ -31,7 +31,7 @@ from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenizer("We are very happy to show you the 🤗 Transformers library", return_tensors="pt")
{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
156808, 128149, 9581, 235265]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}
```
@ -62,7 +62,7 @@ from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
156808, 128149, 9581, 235265]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}
```
@ -112,7 +112,7 @@ tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt")
## Multimodal tokenizers
In addition to text tokens, multimodal tokenizers also hold tokens from other modalities as part of their attributes for easy access.
To add these special tokens to a tokenizer, pass them as a dictionary to the `extra_special_tokens` parameter in [`~AutoTokenizer.from_pretrained`]. The example below adds the `image_token` to a vision-language model.
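A minimal sketch of that call (the checkpoint and token string below are illustrative, and the full example in the docs may differ):
```py
from transformers import AutoTokenizer

# Register an extra special token; it becomes accessible as an attribute on the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    "llava-hf/llava-1.5-7b-hf",
    extra_special_tokens={"image_token": "<image>"},
)
print(tokenizer.image_token, tokenizer.image_token_id)
```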
@ -198,7 +198,7 @@ Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify
```py
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original")
```
### Create a tiktoken tokenizer
@ -226,7 +226,7 @@ tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
<Youtube id="Yffk5aydLzg"/>
A Transformers model expects the input to be a PyTorch, TensorFlow, or NumPy tensor. A tokenizers job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
A Transformers model expects the input to be a PyTorch or NumPy tensor. A tokenizer's job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
```py
from transformers import AutoTokenizer
@ -234,7 +234,7 @@ from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
{'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
156808, 128149, 9581, 235265]]),
'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
}
```
@ -321,12 +321,12 @@ batch_sentences = [
encoded_inputs = tokenizer(batch_sentences, return_tensors="pt")
print(encoded_inputs)
{
'input_ids':
[[2, 1860, 1212, 1105, 2257, 14457, 235336],
[2, 4454, 235303, 235251, 1742, 693, 9242, 1105, 2257, 14457, 235269, 48782, 235265],
[2, 1841, 1105, 29754, 37453, 235336]],
'attention_mask': [[1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
[1, 1, 1, 1, 1, 1]]
}
```

View File

@ -32,12 +32,14 @@ Greedy search works well for tasks with relatively short outputs where creativit
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device)
# explicitly set to default length because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@ -52,12 +54,14 @@ Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@ -75,12 +79,14 @@ Enable beam search with the `num_beams` parameter (should be greater than 1 othe
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=2)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@ -125,7 +131,7 @@ pipe = pipeline(
"text-generation",
model="meta-llama/Llama-3.1-8B",
assistant_model="meta-llama/Llama-3.2-1B",
torch_dtype=torch.bfloat16
dtype=torch.bfloat16
)
pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False)
pipe_output[0]["generated_text"]
@ -160,12 +166,14 @@ Enable prompt lookup decoding with the `prompt_lookup_num_tokens` parameter.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", dtype=torch.float16).to(device)
assistant_model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", dtype=torch.float16).to(device)
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=20, prompt_lookup_num_tokens=5)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@ -217,83 +225,6 @@ outputs = model.generate(**inputs, assistant_model=assistant_model, tokenizer=to
tokenizer.batch_decode(outputs, skip_special_tokens=True)
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
```
### Contrastive search
[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
```
### DoLa
[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers are amplified. DoLa is not recommended for smaller models like GPT-2.
Enable DoLa with the following parameters.
- `dola_layers` are the candidate layers to be contrasted with the final layer. It can be a string (`low` or `high`) to contrast the lower or higher parts of a layer. `high` is recommended for short-answer tasks like TruthfulQA. `low` is recommended for long-answer reasoning tasks like GSM8K, StrategyQA, FACTOR, and VicunaQA.
When a model has tied word embeddings, layer 0 is skipped and it begins from layer 2.
It can also be a list of integers that represent the layer indices between 0 and the total number of layers. Layer 0 is the word embedding, 1 is the first transformer layer, and so on. Refer to the table below for the range of layer indices depending on the number of model layers.
| layers | low | high |
|---|---|---|
| > 40 | (0, 20, 2) | (N - 20, N, 2) |
| <= 40 | range(0, N // 2, 2) | range(N // 2, N, 2) |
- `repetition_penalty` reduces repetition and it is recommended to set it to 1.2.
<hfoptions id="dola">
<hfoption id="contrast higher layers">
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("What is the highest peak in the world??", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50, dola_layers="high", do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
" Mount EverestMount Everest, called Himalaya in Nepali, is the world's highest peak, lying almost 9.5 kilometers above the sea level and the tallest mountain from 19,036.91 ft. The mountain was"
```
</hfoption>
<hfoption id="contrast specific layers">
Contrast layers 18 and 20 with the final layer.
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B", torch_dtype=torch.float16).to("cuda")
inputs = tokenizer("What is the highest peak in the world?", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=50, dola_layers=[18,20], do_sample=False, repetition_penalty=1.2)
tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
" Mount EverestMount Everest, called Himalaya in Nepali, is the world's highest peak above sea level and it rises to an incredible height of 29,028 feet above the ocean. Its summit is over a mile taller than Mt"
```
</hfoption>
</hfoptions>
### Diverse beam search
[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
@ -302,12 +233,14 @@ Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversit
```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype=torch.float16).to(device)
# explicitly set to 100 because Llama2 generation length is 4096
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

View File

@ -38,9 +38,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
torch_dtype = torch.float32 # could be torch.float16 or torch.bfloat16 too
dtype = torch.float32 # could be torch.float16 or torch.bfloat16 too
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename, torch_dtype=torch_dtype)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename, dtype=dtype)
```
Once you're done tinkering with the model, save and convert it back to the GGUF format with the [convert-hf-to-gguf.py](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py) script.

View File

@ -67,7 +67,7 @@ We can see that 0s have been added on the right of the first sentence to make it
[[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
```
This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
This can then be converted into a tensor in PyTorch. The attention mask is a binary tensor indicating the
position of the padded indices so that the model does not attend to them. For the [`BertTokenizer`], `1` indicates a
value that should be attended to, while `0` indicates a padded value. This attention mask is in the dictionary returned
by the tokenizer under the key "attention_mask":
@ -114,7 +114,7 @@ A type of layer in a neural network where the input matrix is multiplied element
### DataParallel (DP)
Parallelism technique for training on multiple GPUs where the same setup is replicated multiple times, with each instance
receiving a distinct data slice. The processing is done in parallel and all setups are synchronized at the end of each training step.
Learn more about how DataParallel works [here](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
@ -295,7 +295,7 @@ These labels are different according to the model head, for example:
`class_labels` and `boxes` key where each value of the batch corresponds to the expected label and number of bounding boxes of each individual image.
- For automatic speech recognition models, ([`Wav2Vec2ForCTC`]), the model expects a tensor of dimension `(batch_size,
target_length)` with each value corresponding to the expected label of each individual token.
<Tip>
Each model's labels may be different, so be sure to always check the documentation of each model for more information
@ -346,8 +346,8 @@ For more details, see [Pipelines for inference](https://huggingface.co/docs/tran
### PipelineParallel (PP)
Parallelism technique in which the model is split up vertically (layer-level) across multiple GPUs, so that only one or
several layers of the model are placed on a single GPU. Each GPU processes different stages of the pipeline in parallel
and works on a small chunk of the batch. Learn more about how PipelineParallel works [here](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism).
### pixel values
@ -379,7 +379,7 @@ The task of preparing raw data into a format that can be easily consumed by mach
A model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods involve a
self-supervised objective, which can be reading the text and trying to predict the next word (see [causal language
modeling](#causal-language-modeling)) or masking some words and trying to predict them (see [masked language
modeling](#masked-language-modeling-mlm)).
Speech and vision models have their own pretraining objectives. For example, Wav2Vec2 is a speech model pretrained on a contrastive task which requires the model to identify the "true" speech representation from a set of "false" speech representations. On the other hand, BEiT is a vision model pretrained on a masked image modeling task which masks some of the image patches and requires the model to predict the masked patches (similar to the masked language modeling objective).
@ -403,9 +403,9 @@ A measurement in hertz of the number of samples (the audio signal) taken per sec
Each element of the input finds out which other elements of the input they should attend to.
### self-supervised learning
A category of machine learning techniques in which a model creates its own learning objective from unlabeled data. It differs from [unsupervised learning](#unsupervised-learning) and [supervised learning](#supervised-learning) in that the learning process is supervised, but not explicitly from the user.
One example of self-supervised learning is [masked language modeling](#masked-language-modeling-mlm), where a model is passed sentences with a proportion of its tokens removed and learns to predict the missing tokens.
@ -436,9 +436,9 @@ A form of model training that directly uses labeled data to correct and instruct
### Tensor Parallelism (TP)
Parallelism technique for training on multiple GPUs in which each tensor is split up into multiple chunks, so instead of
having the whole tensor reside on a single GPU, each shard of the tensor resides on its designated GPU. Shards get
processed separately and in parallel on different GPUs and the results are synced at the end of the processing step.
This is what is sometimes called horizontal parallelism, as the splitting happens on horizontal level.
Learn more about Tensor Parallelism [here](perf_train_gpu_many#tensor-parallelism).
@ -516,7 +516,7 @@ A form of model training in which data provided to the model is not labeled. Uns
### Zero Redundancy Optimizer (ZeRO)
Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensor-parallelism-tp),
except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need
to be modified. This method also supports various offloading techniques to compensate for limited GPU memory.
Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism).

View File

@ -37,7 +37,6 @@ An example `model_init` function is shown below.
def model_init(trial):
return AutoModelForSequenceClassification.from_pretrained(
model_args.model_name_or_path,
from_tf=bool(".ckpt" in model_args.model_name_or_path),
config=config,
cache_dir=model_args.cache_dir,
revision=model_args.model_revision,
@ -103,7 +102,7 @@ def ray_hp_space(trial):
"per_device_train_batch_size": tune.choice([16, 32, 64, 128]),
}
best_trials = trainer.hyperparameter_search(
direction=["minimize", "maximize"],
backend="ray",
hp_space=ray_hp_space,
@ -128,7 +127,7 @@ def sigopt_hp_space(trial):
},
]
best_trials = trainer.hyperparameter_search(
direction=["minimize", "maximize"],
backend="sigopt",
hp_space=sigopt_hp_space,
@ -153,7 +152,7 @@ def wandb_hp_space(trial):
},
}
best_trials = trainer.hyperparameter_search(
direction=["minimize", "maximize"],
backend="wandb",
hp_space=wandb_hp_space,

View File

@ -20,7 +20,7 @@ rendered properly in your Markdown viewer.
# Installation
Transformers works with [PyTorch](https://pytorch.org/get-started/locally/), [TensorFlow 2.0](https://www.tensorflow.org/install/pip), and [Flax](https://flax.readthedocs.io/en/latest/). It has been tested on Python 3.9+, PyTorch 2.1+, TensorFlow 2.6+, and Flax 0.4.1+.
Transformers works with [PyTorch](https://pytorch.org/get-started/locally/). It has been tested on Python 3.9+ and PyTorch 2.2+.
## Virtual environment
@ -74,7 +74,7 @@ uv pip install transformers
</hfoption>
</hfoptions>
For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally) and [TensorFlow](https://www.tensorflow.org/install/pip).
For GPU acceleration, install the appropriate CUDA drivers for [PyTorch](https://pytorch.org/get-started/locally).
Run the command below to check if your system detects an NVIDIA GPU.
@ -84,42 +84,11 @@ nvidia-smi
To install a CPU-only version of Transformers and a machine learning framework, run the following command.
<hfoptions id="cpu-only">
<hfoption id="PyTorch">
```bash
pip install 'transformers[torch]'
uv pip install 'transformers[torch]'
```
</hfoption>
<hfoption id="TensorFlow">
For Apple M1 hardware, you need to install CMake and pkg-config first.
```bash
brew install cmake
brew install pkg-config
```
Install TensorFlow 2.0.
```bash
pip install 'transformers[tf-cpu]'
uv pip install 'transformers[tf-cpu]'
```
</hfoption>
<hfoption id="Flax">
```bash
pip install 'transformers[flax]'
uv pip install 'transformers[flax]'
```
</hfoption>
</hfoptions>
Test whether the install was successful with the following command. It should return a label and score for the provided text.
```bash

View File

@ -66,8 +66,6 @@ values. Here, for instance, it has two keys that are `sequences` and `scores`.
We document here all output types.
### PyTorch
[[autodoc]] generation.GenerateDecoderOnlyOutput
[[autodoc]] generation.GenerateEncoderDecoderOutput
@ -76,42 +74,12 @@ We document here all output types.
[[autodoc]] generation.GenerateBeamEncoderDecoderOutput
### TensorFlow
[[autodoc]] generation.TFGreedySearchEncoderDecoderOutput
[[autodoc]] generation.TFGreedySearchDecoderOnlyOutput
[[autodoc]] generation.TFSampleEncoderDecoderOutput
[[autodoc]] generation.TFSampleDecoderOnlyOutput
[[autodoc]] generation.TFBeamSearchEncoderDecoderOutput
[[autodoc]] generation.TFBeamSearchDecoderOnlyOutput
[[autodoc]] generation.TFBeamSampleEncoderDecoderOutput
[[autodoc]] generation.TFBeamSampleDecoderOnlyOutput
[[autodoc]] generation.TFContrastiveSearchEncoderDecoderOutput
[[autodoc]] generation.TFContrastiveSearchDecoderOnlyOutput
### FLAX
[[autodoc]] generation.FlaxSampleOutput
[[autodoc]] generation.FlaxGreedySearchOutput
[[autodoc]] generation.FlaxBeamSearchOutput
## LogitsProcessor
A [`LogitsProcessor`] can be used to modify the prediction scores of a language model head for
generation.
### PyTorch
[[autodoc]] AlternatingCodebooksLogitsProcessor
- __call__
@ -210,93 +178,6 @@ generation.
- __call__
### TensorFlow
[[autodoc]] TFForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] TFForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] TFForceTokensLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessorList
- __call__
[[autodoc]] TFLogitsWarper
- __call__
[[autodoc]] TFMinLengthLogitsProcessor
- __call__
[[autodoc]] TFNoBadWordsLogitsProcessor
- __call__
[[autodoc]] TFNoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] TFRepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] TFSuppressTokensAtBeginLogitsProcessor
- __call__
[[autodoc]] TFSuppressTokensLogitsProcessor
- __call__
[[autodoc]] TFTemperatureLogitsWarper
- __call__
[[autodoc]] TFTopKLogitsWarper
- __call__
[[autodoc]] TFTopPLogitsWarper
- __call__
### FLAX
[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxForceTokensLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessorList
- __call__
[[autodoc]] FlaxLogitsWarper
- __call__
[[autodoc]] FlaxMinLengthLogitsProcessor
- __call__
[[autodoc]] FlaxSuppressTokensAtBeginLogitsProcessor
- __call__
[[autodoc]] FlaxSuppressTokensLogitsProcessor
- __call__
[[autodoc]] FlaxTemperatureLogitsWarper
- __call__
[[autodoc]] FlaxTopKLogitsWarper
- __call__
[[autodoc]] FlaxTopPLogitsWarper
- __call__
[[autodoc]] FlaxWhisperTimeStampLogitsProcessor
- __call__
## StoppingCriteria

View File

@ -53,31 +53,3 @@ Most of those are only useful if you are studying the code of the models in the
[[autodoc]] pytorch_utils.prune_conv1d_layer
[[autodoc]] pytorch_utils.prune_linear_layer
## TensorFlow custom layers
[[autodoc]] modeling_tf_utils.TFConv1D
[[autodoc]] modeling_tf_utils.TFSequenceSummary
## TensorFlow loss functions
[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss
[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss
[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss
[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss
## TensorFlow Helper Functions
[[autodoc]] modeling_tf_utils.get_initializer
[[autodoc]] modeling_tf_utils.keras_serializable
[[autodoc]] modeling_tf_utils.shape_list

View File

@ -22,20 +22,19 @@ A KV *cache* stores these calculations so they can be reused without recomputing
Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case.
| Cache Type | Memory Efficient  | Supports torch.compile() | Initialization Recommended | Latency | Long Context Generation |
|------------------------|------------------|--------------------------|----------------------------|---------|-------------------------|
| Dynamic Cache | No | No | No | Mid | No |
| Static Cache | No | Yes | Yes | High | No |
| Offloaded Cache | Yes | No | No | Low | Yes |
| Offloaded Static Cache | No | Yes | Yes | High | Yes |
| Quantized Cache | Yes | No | No | Low | Yes |
| Sliding Window Cache | No | Yes | Yes | High | No |
| Cache Type | Supports sliding layers | Supports offloading | Supports torch.compile() | Expected memory usage |
|------------------------|--------------------------|---------------------|--------------------------|-----------------------|
| Dynamic Cache | Yes | Yes | No | Medium |
| Static Cache | Yes | Yes | Yes | High |
| Quantized Cache | No | No    | No | Low |
This guide introduces you to the different [`Cache`] classes and shows you how to use them for generation.
## Default cache
The [`DynamicCache`] is the default cache class for most models. It allows the cache size to grow dynamically in order to store an increasing number of keys and values as generation progresses.
The [`DynamicCache`] is the default cache class for all models. It allows the cache size to grow dynamically in order to store an increasing number of keys and values as generation progresses.
Note that for models using sliding window attention (Mistral, Gemma2,...) or chunked attention (Llama4), the cache will stop growing when the layers using these types of attention have reached their maximum size (the sliding window or chunk size).
Disable the cache by configuring `use_cache=False` in [`~GenerationMixin.generate`].
@ -44,42 +43,68 @@ import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
model.generate(**inputs, do_sample=False, max_new_tokens=20, use_cache=False)
```
Cache classes can also be initialized before calling [`~GenerationMixin.generate`] and passed to the model's [past_key_values](https://hf.co/docs/transformers/internal/generation_utils#transformers.generation.GenerateDecoderOnlyOutput.past_key_values) parameter. This cache initialization strategy is only recommended for some cache types.
Cache classes can also be initialized before calling [`~GenerationMixin.generate`] and passed to the model's [past_key_values](https://hf.co/docs/transformers/internal/generation_utils#transformers.generation.GenerateDecoderOnlyOutput.past_key_values) parameter. This can be useful for more fine-grained control, or more advanced usage such as context caching.
In most other cases, it's easier to define the cache strategy in the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter.
In most cases, it's easier to define the cache strategy in the [cache_implementation](https://hf.co/docs/transformers/main_classes/text_generation#transformers.GenerationConfig.cache_implementation) parameter.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
past_key_values = DynamicCache()
past_key_values = DynamicCache(config=model.config)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, past_key_values=past_key_values)
```
## Memory efficient caches
## Fixed-size cache
The default [`DynamicCache`] prevents you from taking advantage of most just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
A fixed-size cache ([`StaticCache`]) pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it. However, having a fixed (usually large) size for the key/value states means that during generation many positions are masked because they should not take part in the attention. This trick makes it easy to `compile` the decoding stage, but it wastes some attention computation on masked positions. As with all things, it's a trade-off: it works very well if you generate several sequences of roughly the same length, but may be sub-optimal if you have, for example, one very long sequence followed by only short sequences (since the fixed cache size would be large, a lot of it would be wasted on the short sequences). Make sure you understand the impact if you use it!
As for [`DynamicCache`], note that for models using sliding window attention (Mistral, Gemma2,...) or chunked attention (Llama4), the cache will never be larger than the sliding window/chunk size on layers using these types of attention, even if the maximum length specified is larger.
You can enable [`StaticCache`] by configuring `cache_implementation="static"` in [`~GenerationMixin.generate`]. This will also turn on automatic `compilation` of the decoding stage for greedy and sample decoding strategies.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
```
## Cache offloading
The KV cache can occupy a significant portion of memory and become a [bottleneck](https://hf.co/blog/llama31#inference-memory-requirements) for long-context generation. Memory efficient caches focus on trading off speed for reduced memory usage. This is especially important for large language models (LLMs) and if your hardware is memory constrained.
### Offloaded cache
Offloading the cache saves GPU memory by moving the KV cache for all model layers except one to the CPU. Only the current layer cache is maintained on the GPU during a model's `forward` iteration over the layers. It will asynchronously prefetch the next layer's cache, and send the current layer's cache back to the CPU after the attention computation.
The [`OffloadedCache`] saves GPU memory by moving the KV cache for most model layers to the CPU. Only the current layer cache is maintained on the GPU during a model's `forward` iteration over the layers. [`OffloadedCache`] asynchronously prefetches the next layer cache and sends the previous layer cache back to the CPU.
This cache strategy always generates the same result as [`DynamicCache`] and works as a drop-in replacement or fallback. You may want to use [`OffloadedCache`] if you have a GPU and you're getting out-of-memory (OOM) errors.
You may want to consider offloading if you have a small GPU and you're getting out-of-memory (OOM) errors.
> [!WARNING]
> You may notice a small degradation in generation throughput compared to [`DynamicCache`] depending on your model and generation choices (context size, number of generated tokens, number of beams, etc.).
> You may notice a small degradation in generation throughput compared to a full on-device cache, depending on your model and generation choices (context size, number of generated tokens, number of beams, etc.). This is because moving the key/value states back and forth requires some work.
Enable [`OffloadedCache`] by configuring `cache_implementation="offloaded"` in either [`GenerationConfig`] or [`~GenerationMixin.generate`].
Offloading is available for both [`DynamicCache`] and [`StaticCache`]. You can enable it by configuring `cache_implementation="offloaded"` for the dynamic version, or `cache_implementation="offloaded_static"` for the static version, in either [`GenerationConfig`] or [`~GenerationMixin.generate`].
Additionally, you can also instantiate your own [`DynamicCache`] or [`StaticCache`] with the `offloading=True` option, and pass this cache in `generate` or your model's `forward` (for example, `past_key_values=DynamicCache(config=model.config, offloading=True)` for a dynamic cache).
Note that the two [`Cache`] classes mentioned above have an additional option when instantiating them directly, `offload_only_non_sliding`.
This argument decides whether the layers using sliding window/chunk attention (if any) will be offloaded as well. Since
these layers are usually short anyway, it may be better to avoid offloading them, as offloading may incur a speed penalty. By default, this option is `False` for [`DynamicCache`], and `True` for [`StaticCache`].
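For illustration, here is a minimal sketch of the direct-instantiation route described above, reusing the `offloading=True` option on a [`DynamicCache`] (the checkpoint and prompt mirror the other examples on this page):
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)

# With offloading=True a DynamicCache offloads every layer by default;
# pass offload_only_non_sliding=True to keep sliding-window layers on-device
past_key_values = DynamicCache(config=model.config, offloading=True)
out = model.generate(**inputs, do_sample=False, max_new_tokens=23, past_key_values=past_key_values)
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```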
```py
import torch
@ -87,7 +112,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
ckpt = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(ckpt, dtype=torch.float16, device_map="auto")
inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
@ -95,7 +120,7 @@ print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
```
The example below shows how you can fallback on [`OffloadedCache`] if you run out of memory.
The example below shows how you can fallback to an offloaded cache if you run out of memory:
```py
import torch
@ -118,7 +143,7 @@ def resilient_generate(model, *args, **kwargs):
ckpt = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained(ckpt, dtype=torch.float16, device_map="auto")
prompt = ["okay "*1000 + "Fun fact: The most"]
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
@ -126,12 +151,12 @@ out = resilient_generate(model, **inputs, **beams)
responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True)
```
### Quantized cache
## Quantized cache
The [`QuantizedCache`] reduces memory requirements by quantizing the KV values to a lower precision. [`QuantizedCache`] currently supports two quantization backends.
The [`QuantizedCache`] reduces memory requirements by quantizing the KV values to a lower precision. [`QuantizedCache`] currently supports two quantization backends:
- [`HQQQuantizedCache`] supports int2, int4, and int8 datatypes.
- [`QuantoQuantizedCache`] supports int2 and int4 datatypes. This is the default quantization backend.
- `hqq` supports int2, int4, and int8 datatypes.
- `quanto` supports int2 and int4 datatypes. This is the default quantization backend.
> [!WARNING]
> Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.
@ -139,34 +164,30 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`]. The quantization backend, as well as any additional quantization-related parameters, should be passed as a dict. You should use the default values for these additional parameters unless you're running out of memory; in that case, consider decreasing the residual length.
<hfoptions id="quantized-cache">
<hfoption id="HQQQuantizedCache">
For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `1`.
For the `hqq` backend, we recommend setting the `axis-key` and `axis-value` parameters to `1`.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantizedCache
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "HQQ"})
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"backend": "hqq"})
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's loud and energetic. It's a great way to express myself and rel
```
</hfoption>
<hfoption id="Quanto">
For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-value` parameters to `0`.
For the `quanto` backend, we recommend setting the `axis-key` and `axis-value` parameters to `0`.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", dtype=torch.float16, device_map="auto")
inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
@ -174,92 +195,19 @@ print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
I like rock music because it's loud and energetic. It's a great way to express myself and rel
```
</hfoption>
</hfoptions>
## Speed optimized caches
The default [`DynamicCache`] prevents you from taking advantage of just-in-time (JIT) optimizations because the cache size isn't fixed. JIT optimizations minimize latency at the expense of memory usage. All of the following cache types are compatible with JIT optimizations like [torch.compile](./llm_optims#static-kv-cache-and-torchcompile) to accelerate generation.
### Static cache
A [`StaticCache`] pre-allocates a specific maximum cache size for the kv pairs. You can generate up to the maximum cache size without needing to modify it.
Enable [`StaticCache`] by configuring `cache_implementation="static"` in [`~GenerationMixin.generate`].
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
```
### Offloaded static cache
The [`OffloadedStaticCache`] is very similar to the [OffloadedCache](#offloaded-cache) except that the cache size is fixed to a pre-allocated maximum. Otherwise, [`OffloadedStaticCache`] only keeps the current layer cache on the GPU and the rest are moved to the CPU.
Enable [`OffloadedStaticCache`] by configuring `cache_implementation="offloaded_static"` in [`~GenerationMixin.generate`].
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map={"": 0})
inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="offloaded_static")
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
```
Cache offloading requires a CUDA GPU or Intel XPU.
### Sliding window cache
[`SlidingWindowCache`] implements a sliding window over the previous kv pairs, and only keeps the last `sliding_window` tokens. This cache type is designed to only work with models that support *sliding window attention*, such as [Mistral](./model_doc/mistral). Older kv states are discarded and replaced by new kv states.
Enable [`SlidingWindowCache`] by configuring `cache_implementation="sliding_window"` in [`~GenerationMixin.generate`].
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, device_map="auto")
inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
tokenizer.batch_decode(out, skip_special_tokens=True)[0]
```
## Model caches
Some model types, like encoder-decoder models or [Gemma2](./model_doc/gemma2) and [Mamba](./model_doc/mamba), have dedicated cache classes.
### Encoder-decoder cache
## Encoder-decoder cache
[`EncoderDecoderCache`] is designed for encoder-decoder models. It manages both the self-attention and cross-attention caches to ensure storage and retrieval of previous kv pairs. It is possible to individually set a different cache type for the encoder and decoder.
This cache type doesn't require any setup. It can be used when calling [`~GenerationMixin.generate`] or a models `forward` method.
This cache type doesn't require any setup. It is a simple wrapper around two [`Cache`] objects as described above, which are used independently and directly by the model.
> [!TIP]
> The [`EncoderDecoderCache`] currently only supports [Whisper](./model_doc/whisper).
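As a rough sketch (assuming the two-cache constructor), you can also build the wrapper explicitly and pass it as `past_key_values` to a supported model's `generate`:
```py
from transformers import DynamicCache, EncoderDecoderCache

# One cache for the decoder self-attention, one for the cross-attention over encoder states
self_attention_cache = DynamicCache()
cross_attention_cache = DynamicCache()
past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
```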
### Model-specific caches
## Model-specific caches
Some models have a unique way of storing past kv pairs or states that is not compatible with any other cache classes.
[Gemma2](./model_doc/gemma2) requires [`HybridCache`], which uses a combination of [`SlidingWindowCache`] for sliding window attention and [`StaticCache`] for global attention under the hood.
Mamba models, such as [Mamba](./model_doc/mamba), require a specific cache because the model doesn't have an attention mechanism or kv states. Thus, they are not compatible with the above [`Cache`] classes.
[Mamba](./model_doc/mamba) requires [`MambaCache`] because the model doesn't have an attention mechanism or kv states.
## Iterative generation
# Iterative generation
A cache can also work in iterative generation settings where there is back-and-forth interaction with a model (chatbots). Like regular generation, iterative generation with a cache allows a model to efficiently handle ongoing conversations without recomputing the entire context at each step.
@ -271,16 +219,10 @@ For example, some models use special `<think> ... </think>` tokens during reason
```py
import torch
from transformers import AutoTokenizer,AutoModelForCausalLM
from transformers.cache_utils import (
DynamicCache,
StaticCache,
SlidingWindowCache,
QuantoQuantizedCache,
)
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache, StaticCache
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
@ -297,7 +239,7 @@ for prompt in user_prompts:
messages.append({"role": "assistant", "content": completion})
```
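For reference, a condensed sketch of such a chat loop reusing a single cache across turns (the prompts and token budget are illustrative):
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache

model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

past_key_values = DynamicCache(config=model.config)
messages = []
for prompt in ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]:
    messages.append({"role": "user", "content": prompt})
    inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
    input_length = inputs.shape[1]
    # The cache already holds the previous turns, so only the new tokens are processed
    outputs = model.generate(inputs, past_key_values=past_key_values, do_sample=False, max_new_tokens=64)
    completion = tokenizer.decode(outputs[0, input_length:], skip_special_tokens=True)
    messages.append({"role": "assistant", "content": completion})
```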
## Prefill a cache
## Prefill a cache (prefix caching)
In some situations, you may want to fill a [`Cache`] with kv pairs for a certain prefix prompt and reuse it to generate different sequences.
@ -309,7 +251,7 @@ import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, StaticCache
model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map={"": 0})
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map={"": 0})
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Init StaticCache with big enough max-length (1024 tokens for the below example)
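A condensed sketch of this prefix-caching pattern (using a [`DynamicCache`] for simplicity and reusing a copy of the prefilled cache for each continuation; the prompts are illustrative):
```py
import copy
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

model_id = "meta-llama/Llama-2-7b-chat-hf"
model = AutoModelForCausalLM.from_pretrained(model_id, dtype=torch.bfloat16, device_map={"": 0})
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Prefill the cache once with the shared prefix
prefix = "You are a helpful assistant. Answer concisely.\n"
prefix_inputs = tokenizer(prefix, return_tensors="pt").to(model.device)
prompt_cache = DynamicCache(config=model.config)
with torch.no_grad():
    model(**prefix_inputs, past_key_values=prompt_cache)

# Reuse a copy of the prefilled cache for each different continuation
for question in ["What is the boiling point of water?", "Name three prime numbers."]:
    inputs = tokenizer(prefix + question, return_tensors="pt").to(model.device)
    past_key_values = copy.deepcopy(prompt_cache)
    out = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=20)
    print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```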

View File

@ -53,7 +53,7 @@ import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", dtype="auto", device_map="auto")
model.generation_config.cache_implementation = "static"
@ -83,7 +83,7 @@ import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", dtype="auto", device_map="auto")
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
@ -195,7 +195,7 @@ import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype="auto", device_map="auto")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", dtype="auto", device_map="auto")
model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
@ -246,7 +246,7 @@ device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype="auto").to(device)
assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
outputs = model.generate(**inputs, assistant_model=assistant_model)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
@ -267,7 +267,7 @@ device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
inputs = tokenizer("Einstein's theory of relativity states", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype="auto").to(device)
assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.7)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
@ -295,7 +295,7 @@ device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype="auto").to(device)
assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m").to(device)
outputs = model.generate(**inputs, prompt_lookup_num_tokens=3)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
@ -316,7 +316,7 @@ device = infer_device()
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
inputs = tokenizer("The second law of thermodynamics states", return_tensors="pt").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", torch_dtype="auto").to(device)
model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b", dtype="auto").to(device)
outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, do_sample=True, temperature=0.7)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
["The second law of thermodynamics states that energy cannot be created nor destroyed. It's not a"]
@ -342,7 +342,7 @@ quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
quantization_config=quant_config,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
)
@ -350,7 +350,7 @@ model = AutoModelForCausalLM.from_pretrained(
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
quantization_config=quant_config,
torch_dtype=torch.bfloat16
dtype=torch.bfloat16
)
model.set_attention_implementation("flash_attention_2")
```
@ -371,7 +371,7 @@ from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
)
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
@ -396,14 +396,14 @@ Use the Model Memory Calculator below to estimate and compare how much memory is
height="450"
></iframe>
To load a model in half-precision, set the [torch_dtype](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.torch_dtype) parameter in [`~transformers.AutoModelForCausalLM.from_pretrained`] to `torch.bfloat16`. This requires 13.74GB of memory.
To load a model in half-precision, set the [dtype](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.dtype) parameter in [`~transformers.AutoModelForCausalLM.from_pretrained`] to `torch.bfloat16`. This requires 13.74GB of memory.
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
model = AutoModelForCausalLM.from_pretrained(
"mistralai/Mistral-7B-v0.1", torch_dtype=torch.bfloat16, device_map="auto",
"mistralai/Mistral-7B-v0.1", dtype=torch.bfloat16, device_map="auto",
)
```

View File

@ -56,7 +56,7 @@ Tokenize your input, and set the [`~PreTrainedTokenizer.padding_side`] parameter
```py
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(model.device)
```
Pass the inputs to [`~GenerationMixin.generate`] to generate tokens, and [`~PreTrainedTokenizer.batch_decode`] the generated tokens back to text.
@ -164,7 +164,7 @@ The section below covers some common issues you may encounter during text genera
[`~GenerationMixin.generate`] returns up to 20 tokens by default unless otherwise specified in a model's [`GenerationConfig`]. It is highly recommended to manually set the number of generated tokens with the `max_new_tokens` parameter to control the output length. [Decoder-only](https://hf.co/learn/nlp-course/chapter1/6?fw=pt) models return the initial prompt along with the generated tokens.
```py
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to(model.device)
```
<hfoptions id="output-length">
@ -195,7 +195,7 @@ The default decoding strategy in [`~GenerationMixin.generate`] is *greedy search
For example, enable a [multinomial sampling](./generation_strategies#multinomial-sampling) strategy to generate more diverse outputs. Refer to the [Generation strategy](./generation_strategies) guide for more decoding strategies.
```py
model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to(model.device)
```
<hfoptions id="decoding">
@ -227,7 +227,7 @@ Inputs need to be padded if they don't have the same length. But LLMs aren't tra
```py
model_inputs = tokenizer(
["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")
).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'
@ -241,7 +241,7 @@ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_s
tokenizer.pad_token = tokenizer.eos_token
model_inputs = tokenizer(
["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
).to("cuda")
).to(model.device)
generated_ids = model.generate(**model_inputs)
tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 3, 4, 5, 6,'
@ -270,7 +270,7 @@ model = AutoModelForCausalLM.from_pretrained(
```py
prompt = """How many cats does it take to change a light bulb? Reply as a pirate."""
model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
input_length = model_inputs.input_ids.shape[1]
generated_ids = model.generate(**model_inputs, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
@ -288,7 +288,7 @@ messages = [
},
{"role": "user", "content": "How many cats does it take to change a light bulb?"},
]
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
input_length = model_inputs.shape[1]
generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=50)
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])

View File

@ -84,7 +84,7 @@ We first load the model and tokenizer and then pass both to Transformers' [pipel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
@ -125,7 +125,7 @@ Note that if we had tried to run the model in full float32 precision, a whopping
> Almost all models are trained in bfloat16 nowadays, so there is no reason to run the model in full float32 precision if [your GPU supports bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). Float32 won't give better inference results than the precision that was used to train the model.
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"torch_dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., torch_dtype=...)` except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., dtype=...)` except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
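For example, `dtype="auto"` loads the weights directly in the precision recorded in that config entry:
```py
from transformers import AutoModelForCausalLM

# "auto" picks up the precision stored in the checkpoint's config
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", dtype="auto", device_map="auto")
```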
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
@ -394,7 +394,7 @@ long_prompt = 10 * system_prompt + prompt
We instantiate our model again in bfloat16 precision.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

View File

@ -17,9 +17,8 @@ rendered properly in your Markdown viewer.
# Callbacks
Callbacks are objects that can customize the behavior of the training loop in the PyTorch
[`Trainer`] (this feature is not yet implemented in TensorFlow) that can inspect the training loop
state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
stopping).
[`Trainer`] that can inspect the training loop state (for progress reporting, logging on TensorBoard or other ML
platforms...) and take decisions (like early stopping).
Callbacks are "read only" pieces of code; apart from the [`TrainerControl`] object they return, they
cannot change anything in the training loop. For customizations that require changes in the training loop, you should
@ -48,7 +47,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
- [`~integrations.DVCLiveCallback`] if [dvclive](https://dvc.org/doc/dvclive) is installed.
- [`~integrations.SwanLabCallback`] if [swanlab](http://swanlab.cn/) is installed.
If a package is installed but you don't wish to use the accompanying integration, you can change `TrainingArguments.report_to` to a list of just those integrations you want to use (e.g. `["azure_ml", "wandb"]`).
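For instance, a minimal sketch restricting reporting to a single integration:
```py
from transformers import TrainingArguments

# Only report to Weights & Biases, even if other integrations are installed
args = TrainingArguments(output_dir="output", report_to=["wandb"])
```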
The main class that implements callbacks is [`TrainerCallback`]. It gets the
[`TrainingArguments`] used to instantiate the [`Trainer`], can access that

View File

@ -50,21 +50,18 @@ Examples of use can be found in the [example scripts](../examples) or [example n
[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorForWholeWordMask
[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorForPermutationLanguageModeling
[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorWithFlattening

View File

@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
# Feature Extractor
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy, PyTorch, and TensorFlow tensors.
A feature extractor is in charge of preparing input features for audio or vision models. This includes feature extraction from sequences, e.g., pre-processing audio files to generate Log-Mel Spectrogram features, feature extraction from images, e.g., cropping image files, but also padding, normalization, and conversion to NumPy and PyTorch tensors.
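For example, a minimal sketch of preparing raw audio for a speech model (the checkpoint and dummy waveform are illustrative):
```py
import numpy as np
from transformers import AutoFeatureExtractor

feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")

# One second of silence at 16 kHz, standing in for a real waveform
waveform = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
print(inputs.input_values.shape)
```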
## FeatureExtractionMixin

View File

@ -16,8 +16,7 @@ rendered properly in your Markdown viewer.
# Image Processor
An image processor is in charge of preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch, TensorFlow, Flax and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks.
An image processor is in charge of loading images (optionally), preparing input features for vision models and post processing their outputs. This includes transformations such as resizing, normalization, and conversion to PyTorch and Numpy tensors. It may also include model specific post-processing such as converting logits to segmentation masks.
Fast image processors are available for a few models and more will be added in the future. They are based on the [torchvision](https://pytorch.org/vision/stable/index.html) library and provide a significant speed-up, especially when processing on GPU.
They have the same API as the base image processors and can be used as drop-in replacements.
To use a fast image processor, you need to install the `torchvision` library, and set the `use_fast` argument to `True` when instantiating the image processor:
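A minimal sketch (the checkpoint is illustrative):
```py
from transformers import AutoImageProcessor

# Requests the torchvision-based fast image processor for this checkpoint
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
```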

View File

@ -16,22 +16,15 @@ rendered properly in your Markdown viewer.
# Models
The base classes [`PreTrainedModel`], [`TFPreTrainedModel`], and
[`FlaxPreTrainedModel`] implement the common methods for loading/saving a model either from a local
file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
S3 repository).
The base class [`PreTrainedModel`] implements the common methods for loading/saving a model either from a local
file or directory, or from a pretrained model configuration provided by the library (downloaded from the Hugging Face Hub).
[`PreTrainedModel`] and [`TFPreTrainedModel`] also implement a few methods which
are common among all the models to:
[`PreTrainedModel`] also implements a few methods which are common among all the models to:
- resize the input token embeddings when new tokens are added to the vocabulary
- prune the attention heads of the model.
The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`]
(for the PyTorch models) and [`~modeling_tf_utils.TFModuleUtilsMixin`] (for the TensorFlow models) or
for text generation, [`~generation.GenerationMixin`] (for the PyTorch models),
[`~generation.TFGenerationMixin`] (for the TensorFlow models) and
[`~generation.FlaxGenerationMixin`] (for the Flax/JAX models).
The other methods that are common to each model are defined in [`~modeling_utils.ModuleUtilsMixin`] and [`~generation.GenerationMixin`].
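For example, a short sketch of resizing the input embeddings after adding new tokens to the vocabulary (the checkpoint and token are illustrative):
```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Add a new token and resize the input embeddings to match the new vocabulary size
tokenizer.add_tokens(["<custom_token>"])
model.resize_token_embeddings(len(tokenizer))
```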
## PreTrainedModel
@ -48,22 +41,6 @@ set this to `False`.
[[autodoc]] modeling_utils.ModuleUtilsMixin
## TFPreTrainedModel
[[autodoc]] TFPreTrainedModel
- push_to_hub
- all
## TFModelUtilsMixin
[[autodoc]] modeling_tf_utils.TFModelUtilsMixin
## FlaxPreTrainedModel
[[autodoc]] FlaxPreTrainedModel
- push_to_hub
- all
## Pushing to the Hub
[[autodoc]] utils.PushToHubMixin

View File

@ -23,19 +23,13 @@ The `.optimization` module provides:
- a gradient accumulation class to accumulate the gradients of multiple batches
## AdaFactor (PyTorch)
## AdaFactor
[[autodoc]] Adafactor
## AdamWeightDecay (TensorFlow)
[[autodoc]] AdamWeightDecay
[[autodoc]] create_optimizer
## Schedules
### Learning Rate Schedules (PyTorch)
### Learning Rate Schedules
[[autodoc]] SchedulerType
@ -64,13 +58,3 @@ The `.optimization` module provides:
[[autodoc]] get_inverse_sqrt_schedule
[[autodoc]] get_wsd_schedule
### Warmup (TensorFlow)
[[autodoc]] WarmUp
## Gradient Strategies
### GradientAccumulator (TensorFlow)
[[autodoc]] GradientAccumulator

View File

@ -187,135 +187,3 @@ documented on their corresponding model page.
## SampleTSPredictionOutput
[[autodoc]] modeling_outputs.SampleTSPredictionOutput
## TFBaseModelOutput
[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
## TFBaseModelOutputWithPooling
[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling
## TFBaseModelOutputWithPoolingAndCrossAttentions
[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
## TFBaseModelOutputWithPast
[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast
## TFBaseModelOutputWithPastAndCrossAttentions
[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
## TFSeq2SeqModelOutput
[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput
## TFCausalLMOutput
[[autodoc]] modeling_tf_outputs.TFCausalLMOutput
## TFCausalLMOutputWithCrossAttentions
[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
## TFCausalLMOutputWithPast
[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast
## TFMaskedLMOutput
[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput
## TFSeq2SeqLMOutput
[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput
## TFNextSentencePredictorOutput
[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput
## TFSequenceClassifierOutput
[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput
## TFSeq2SeqSequenceClassifierOutput
[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
## TFMultipleChoiceModelOutput
[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput
## TFTokenClassifierOutput
[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput
## TFQuestionAnsweringModelOutput
[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput
## TFSeq2SeqQuestionAnsweringModelOutput
[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
## FlaxBaseModelOutput
[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput
## FlaxBaseModelOutputWithPast
[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast
## FlaxBaseModelOutputWithPooling
[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling
## FlaxBaseModelOutputWithPastAndCrossAttentions
[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
## FlaxSeq2SeqModelOutput
[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput
## FlaxCausalLMOutputWithCrossAttentions
[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
## FlaxMaskedLMOutput
[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput
## FlaxSeq2SeqLMOutput
[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput
## FlaxNextSentencePredictorOutput
[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput
## FlaxSequenceClassifierOutput
[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput
## FlaxSeq2SeqSequenceClassifierOutput
[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
## FlaxMultipleChoiceModelOutput
[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput
## FlaxTokenClassifierOutput
[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput
## FlaxQuestionAnsweringModelOutput
[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
## FlaxSeq2SeqQuestionAnsweringModelOutput
[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput

View File

@ -273,7 +273,7 @@ independently of the inputs. The caveats from the previous section still apply.
## Pipeline FP16 inference
Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will.
To enable FP16 inference, you can simply pass `torch_dtype=torch.float16` or `torch_dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
To enable FP16 inference, you can simply pass `dtype=torch.float16` or `dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
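A minimal sketch (the checkpoint is illustrative):
```py
import torch
from transformers import pipeline

# Weights are loaded in FP16 and inputs are converted internally
pipe = pipeline("text-generation", model="bigcode/octocoder", dtype=torch.float16, device_map="auto")
pipe("def fibonacci(n):", max_new_tokens=20)
```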
## Pipeline custom code
@ -363,6 +363,12 @@ Pipelines available for computer vision tasks include the following.
- __call__
- all
### KeypointMatchingPipeline
[[autodoc]] KeypointMatchingPipeline
- __call__
- all
### ObjectDetectionPipeline
[[autodoc]] ObjectDetectionPipeline

View File

@ -19,12 +19,8 @@ rendered properly in your Markdown viewer.
Each framework has a generate method for text generation implemented in their respective `GenerationMixin` class:
- PyTorch [`~generation.GenerationMixin.generate`] is implemented in [`~generation.GenerationMixin`].
- TensorFlow [`~generation.TFGenerationMixin.generate`] is implemented in [`~generation.TFGenerationMixin`].
- Flax/JAX [`~generation.FlaxGenerationMixin.generate`] is implemented in [`~generation.FlaxGenerationMixin`].
Regardless of your framework of choice, you can parameterize the generate method with a [`~generation.GenerationConfig`]
class instance. Please refer to this class for the complete list of generation parameters, which control the behavior
of the generation method.
You can parameterize the generate method with a [`~generation.GenerationConfig`] class instance. Please refer to this class for the complete list of generation parameters, which control the behavior of the generation method.
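For example, a brief sketch of passing a custom [`~generation.GenerationConfig`] to `generate` (the checkpoint and values are illustrative):
```py
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
inputs = tokenizer("The theory of special relativity states", return_tensors="pt")

# All generation parameters live on the GenerationConfig instance
generation_config = GenerationConfig(max_new_tokens=30, do_sample=True, temperature=0.7)
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```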
To learn how to inspect a model's generation configuration, what the defaults are, how to change the parameters ad hoc,
and how to create and save a customized generation configuration, refer to the
@ -46,14 +42,3 @@ like token streaming.
[[autodoc]] GenerationMixin
- generate
- compute_transition_scores
## TFGenerationMixin
[[autodoc]] TFGenerationMixin
- generate
- compute_transition_scores
## FlaxGenerationMixin
[[autodoc]] FlaxGenerationMixin
- generate

View File

@ -14,10 +14,9 @@ rendered properly in your Markdown viewer.
-->
# Video Processor
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors. Along with transformations, the `VideoProcessor` class handles video decoding from local paths or URLs (requires [`torchcodec`](https://pypi.org/project/torchcodec/)) and frame sampling according to model-specific strategies.
The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
@ -48,6 +47,47 @@ processor = torch.compile(processor)
processed_video = processor(video, return_tensors="pt")
```
#### Sampling behavior
The video processor can also sample video frames using the technique best suited for the given model. Sampling behavior is controlled with the `do_sample_frames` argument and can be configured through model-specific parameters such as `num_frames` or `fps` (the rate at which the video will be sampled). If the input video is given as a local path or URL (`str`), the processor will decode it automatically. To obtain metadata about the decoded video, such as sampled frame indices, original dimensions, duration, and fps, pass `return_metadata=True` to the processor.
<Tip warning={false}>
- Specifying `num_frames` does not guarantee the output will contain exactly that number of frames. Depending on the model, the sampler may enforce minimum or maximum frame limits.
- The default decoder is [`torchcodec`](https://pypi.org/project/torchcodec/), which must be installed.
</Tip>
```python
from transformers import AutoVideoProcessor
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
processed_video_inputs = processor(videos=["video_path.mp4"], return_metadata=True, do_sample_frames=True, return_tensors="pt")
video_metadata = processed_video_inputs["video_metadata"]
# See how many frames the original video had and what was the original FPS
print(video_metadata.total_num_frames, video_metadata.fps)
```
If you pass an already decoded video array but still want to enable model-specific frame sampling, it is strongly recommended to provide `video_metadata`. This allows the sampler to know the original video's duration and FPS. You can pass metadata as a `VideoMetadata` object or as a plain dict.
```python
import torch
from transformers import AutoVideoProcessor
from transformers.video_utils import VideoMetadata
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
my_decoded_video = torch.randint(0, 255, size=(100, 3, 1280, 1280))  # a short, already decoded video of 100 frames
video_metadata = VideoMetadata(
total_num_frames=100,
fps=24,
duration=4.1, # in seconds
)
processed_video_inputs = processor(videos=["video_path.mp4"], video_metadata=video_metadata, do_sample_frames=True, num_frames=10, return_tensors="pt")
print(processed_video_inputs.pixel_values_videos.shape)
>>> [10, 3, 384, 384]
```
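Sampling can also be driven by a target rate rather than a fixed frame count. A minimal sketch, assuming the same checkpoint and that the model's sampler accepts the `fps` argument mentioned above:
```python
from transformers import AutoVideoProcessor

processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")

# Sample roughly one frame per second of video instead of a fixed number of frames
processed_video_inputs = processor(
    videos=["video_path.mp4"],
    do_sample_frames=True,
    fps=1,
    return_metadata=True,
    return_tensors="pt",
)
print(processed_video_inputs.pixel_values_videos.shape)
```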
## BaseVideoProcessor

View File

@ -100,6 +100,3 @@ probs = outputs.logits_per_image.softmax(dim=-1)
[[autodoc]] Aimv2TextModel
- forward
</pt>
<tf>

View File

@ -18,9 +18,7 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
<img alt= "TensorFlow" src= "https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white" >
<img alt= "Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style…Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC">
<img alt="SDPA" src= "https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white" >
<img alt="SDPA" src= "https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white" >
</div>
</div>
@ -52,7 +50,7 @@ from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="albert-base-v2",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create [MASK] through a process known as photosynthesis.", top_k=5)
@ -68,7 +66,7 @@ from transformers import AutoModelForMaskedLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2")
model = AutoModelForMaskedLM.from_pretrained(
"albert/albert-base-v2",
torch_dtype=torch.float16,
dtype=torch.float16,
attn_implementation="sdpa",
device_map="auto"
)
@ -110,42 +108,30 @@ The resources provided in the following sections consist of a list of official H
- [`AlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification).
- [`TFAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification).
- [`FlaxAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb).
- Check the [Text classification task guide](../tasks/sequence_classification) on how to use the model.
<PipelineTag pipeline="token-classification"/>
- [`AlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification).
- [`TFAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).
- [`FlaxAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification).
- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course.
- Check the [Token classification task guide](../tasks/token_classification) on how to use the model.
<PipelineTag pipeline="fill-mask"/>
- [`AlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
- [`TFAlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
- [`FlaxAlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb).
- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course.
- Check the [Masked language modeling task guide](../tasks/masked_language_modeling) on how to use the model.
<PipelineTag pipeline="question-answering"/>
- [`AlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
- [`TFAlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
- [`FlaxAlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering).
- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course.
- Check the [Question answering task guide](../tasks/question_answering) on how to use the model.
**Multiple choice**
- [`AlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb).
- [`TFAlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb).
- Check the [Multiple choice task guide](../tasks/multiple_choice) on how to use the model.
## AlbertConfig
@ -164,11 +150,6 @@ The resources provided in the following sections consist of a list of official H
[[autodoc]] models.albert.modeling_albert.AlbertForPreTrainingOutput
[[autodoc]] models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput
<frameworkcontent>
<pt>
## AlbertModel
[[autodoc]] AlbertModel - forward
@ -196,69 +177,3 @@ The resources provided in the following sections consist of a list of official H
## AlbertForQuestionAnswering
[[autodoc]] AlbertForQuestionAnswering - forward
</pt>
<tf>
## TFAlbertModel
[[autodoc]] TFAlbertModel - call
## TFAlbertForPreTraining
[[autodoc]] TFAlbertForPreTraining - call
## TFAlbertForMaskedLM
[[autodoc]] TFAlbertForMaskedLM - call
## TFAlbertForSequenceClassification
[[autodoc]] TFAlbertForSequenceClassification - call
## TFAlbertForMultipleChoice
[[autodoc]] TFAlbertForMultipleChoice - call
## TFAlbertForTokenClassification
[[autodoc]] TFAlbertForTokenClassification - call
## TFAlbertForQuestionAnswering
[[autodoc]] TFAlbertForQuestionAnswering - call
</tf>
<jax>
## FlaxAlbertModel
[[autodoc]] FlaxAlbertModel - __call__
## FlaxAlbertForPreTraining
[[autodoc]] FlaxAlbertForPreTraining - __call__
## FlaxAlbertForMaskedLM
[[autodoc]] FlaxAlbertForMaskedLM - __call__
## FlaxAlbertForSequenceClassification
[[autodoc]] FlaxAlbertForSequenceClassification - __call__
## FlaxAlbertForMultipleChoice
[[autodoc]] FlaxAlbertForMultipleChoice - __call__
## FlaxAlbertForTokenClassification
[[autodoc]] FlaxAlbertForTokenClassification - __call__
## FlaxAlbertForQuestionAnswering
[[autodoc]] FlaxAlbertForQuestionAnswering - __call__
</jax>
</frameworkcontent>

View File

@ -44,7 +44,7 @@ pipeline = pipeline(
task="zero-shot-image-classification",
model="kakaobrain/align-base",
device=0,
torch_dtype=torch.bfloat16
dtype=torch.bfloat16
)
candidate_labels = [

View File

@ -40,7 +40,7 @@ import requests
from PIL import Image
from transformers import AltCLIPModel, AltCLIPProcessor
model = AltCLIPModel.from_pretrained("BAAI/AltCLIP", torch_dtype=torch.bfloat16)
model = AltCLIPModel.from_pretrained("BAAI/AltCLIP", dtype=torch.bfloat16)
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
@ -74,7 +74,7 @@ from transformers import AltCLIPModel, AltCLIPProcessor, TorchAoConfig
model = AltCLIPModel.from_pretrained(
"BAAI/AltCLIP",
quantization_config=TorchAoConfig("int4_weight_only", group_size=128),
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
)
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")

View File

@ -44,7 +44,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="arcee-ai/AFM-4.5B",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
@ -62,7 +62,7 @@ from transformers import AutoTokenizer, ArceeForCausalLM
tokenizer = AutoTokenizer.from_pretrained("arcee-ai/AFM-4.5B")
model = ArceeForCausalLM.from_pretrained(
"arcee-ai/AFM-4.5B",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto"
)

View File

@ -45,7 +45,7 @@ pipeline = pipeline(
"image-to-text",
model="rhymes-ai/Aria",
device=0,
torch_dtype=torch.bfloat16
dtype=torch.bfloat16
)
pipeline(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
@ -63,7 +63,7 @@ from transformers import AutoModelForCausalLM, AutoProcessor
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria",
device_map="auto",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
attn_implementation="sdpa"
)
@ -109,7 +109,7 @@ from transformers import TorchAoConfig, AutoModelForCausalLM, AutoProcessor
quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = AutoModelForCausalLM.from_pretrained(
"rhymes-ai/Aria-sequential_mlp",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)

View File

@ -63,7 +63,7 @@ SDPA is used by default for `torch>=2.1.1` when an implementation is available,
```
from transformers import ASTForAudioClassification
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", torch_dtype=torch.float16)
model = ASTForAudioClassification.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593", attn_implementation="sdpa", dtype=torch.float16)
...
```

View File

@ -30,7 +30,7 @@ model = AutoModel.from_pretrained("google-bert/bert-base-cased")
will create a model that is an instance of [`BertModel`].
There is one class of `AutoModel` for each task, and for each backend (PyTorch, TensorFlow, or Flax).
There is one class of `AutoModel` for each task.
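The task-specific auto classes follow the same pattern. A minimal sketch, assuming the `google-bert/bert-base-cased` checkpoint used above (its sequence classification head is newly initialized, so the logits are only illustrative):
```python
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# For a BERT checkpoint, AutoModelForSequenceClassification resolves to BertForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased")

inputs = tokenizer("Plants create energy through photosynthesis.", return_tensors="pt")
outputs = model(**inputs)
print(type(model).__name__, outputs.logits.shape)  # BertForSequenceClassification torch.Size([1, 2])
```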
## Extending the Auto Classes
@ -90,14 +90,6 @@ The following auto classes are available for instantiating a base model class wi
[[autodoc]] AutoModel
### TFAutoModel
[[autodoc]] TFAutoModel
### FlaxAutoModel
[[autodoc]] FlaxAutoModel
## Generic pretraining classes
The following auto classes are available for instantiating a model with a pretraining head.
@ -106,14 +98,6 @@ The following auto classes are available for instantiating a model with a pretra
[[autodoc]] AutoModelForPreTraining
### TFAutoModelForPreTraining
[[autodoc]] TFAutoModelForPreTraining
### FlaxAutoModelForPreTraining
[[autodoc]] FlaxAutoModelForPreTraining
## Natural Language Processing
The following auto classes are available for the following natural language processing tasks.
@ -122,114 +106,42 @@ The following auto classes are available for the following natural language proc
[[autodoc]] AutoModelForCausalLM
### TFAutoModelForCausalLM
[[autodoc]] TFAutoModelForCausalLM
### FlaxAutoModelForCausalLM
[[autodoc]] FlaxAutoModelForCausalLM
### AutoModelForMaskedLM
[[autodoc]] AutoModelForMaskedLM
### TFAutoModelForMaskedLM
[[autodoc]] TFAutoModelForMaskedLM
### FlaxAutoModelForMaskedLM
[[autodoc]] FlaxAutoModelForMaskedLM
### AutoModelForMaskGeneration
[[autodoc]] AutoModelForMaskGeneration
### TFAutoModelForMaskGeneration
[[autodoc]] TFAutoModelForMaskGeneration
### AutoModelForSeq2SeqLM
[[autodoc]] AutoModelForSeq2SeqLM
### TFAutoModelForSeq2SeqLM
[[autodoc]] TFAutoModelForSeq2SeqLM
### FlaxAutoModelForSeq2SeqLM
[[autodoc]] FlaxAutoModelForSeq2SeqLM
### AutoModelForSequenceClassification
[[autodoc]] AutoModelForSequenceClassification
### TFAutoModelForSequenceClassification
[[autodoc]] TFAutoModelForSequenceClassification
### FlaxAutoModelForSequenceClassification
[[autodoc]] FlaxAutoModelForSequenceClassification
### AutoModelForMultipleChoice
[[autodoc]] AutoModelForMultipleChoice
### TFAutoModelForMultipleChoice
[[autodoc]] TFAutoModelForMultipleChoice
### FlaxAutoModelForMultipleChoice
[[autodoc]] FlaxAutoModelForMultipleChoice
### AutoModelForNextSentencePrediction
[[autodoc]] AutoModelForNextSentencePrediction
### TFAutoModelForNextSentencePrediction
[[autodoc]] TFAutoModelForNextSentencePrediction
### FlaxAutoModelForNextSentencePrediction
[[autodoc]] FlaxAutoModelForNextSentencePrediction
### AutoModelForTokenClassification
[[autodoc]] AutoModelForTokenClassification
### TFAutoModelForTokenClassification
[[autodoc]] TFAutoModelForTokenClassification
### FlaxAutoModelForTokenClassification
[[autodoc]] FlaxAutoModelForTokenClassification
### AutoModelForQuestionAnswering
[[autodoc]] AutoModelForQuestionAnswering
### TFAutoModelForQuestionAnswering
[[autodoc]] TFAutoModelForQuestionAnswering
### FlaxAutoModelForQuestionAnswering
[[autodoc]] FlaxAutoModelForQuestionAnswering
### AutoModelForTextEncoding
[[autodoc]] AutoModelForTextEncoding
### TFAutoModelForTextEncoding
[[autodoc]] TFAutoModelForTextEncoding
## Computer vision
The following auto classes are available for the following computer vision tasks.
@ -242,14 +154,6 @@ The following auto classes are available for the following computer vision tasks
[[autodoc]] AutoModelForImageClassification
### TFAutoModelForImageClassification
[[autodoc]] TFAutoModelForImageClassification
### FlaxAutoModelForImageClassification
[[autodoc]] FlaxAutoModelForImageClassification
### AutoModelForVideoClassification
[[autodoc]] AutoModelForVideoClassification
@ -266,10 +170,6 @@ The following auto classes are available for the following computer vision tasks
[[autodoc]] AutoModelForMaskedImageModeling
### TFAutoModelForMaskedImageModeling
[[autodoc]] TFAutoModelForMaskedImageModeling
### AutoModelForObjectDetection
[[autodoc]] AutoModelForObjectDetection
@ -286,10 +186,6 @@ The following auto classes are available for the following computer vision tasks
[[autodoc]] AutoModelForSemanticSegmentation
### TFAutoModelForSemanticSegmentation
[[autodoc]] TFAutoModelForSemanticSegmentation
### AutoModelForInstanceSegmentation
[[autodoc]] AutoModelForInstanceSegmentation
@ -302,10 +198,6 @@ The following auto classes are available for the following computer vision tasks
[[autodoc]] AutoModelForZeroShotImageClassification
### TFAutoModelForZeroShotImageClassification
[[autodoc]] TFAutoModelForZeroShotImageClassification
### AutoModelForZeroShotObjectDetection
[[autodoc]] AutoModelForZeroShotObjectDetection
@ -320,10 +212,6 @@ The following auto classes are available for the following audio tasks.
### AutoModelForAudioFrameClassification
[[autodoc]] TFAutoModelForAudioClassification
### TFAutoModelForAudioFrameClassification
[[autodoc]] AutoModelForAudioFrameClassification
### AutoModelForCTC
@ -334,14 +222,6 @@ The following auto classes are available for the following audio tasks.
[[autodoc]] AutoModelForSpeechSeq2Seq
### TFAutoModelForSpeechSeq2Seq
[[autodoc]] TFAutoModelForSpeechSeq2Seq
### FlaxAutoModelForSpeechSeq2Seq
[[autodoc]] FlaxAutoModelForSpeechSeq2Seq
### AutoModelForAudioXVector
[[autodoc]] AutoModelForAudioXVector
@ -366,18 +246,10 @@ The following auto classes are available for the following multimodal tasks.
[[autodoc]] AutoModelForTableQuestionAnswering
### TFAutoModelForTableQuestionAnswering
[[autodoc]] TFAutoModelForTableQuestionAnswering
### AutoModelForDocumentQuestionAnswering
[[autodoc]] AutoModelForDocumentQuestionAnswering
### TFAutoModelForDocumentQuestionAnswering
[[autodoc]] TFAutoModelForDocumentQuestionAnswering
### AutoModelForVisualQuestionAnswering
[[autodoc]] AutoModelForVisualQuestionAnswering
@ -386,14 +258,6 @@ The following auto classes are available for the following multimodal tasks.
[[autodoc]] AutoModelForVision2Seq
### TFAutoModelForVision2Seq
[[autodoc]] TFAutoModelForVision2Seq
### FlaxAutoModelForVision2Seq
[[autodoc]] FlaxAutoModelForVision2Seq
### AutoModelForImageTextToText
[[autodoc]] AutoModelForImageTextToText

View File

@ -67,7 +67,7 @@ model_id = "CohereLabs/aya-vision-8b"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16
model_id, device_map="auto", dtype=torch.float16
)
# Format message with the aya-vision chat template
@ -153,7 +153,7 @@ print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b")
model = AutoModelForImageTextToText.from_pretrained(
"CohereForAI/aya-vision-8b", device_map="auto", torch_dtype=torch.float16
"CohereForAI/aya-vision-8b", device_map="auto", dtype=torch.float16
)
messages = [
@ -199,7 +199,7 @@ print(processor.tokenizer.decode(generated[0], skip_special_tokens=True))
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
"CohereForAI/aya-vision-8b", device_map="auto", torch_dtype=torch.float16
"CohereForAI/aya-vision-8b", device_map="auto", dtype=torch.float16
)
batch_messages = [

View File

@ -46,7 +46,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="ibm-ai-platform/Bamba-9B-v2",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device=0
)
pipeline("Plants create energy through a process known as")
@ -61,7 +61,7 @@ import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("ibm-ai-platform/Bamba-9B-v2")
model = AutoModelForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B-v2", torch_dtype=torch.bfloat16, device_map="auto", attn_implementation="sdpa")
model = AutoModelForCausalLM.from_pretrained("ibm-ai-platform/Bamba-9B-v2", dtype=torch.bfloat16, device_map="auto", attn_implementation="sdpa")
input_ids = tokenizer("Plants create energy through a process known as", return_tensors="pt").to(model.device)
output = model.generate(**input_ids)

View File

@ -47,7 +47,7 @@ from transformers import BarkModel, infer_device
import torch
device = infer_device()
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16).to(device)
```
#### Using CPU offload
@ -92,7 +92,7 @@ pip install -U flash-attn --no-build-isolation
To load a model using Flash Attention 2, we can pass the `attn_implementation="flash_attention_2"` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
```python
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
```
##### Performance comparison
@ -120,7 +120,7 @@ import torch
device = infer_device()
# load in fp16 and use Flash Attention 2
model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
model = BarkModel.from_pretrained("suno/bark-small", dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
# enable CPU offload
model.enable_cpu_offload()

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
@ -43,7 +40,7 @@ from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="facebook/bart-large",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create <mask> through a process known as photosynthesis.")
@ -61,7 +58,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForMaskedLM.from_pretrained(
"facebook/bart-large",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
attn_implementation="sdpa"
)
@ -112,10 +109,6 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran
[[autodoc]] BartTokenizerFast
- all
<frameworkcontent>
<pt>
## BartModel
[[autodoc]] BartModel
@ -140,62 +133,3 @@ echo -e "Plants create <mask> through a process known as photosynthesis." | tran
[[autodoc]] BartForCausalLM
- forward
</pt>
<tf>
## TFBartModel
[[autodoc]] TFBartModel
- call
## TFBartForConditionalGeneration
[[autodoc]] TFBartForConditionalGeneration
- call
## TFBartForSequenceClassification
[[autodoc]] TFBartForSequenceClassification
- call
</tf>
<jax>
## FlaxBartModel
[[autodoc]] FlaxBartModel
- __call__
- encode
- decode
## FlaxBartForConditionalGeneration
[[autodoc]] FlaxBartForConditionalGeneration
- __call__
- encode
- decode
## FlaxBartForSequenceClassification
[[autodoc]] FlaxBartForSequenceClassification
- __call__
- encode
- decode
## FlaxBartForQuestionAnswering
[[autodoc]] FlaxBartForQuestionAnswering
- __call__
- encode
- decode
## FlaxBartForCausalLM
[[autodoc]] FlaxBartForCausalLM
- __call__
</jax>
</frameworkcontent>

View File

@ -18,9 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
</div>
@ -47,7 +44,7 @@ from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="moussaKam/barthez",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.")
@ -65,7 +62,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForMaskedLM.from_pretrained(
"moussaKam/barthez",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to(model.device)

View File

@ -18,9 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
</div>
@ -49,13 +46,13 @@ from transformers import pipeline
pipeline = pipeline(
task="summarization",
model="vinai/bartpho-word",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
text = """
Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
trên Trái Đất. Quang hợp trong thực vật thường liên quan đến chất tố diệp lục màu xanh lá cây và tạo ra oxy như một sản phẩm phụ
"""
pipeline(text)
@ -73,13 +70,13 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = BartForConditionalGeneration.from_pretrained(
"vinai/bartpho-word",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
)
text = """
Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
trên Trái Đất. Quang hợp trong thực vật thường liên quan đến chất tố diệp lục màu xanh lá cây và tạo ra oxy như một sản phẩm phụ
"""
inputs = tokenizer(text, return_tensors="pt").to(model.device)
@ -92,8 +89,8 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_
<hfoption id="transformers CLI">
```bash
echo -e "Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
echo -e "Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận và chuyển hóa năng lượng ánh sáng Mặt trời của thực vật,
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
trên Trái Đất. Quang hợp trong thực vật thường liên quan đến chất tố diệp lục màu xanh lá cây và tạo ra oxy như một sản phẩm phụ" | \
transformers run --task summarization --model vinai/bartpho-word --device 0
```

View File

@ -19,8 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
@ -47,8 +45,7 @@ with previous pre-training methods. For example, base-size BEiT achieves 83.2% t
significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains
86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%).*
This model was contributed by [nielsr](https://huggingface.co/nielsr). The JAX/FLAX version of this model was
contributed by [kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/beit).
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/beit).
## Usage tips
@ -81,24 +78,24 @@ alt="drawing" width="600"/>
### Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
```
from transformers import BeitForImageClassification
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16)
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", dtype=torch.float16)
...
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) with `float16` and
On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) with `float16` and
`microsoft/beit-base-patch16-224` model, we saw the following improvements during training and inference:
#### Training
@ -134,8 +131,6 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] models.beit.modeling_beit.BeitModelOutputWithPooling
[[autodoc]] models.beit.modeling_flax_beit.FlaxBeitModelOutputWithPooling
## BeitConfig
[[autodoc]] BeitConfig
@ -158,9 +153,6 @@ If you're interested in submitting a resource to be included here, please feel f
- preprocess
- post_process_semantic_segmentation
<frameworkcontent>
<pt>
## BeitModel
[[autodoc]] BeitModel
@ -180,24 +172,3 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] BeitForSemanticSegmentation
- forward
</pt>
<jax>
## FlaxBeitModel
[[autodoc]] FlaxBeitModel
- __call__
## FlaxBeitForMaskedImageModeling
[[autodoc]] FlaxBeitForMaskedImageModeling
- __call__
## FlaxBeitForImageClassification
[[autodoc]] FlaxBeitForImageClassification
- __call__
</jax>
</frameworkcontent>

View File

@ -43,7 +43,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text2text-generation",
model="google/roberta2roberta_L-24_discofuse",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create energy through ")
@ -56,7 +56,7 @@ pipeline("Plants create energy through ")
import torch
from transformers import EncoderDecoderModel, AutoTokenizer
model = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse", torch_dtype="auto")
model = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse", dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
input_ids = tokenizer(
@ -94,7 +94,7 @@ quantization_config = BitsAndBytesConfig(
model = EncoderDecoderModel.from_pretrained(
"google/roberta2roberta_L-24_discofuse",
quantization_config=quantization_config,
torch_dtype="auto"
dtype="auto"
)
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
## Overview
@ -77,10 +74,10 @@ Example of using a model with Character tokenization:
This model was contributed by [cl-tohoku](https://huggingface.co/cl-tohoku).
<Tip>
<Tip>
This implementation is the same as BERT, except for tokenization method. Refer to [BERT documentation](bert) for
API reference information.
This implementation is the same as BERT, except for tokenization method. Refer to [BERT documentation](bert) for
API reference information.
</Tip>
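As a brief illustration of the character tokenization mentioned above, a minimal sketch, assuming the `cl-tohoku/bert-base-japanese-char` checkpoint and that the MeCab dependencies (e.g. `fugashi` and a MeCab dictionary package) are installed:
```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Character tokenization splits the text into single characters rather than WordPiece subwords
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
model = AutoModelForMaskedLM.from_pretrained("cl-tohoku/bert-base-japanese-char")

inputs = tokenizer("吾輩は猫である。", return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(inputs.input_ids[0]))

outputs = model(**inputs)
print(outputs.logits.shape)
```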

View File

@ -18,9 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
@ -46,7 +43,7 @@ from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="google-bert/bert-base-uncased",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create [MASK] through a process known as photosynthesis.")
@ -64,11 +61,11 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForMaskedLM.from_pretrained(
"google-bert/bert-base-uncased",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model(**inputs)
@ -157,104 +154,6 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran
[[autodoc]] BertForQuestionAnswering
- forward
## TFBertTokenizer
[[autodoc]] TFBertTokenizer
## TFBertModel
[[autodoc]] TFBertModel
- call
## TFBertForPreTraining
[[autodoc]] TFBertForPreTraining
- call
## TFBertLMHeadModel
[[autodoc]] TFBertLMHeadModel
- call
## TFBertForMaskedLM
[[autodoc]] TFBertForMaskedLM
- call
## TFBertForNextSentencePrediction
[[autodoc]] TFBertForNextSentencePrediction
- call
## TFBertForSequenceClassification
[[autodoc]] TFBertForSequenceClassification
- call
## TFBertForMultipleChoice
[[autodoc]] TFBertForMultipleChoice
- call
## TFBertForTokenClassification
[[autodoc]] TFBertForTokenClassification
- call
## TFBertForQuestionAnswering
[[autodoc]] TFBertForQuestionAnswering
- call
## FlaxBertModel
[[autodoc]] FlaxBertModel
- __call__
## FlaxBertForPreTraining
[[autodoc]] FlaxBertForPreTraining
- __call__
## FlaxBertForCausalLM
[[autodoc]] FlaxBertForCausalLM
- __call__
## FlaxBertForMaskedLM
[[autodoc]] FlaxBertForMaskedLM
- __call__
## FlaxBertForNextSentencePrediction
[[autodoc]] FlaxBertForNextSentencePrediction
- __call__
## FlaxBertForSequenceClassification
[[autodoc]] FlaxBertForSequenceClassification
- __call__
## FlaxBertForMultipleChoice
[[autodoc]] FlaxBertForMultipleChoice
- __call__
## FlaxBertForTokenClassification
[[autodoc]] FlaxBertForTokenClassification
- __call__
## FlaxBertForQuestionAnswering
[[autodoc]] FlaxBertForQuestionAnswering
- __call__
## Bert specific outputs
[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput
[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput

View File

@ -20,9 +20,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
## BERTweet
@ -47,7 +44,7 @@ from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="vinai/bertweet-base",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create <mask> through a process known as photosynthesis.")
@ -64,7 +61,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForMaskedLM.from_pretrained(
"vinai/bertweet-base",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

View File

@ -18,8 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
<img alt= "Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
</div>
@ -37,14 +35,14 @@ The example below demonstrates how to predict the `[MASK]` token with [`Pipeline
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
pipeline = pipeline(
task="fill-mask",
model="google/bigbird-roberta-base",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create [MASK] through a process known as photosynthesis.")
@ -61,7 +59,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForMaskedLM.from_pretrained(
"google/bigbird-roberta-base",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)
@ -116,9 +114,6 @@ print(f"The predicted token is: {predicted_token}")
[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
<frameworkcontent>
<pt>
## BigBirdModel
[[autodoc]] BigBirdModel
@ -158,51 +153,3 @@ print(f"The predicted token is: {predicted_token}")
[[autodoc]] BigBirdForQuestionAnswering
- forward
</pt>
<jax>
## FlaxBigBirdModel
[[autodoc]] FlaxBigBirdModel
- __call__
## FlaxBigBirdForPreTraining
[[autodoc]] FlaxBigBirdForPreTraining
- __call__
## FlaxBigBirdForCausalLM
[[autodoc]] FlaxBigBirdForCausalLM
- __call__
## FlaxBigBirdForMaskedLM
[[autodoc]] FlaxBigBirdForMaskedLM
- __call__
## FlaxBigBirdForSequenceClassification
[[autodoc]] FlaxBigBirdForSequenceClassification
- __call__
## FlaxBigBirdForMultipleChoice
[[autodoc]] FlaxBigBirdForMultipleChoice
- __call__
## FlaxBigBirdForTokenClassification
[[autodoc]] FlaxBigBirdForTokenClassification
- __call__
## FlaxBigBirdForQuestionAnswering
[[autodoc]] FlaxBigBirdForQuestionAnswering
- __call__
</jax>
</frameworkcontent>

View File

@ -44,7 +44,7 @@ from transformers import pipeline
pipeline = pipeline(
task="summarization",
model="google/bigbird-pegasus-large-arxiv",
torch_dtype=torch.float32,
dtype=torch.float32,
device=0
)
pipeline("""Plants are among the most remarkable and essential life forms on Earth, possessing a unique ability to produce their own food through a process known as photosynthesis. This complex biochemical process is fundamental not only to plant life but to virtually all life on the planet.
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForSeq2SeqLM.from_pretrained(
"google/bigbird-pegasus-large-arxiv",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
)
@ -102,7 +102,7 @@ quantization_config = BitsAndBytesConfig(
)
model = AutoModelForSeq2SeqLM.from_pretrained(
"google/bigbird-pegasus-large-arxiv",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
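As a rough continuation, assuming the quantized `google/bigbird-pegasus-large-arxiv` model and its tokenizer from the snippet above, a summary could be generated like this (the input text and generation settings are illustrative):
```py
import torch

text = """Plants are among the most remarkable and essential life forms on Earth, possessing a unique
ability to produce their own food through a process known as photosynthesis."""

inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=60)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```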

View File

@ -44,7 +44,7 @@ from transformers import pipeline
generator = pipeline(
task="text-generation",
model="microsoft/biogpt",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0,
)
result = generator("Ibuprofen is best used for", truncation=True, max_length=50, do_sample=True)[0]["generated_text"]
@ -61,7 +61,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/biogpt")
model = AutoModelForCausalLM.from_pretrained(
"microsoft/biogpt",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
attn_implementation="sdpa"
)
@ -105,7 +105,7 @@ tokenizer = AutoTokenizer.from_pretrained("microsoft/BioGPT-Large")
model = AutoModelForCausalLM.from_pretrained(
"microsoft/BioGPT-Large",
quantization_config=bnb_config,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto"
)
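A short, illustrative continuation assuming the quantized `microsoft/BioGPT-Large` model and tokenizer loaded above; the prompt and generation arguments are examples only:
```py
import torch

prompt = "Ibuprofen is best used for"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=50, do_sample=True)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```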

View File

@ -90,7 +90,7 @@ model_id = "microsoft/bitnet-b1.58-2B-4T"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16
dtype=torch.bfloat16
)
# Apply the chat template
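One possible way to apply the chat template and generate with the `model` and `tokenizer` above; the messages and generation settings are illustrative:
```py
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "How are you?"},
]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)

output_ids = model.generate(input_ids, max_new_tokens=50)
# decode only the newly generated tokens
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))
```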

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
@ -81,9 +78,6 @@ the left.
[[autodoc]] BlenderbotSmallTokenizerFast
<frameworkcontent>
<pt>
## BlenderbotSmallModel
[[autodoc]] BlenderbotSmallModel
@ -98,36 +92,3 @@ the left.
[[autodoc]] BlenderbotSmallForCausalLM
- forward
</pt>
<tf>
## TFBlenderbotSmallModel
[[autodoc]] TFBlenderbotSmallModel
- call
## TFBlenderbotSmallForConditionalGeneration
[[autodoc]] TFBlenderbotSmallForConditionalGeneration
- call
</tf>
<jax>
## FlaxBlenderbotSmallModel
[[autodoc]] FlaxBlenderbotSmallModel
- __call__
- encode
- decode
## FlaxBlenderbotSmallForConditionalGeneration
[[autodoc]] FlaxBlenderbotSmallForConditionalGeneration
- __call__
- encode
- decode
</jax>
</frameworkcontent>

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
@ -95,10 +92,6 @@ An example:
[[autodoc]] BlenderbotTokenizerFast
- build_inputs_with_special_tokens
<frameworkcontent>
<pt>
## BlenderbotModel
See [`~transformers.BartModel`] for arguments to *forward* and *generate*
@ -117,38 +110,3 @@ See [`~transformers.BartForConditionalGeneration`] for arguments to *forward* an
[[autodoc]] BlenderbotForCausalLM
- forward
</pt>
<tf>
## TFBlenderbotModel
[[autodoc]] TFBlenderbotModel
- call
## TFBlenderbotForConditionalGeneration
[[autodoc]] TFBlenderbotForConditionalGeneration
- call
</tf>
<jax>
## FlaxBlenderbotModel
[[autodoc]] FlaxBlenderbotModel
- __call__
- encode
- decode
## FlaxBlenderbotForConditionalGeneration
[[autodoc]] FlaxBlenderbotForConditionalGeneration
- __call__
- encode
- decode
</jax>
</frameworkcontent>

View File

@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
</div>
@ -31,7 +30,7 @@ You can find all the original BLIP checkpoints under the [BLIP](https://huggingf
> [!TIP]
> This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
>
> Click on the BLIP models in the right sidebar for more examples of how to apply BLIP to different vision language tasks.
The example below demonstrates how to perform visual question answering with [`Pipeline`] or the [`AutoModel`] class.
@ -46,7 +45,7 @@ from transformers import pipeline
pipeline = pipeline(
task="visual-question-answering",
model="Salesforce/blip-vqa-base",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
@ -64,8 +63,8 @@ from transformers import AutoProcessor, AutoModelForVisualQuestionAnswering
processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = AutoModelForVisualQuestionAnswering.from_pretrained(
"Salesforce/blip-vqa-base",
torch_dtype=torch.float16,
"Salesforce/blip-vqa-base",
dtype=torch.float16,
device_map="auto"
)
@ -113,9 +112,6 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam
[[autodoc]] BlipImageProcessorFast
- preprocess
<frameworkcontent>
<pt>
## BlipModel
`BlipModel` is going to be deprecated in future versions. Use `BlipForConditionalGeneration`, `BlipForImageTextRetrieval`, or `BlipForQuestionAnswering` depending on your use case.
@ -154,45 +150,3 @@ Refer to this [notebook](https://github.com/huggingface/notebooks/blob/main/exam
[[autodoc]] BlipForQuestionAnswering
- forward
</pt>
<tf>
## TFBlipModel
[[autodoc]] TFBlipModel
- call
- get_text_features
- get_image_features
## TFBlipTextModel
[[autodoc]] TFBlipTextModel
- call
## TFBlipTextLMHeadModel
[[autodoc]] TFBlipTextLMHeadModel
- forward
## TFBlipVisionModel
[[autodoc]] TFBlipVisionModel
- call
## TFBlipForConditionalGeneration
[[autodoc]] TFBlipForConditionalGeneration
- call
## TFBlipForImageTextRetrieval
[[autodoc]] TFBlipForImageTextRetrieval
- call
## TFBlipForQuestionAnswering
[[autodoc]] TFBlipForQuestionAnswering
- call
</tf>
</frameworkcontent>

View File

@ -19,8 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
## Overview
@ -68,10 +66,6 @@ See also:
[[autodoc]] BloomTokenizerFast
- all
<frameworkcontent>
<pt>
## BloomModel
[[autodoc]] BloomModel
@ -96,21 +90,3 @@ See also:
[[autodoc]] BloomForQuestionAnswering
- forward
</pt>
<jax>
## FlaxBloomModel
[[autodoc]] FlaxBloomModel
- __call__
## FlaxBloomForCausalLM
[[autodoc]] FlaxBloomForCausalLM
- __call__
</jax>
</frameworkcontent>

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
<Tip warning={true}>
@ -61,5 +58,3 @@ This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The
- BORT requires a specific fine-tuning algorithm, called [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology),
that is sadly not open-sourced yet. It would be very useful for the community if someone implemented the
algorithm to make BORT fine-tuning work.

View File

@ -17,8 +17,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=flax&logoColor=white">
</div>
</div>
@ -43,7 +41,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text2text-generation",
model="google/byt5-small",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("translate English to French: The weather is nice today")
@ -61,7 +59,7 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForSeq2SeqLM.from_pretrained(
"google/byt5-small",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto"
)
@ -83,7 +81,7 @@ echo -e "translate English to French: Life is beautiful." | transformers-cli run
## Quantization
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [torchao](../quantization/torchao) to only quantize the weights to int4.
@ -96,7 +94,7 @@ quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
model = AutoModelForSeq2SeqLM.from_pretrained(
"google/byt5-xl",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=quantization_config
)
@ -116,11 +114,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```python
import torch
from transformers import AutoModelForSeq2SeqLM
model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-small")
num_special_tokens = 3
input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + num_special_tokens
labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + num_special_tokens
loss = model(input_ids, labels=labels).loss
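# Illustrative sketch (assumes the model, input_ids, and num_special_tokens above):
# generate and map the predicted ids back to UTF-8 bytes by undoing the special-token offset.
generated = model.generate(input_ids, max_new_tokens=40)
byte_values = [
    t - num_special_tokens
    for t in generated[0].tolist()
    if num_special_tokens <= t < 256 + num_special_tokens  # keep only real byte ids
]
print(bytes(byte_values).decode("utf-8", errors="ignore"))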

View File

@ -18,8 +18,7 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
@ -48,10 +47,10 @@ The examples below demonstrate how to predict the `<mask>` token with [`Pipeline
import torch
from transformers import pipeline
pipeline = pipeline("fill-mask", model="camembert-base", torch_dtype=torch.float16, device=0)
pipeline = pipeline("fill-mask", model="camembert-base", dtype=torch.float16, device=0)
pipeline("Le camembert est un délicieux fromage <mask>.")
```
</hfoption>
<hfoption id="AutoModel">
@ -60,8 +59,8 @@ import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
model = AutoModelForMaskedLM.from_pretrained("camembert-base", dtype="auto", device_map="auto", attn_implementation="sdpa")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model(**inputs)
@ -73,7 +72,7 @@ predicted_token = tokenizer.decode(predicted_token_id)
print(f"The predicted token is: {predicted_token}")
```
</hfoption>
<hfoption id="transformers CLI">
@ -81,15 +80,15 @@ print(f"The predicted token is: {predicted_token}")
echo -e "Le camembert est un délicieux fromage <mask>." | transformers run --task fill-mask --model camembert-base --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options.
The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits.
```python
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
import torch
@ -102,7 +101,7 @@ model = AutoModelForMaskedLM.from_pretrained(
)
tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model(**inputs)
@ -131,9 +130,6 @@ print(f"The predicted token is: {predicted_token}")
[[autodoc]] CamembertTokenizerFast
<frameworkcontent>
<pt>
## CamembertModel
[[autodoc]] CamembertModel
@ -161,37 +157,3 @@ print(f"The predicted token is: {predicted_token}")
## CamembertForQuestionAnswering
[[autodoc]] CamembertForQuestionAnswering
</pt>
<tf>
## TFCamembertModel
[[autodoc]] TFCamembertModel
## TFCamembertForCausalLM
[[autodoc]] TFCamembertForCausalLM
## TFCamembertForMaskedLM
[[autodoc]] TFCamembertForMaskedLM
## TFCamembertForSequenceClassification
[[autodoc]] TFCamembertForSequenceClassification
## TFCamembertForMultipleChoice
[[autodoc]] TFCamembertForMultipleChoice
## TFCamembertForTokenClassification
[[autodoc]] TFCamembertForTokenClassification
## TFCamembertForQuestionAnswering
[[autodoc]] TFCamembertForQuestionAnswering
</tf>
</frameworkcontent>

View File

@ -78,7 +78,7 @@ from PIL import Image
import requests
processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="auto")
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16, device_map="auto")
# prepare image and text prompt
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
@ -104,7 +104,7 @@ import requests
processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="auto")
model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16, device_map="auto")
# Get three different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
@ -170,7 +170,7 @@ from transformers import ChameleonForConditionalGeneration
model_id = "facebook/chameleon-7b"
model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
attn_implementation="flash_attention_2"
).to(0)
```
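A rough usage sketch for the FlashAttention-2 model loaded above; the prompt, image URL, and `<image>` placeholder are illustrative and mirror the earlier Chameleon snippets:
```py
import requests
import torch
from PIL import Image
from transformers import ChameleonProcessor

processor = ChameleonProcessor.from_pretrained(model_id)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompt = "What do you see in this image?<image>"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
output = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(output[0], skip_special_tokens=True))
```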

View File

@ -43,7 +43,7 @@ The example below demonstrates how to extract text embeddings with the [`AutoMod
import torch
from transformers import AutoTokenizer, AutoModel
model = AutoModel.from_pretrained("laion/clap-htsat-unfused", torch_dtype=torch.float16, device_map="auto")
model = AutoModel.from_pretrained("laion/clap-htsat-unfused", dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")
texts = ["the sound of a cat", "the sound of a dog", "music playing"]

View File

@ -18,9 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
@ -47,7 +44,7 @@ from transformers import pipeline
clip = pipeline(
task="zero-shot-image-classification",
model="openai/clip-vit-base-patch32",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device=0
)
labels = ["a photo of a cat", "a photo of a dog", "a photo of a car"]
@ -63,7 +60,7 @@ import torch
from PIL import Image
from transformers import AutoProcessor, AutoModel
model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.bfloat16, attn_implementation="sdpa")
model = AutoModel.from_pretrained("openai/clip-vit-base-patch32", dtype=torch.bfloat16, attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
@ -130,9 +127,6 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_
[[autodoc]] CLIPProcessor
<frameworkcontent>
<pt>
## CLIPModel
[[autodoc]] CLIPModel
@ -164,51 +158,3 @@ print(f"Most likely label: {most_likely_label} with probability: {probs[0][most_
[[autodoc]] CLIPForImageClassification
- forward
</pt>
<tf>
## TFCLIPModel
[[autodoc]] TFCLIPModel
- call
- get_text_features
- get_image_features
## TFCLIPTextModel
[[autodoc]] TFCLIPTextModel
- call
## TFCLIPVisionModel
[[autodoc]] TFCLIPVisionModel
- call
</tf>
<jax>
## FlaxCLIPModel
[[autodoc]] FlaxCLIPModel
- __call__
- get_text_features
- get_image_features
## FlaxCLIPTextModel
[[autodoc]] FlaxCLIPTextModel
- __call__
## FlaxCLIPTextModelWithProjection
[[autodoc]] FlaxCLIPTextModelWithProjection
- __call__
## FlaxCLIPVisionModel
[[autodoc]] FlaxCLIPVisionModel
- __call__
</jax>
</frameworkcontent>

View File

@ -18,8 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
</div>
@ -44,7 +42,7 @@ from transformers import pipeline
pipe = pipeline(
"text-generation",
model="meta-llama/CodeLlama-7b-hf",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map=0
)
@ -67,7 +65,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/CodeLlama-7b-hf",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
attn_implementation="sdpa"
)
@ -115,7 +113,7 @@ bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.
tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-34b-hf")
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/CodeLlama-34b-hf",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto",
quantization_config=bnb_config
)
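An illustrative continuation with the quantized `meta-llama/CodeLlama-34b-hf` model and tokenizer from above; the prompt and generation settings are examples only:
```py
import torch

prompt = "def fibonacci(n):\n    "
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```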

View File

@ -45,7 +45,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="CohereForAI/c4ai-command-r-v01",
torch_dtype=torch.float16,
dtype=torch.float16,
device=0
)
pipeline("Plants create energy through a process known as")
@ -59,7 +59,7 @@ import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
# format message with the Command-R chat template
messages = [{"role": "user", "content": "How do plants make energy?"}]
@ -79,7 +79,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```bash
# pip install -U flash-attn --no-build-isolation
transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
transformers chat CohereForAI/c4ai-command-r-v01 --dtype auto --attn_implementation flash_attention_2
```
</hfoption>
@ -95,7 +95,7 @@ from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", torch_dtype=torch.float16, device_map="auto", quantization_config=bnb_config, attn_implementation="sdpa")
model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", dtype=torch.float16, device_map="auto", quantization_config=bnb_config, attn_implementation="sdpa")
# format message with the Command-R chat template
messages = [{"role": "user", "content": "How do plants make energy?"}]
@ -125,7 +125,7 @@ visualizer("Plants create energy through a process known as")
## Notes
- Don't use the torch_dtype parameter in [`~AutoModel.from_pretrained`] if you're using FlashAttention-2 because it only supports fp16 or bf16. You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set fp16 or bf16 to True if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast).
- Don't use the dtype parameter in [`~AutoModel.from_pretrained`] if you're using FlashAttention-2 because it only supports fp16 or bf16. You should use [Automatic Mixed Precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html), set fp16 or bf16 to True if using [`Trainer`], or use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast).
## CohereConfig

View File

@ -47,7 +47,7 @@ from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="CohereLabs/c4ai-command-r7b-12-2024",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map=0
)
@ -67,7 +67,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
model = AutoModelForCausalLM.from_pretrained(
"CohereLabs/c4ai-command-r7b-12-2024",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
attn_implementation="sdpa"
)
@ -90,7 +90,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```bash
# pip install -U flash-attn --no-build-isolation
transformers-cli chat CohereLabs/c4ai-command-r7b-12-2024 --torch_dtype auto --attn_implementation flash_attention_2
transformers-cli chat CohereLabs/c4ai-command-r7b-12-2024 --dtype auto --attn_implementation flash_attention_2
```
</hfoption>
@ -108,7 +108,7 @@ bnb_config = BitsAndBytesConfig(load_in_4bit=True)
tokenizer = AutoTokenizer.from_pretrained("CohereLabs/c4ai-command-r7b-12-2024")
model = AutoModelForCausalLM.from_pretrained(
"CohereLabs/c4ai-command-r7b-12-2024",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto",
quantization_config=bnb_config,
attn_implementation="sdpa"

View File

@ -48,7 +48,7 @@ model_id = "CohereLabs/command-a-vision-07-2025"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(
model_id, device_map="auto", torch_dtype=torch.float16
model_id, device_map="auto", dtype=torch.float16
)
# Format message with the Command-A-Vision chat template
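A sketch of one way the chat template might be applied, assuming the `processor` and `model` above; the image URL, question, and message layout are illustrative:
```py
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"type": "text", "text": "What is shown in this image?"},
        ],
    },
]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)

gen_tokens = model.generate(**inputs, max_new_tokens=100)
# decode only the newly generated tokens
print(processor.tokenizer.decode(gen_tokens[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```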

View File

@ -46,7 +46,7 @@ model_name = "vidore/colpali-v1.3-hf"
model = ColPaliForRetrieval.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
device_map="auto", # "cpu", "cuda", "xpu", or "mps" for Apple Silicon
)
processor = ColPaliProcessor.from_pretrained(model_name)
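A rough retrieval sketch using the `model` and `processor` above; the placeholder image and query are illustrative:
```py
import torch
from PIL import Image

images = [Image.new("RGB", (128, 128), color="white")]  # placeholder image
queries = ["What is machine learning?"]

# embed the documents (as images) and the text queries
batch_images = processor(images=images).to(model.device)
batch_queries = processor(text=queries).to(model.device)

with torch.no_grad():
    image_embeddings = model(**batch_images).embeddings
    query_embeddings = model(**batch_queries).embeddings

# late-interaction similarity scores: one row per query, one column per image
scores = processor.score_retrieval(query_embeddings, image_embeddings)
print(scores)
```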

View File

@ -49,8 +49,8 @@ model_name = "vidore/colqwen2-v1.0-hf"
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
torch_dtype=torch.bfloat16,
device_map="auto", # "cpu", "cuda", or "mps" for Apple Silicon
dtype=torch.bfloat16,
device_map="auto", # "cpu", "cuda", "xpu" or "mps" for Apple Silicon
attn_implementation="flash_attention_2" if is_flash_attn_2_available() else "sdpa",
)
processor = ColQwen2Processor.from_pretrained(model_name)
@ -107,10 +107,10 @@ import requests
import torch
from PIL import Image
from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor
from transformers import BitsAndBytesConfig, ColQwen2ForRetrieval, ColQwen2Processor, infer_device
model_name = "vidore/colqwen2-v1.0-hf"
device = infer_device()
# 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
@ -123,7 +123,7 @@ bnb_config = BitsAndBytesConfig(
model = ColQwen2ForRetrieval.from_pretrained(
model_name,
quantization_config=bnb_config,
device_map="cuda",
device_map=device,
).eval()
processor = ColQwen2Processor.from_pretrained(model_name)
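The quantized model can be queried in the same way; a brief, illustrative sketch assuming the `model` and `processor` created just above:
```py
import torch
from PIL import Image

queries = ["Who printed the edition of Romeo and Juliet?"]
images = [Image.new("RGB", (128, 128), color="white")]  # placeholder image

batch_queries = processor(text=queries).to(model.device)
batch_images = processor(images=images).to(model.device)

with torch.no_grad():
    query_embeddings = model(**batch_queries).embeddings
    image_embeddings = model(**batch_images).embeddings

print(processor.score_retrieval(query_embeddings, image_embeddings))
```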

View File

@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
## Overview
@ -72,9 +71,6 @@ ConvBERT training tips are similar to those of BERT. For usage tips refer to [BE
[[autodoc]] ConvBertTokenizerFast
<frameworkcontent>
<pt>
## ConvBertModel
[[autodoc]] ConvBertModel
@ -104,39 +100,3 @@ ConvBERT training tips are similar to those of BERT. For usage tips refer to [BE
[[autodoc]] ConvBertForQuestionAnswering
- forward
</pt>
<tf>
## TFConvBertModel
[[autodoc]] TFConvBertModel
- call
## TFConvBertForMaskedLM
[[autodoc]] TFConvBertForMaskedLM
- call
## TFConvBertForSequenceClassification
[[autodoc]] TFConvBertForSequenceClassification
- call
## TFConvBertForMultipleChoice
[[autodoc]] TFConvBertForMultipleChoice
- call
## TFConvBertForTokenClassification
[[autodoc]] TFConvBertForTokenClassification
- call
## TFConvBertForQuestionAnswering
[[autodoc]] TFConvBertForQuestionAnswering
- call
</tf>
</frameworkcontent>

View File

@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
## Overview
@ -43,8 +42,7 @@ alt="drawing" width="600"/>
<small> ConvNeXT architecture. Taken from the <a href="https://huggingface.co/papers/2201.03545">original paper</a>.</small>
This model was contributed by [nielsr](https://huggingface.co/nielsr). TensorFlow version of the model was contributed by [ariG23498](https://github.com/ariG23498),
[gante](https://github.com/gante), and [sayakpaul](https://github.com/sayakpaul) (equal contribution). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/facebookresearch/ConvNeXt).
## Resources
@ -75,9 +73,6 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] ConvNextImageProcessorFast
- preprocess
<frameworkcontent>
<pt>
## ConvNextModel
[[autodoc]] ConvNextModel
@ -87,19 +82,3 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] ConvNextForImageClassification
- forward
</pt>
<tf>
## TFConvNextModel
[[autodoc]] TFConvNextModel
- call
## TFConvNextForImageClassification
[[autodoc]] TFConvNextForImageClassification
- call
</tf>
</frameworkcontent>

View File

@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
## Overview
@ -61,14 +60,3 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] ConvNextV2ForImageClassification
- forward
## TFConvNextV2Model
[[autodoc]] TFConvNextV2Model
- call
## TFConvNextV2ForImageClassification
[[autodoc]] TFConvNextV2ForImageClassification
- call

View File

@ -19,9 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
</div>
## Overview
@@ -48,8 +45,8 @@ here: https://github.com/TsinghuaAI/CPM-Generate
<Tip>
CPM's architecture is the same as GPT-2, except for tokenization method. Refer to [GPT-2 documentation](gpt2) for
API reference information.
</Tip>
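Since CPM shares GPT-2's architecture and only swaps the tokenizer, a minimal loading sketch might look like the following. This assumes the `TsinghuaAI/CPM-Generate` checkpoint and that `jieba` is installed for `CpmTokenizer`; it is an illustration, not the documented API reference.

```py
# Hedged sketch: CPM reuses the GPT-2 model classes; only the tokenizer differs.
from transformers import CpmTokenizer, GPT2LMHeadModel

tokenizer = CpmTokenizer.from_pretrained("TsinghuaAI/CPM-Generate")  # assumed checkpoint id
model = GPT2LMHeadModel.from_pretrained("TsinghuaAI/CPM-Generate")

inputs = tokenizer("清华大学", return_tensors="pt")
outputs = model.generate(inputs["input_ids"], max_new_tokens=20)
print(tokenizer.decode(outputs[0]))
```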


@@ -59,7 +59,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)
# infer the model
audio = model.generate(**inputs, output_audio=True)
@@ -104,7 +104,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)
# infer the model
audio = model.generate(**inputs, output_audio=True)
@@ -161,7 +161,7 @@ inputs = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)
audio = model.generate(**inputs, output_audio=True)
processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
@@ -251,7 +251,7 @@ padded_inputs_1 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)
print("\n" + "="*50)
print("First generation - compiling and recording CUDA graphs...")
@@ -292,7 +292,7 @@ padded_inputs_2 = processor.apply_chat_template(
conversation,
tokenize=True,
return_dict=True,
).to(device)
).to(model.device)
print("\n" + "="*50)
print("Generation with other inputs!")
@@ -337,7 +337,7 @@ inputs = processor.apply_chat_template(
tokenize=True,
return_dict=True,
output_labels=True,
).to(device)
).to(model.device)
out = model(**inputs)
out.loss.backward()
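The `.to(model.device)` pattern used throughout these hunks avoids hard-coding a device string: when the model is placed by `device_map="auto"` (or moved manually), the processed inputs follow it wherever it ended up. A minimal, model-agnostic sketch of the same idea, using `gpt2` purely as a placeholder checkpoint and assuming `accelerate` is installed for `device_map="auto"`:

```py
# Hedged sketch: send inputs to whatever device the model landed on,
# rather than assuming "cuda" or a fixed device index.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto")  # placeholder model
tokenizer = AutoTokenizer.from_pretrained("gpt2")

inputs = tokenizer("Hello there", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=10)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```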


@@ -19,7 +19,6 @@ rendered properly in your Markdown viewer.
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
## Overview
@@ -52,7 +51,7 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis
token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as it can be
observed in the *run_generation.py* example script.
- The PyTorch models can take the `past_key_values` as input, which is the previously computed key/value attention pairs.
TensorFlow models accept `past` as input. Using the `past_key_values` value prevents the model from re-computing
Using the `past_key_values` value prevents the model from re-computing
pre-computed values in the context of text generation. See the [`forward`](model_doc/ctrl#transformers.CTRLModel.forward)
method for more information on the usage of this argument.
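A minimal sketch of the caching behavior described above, assuming the `Salesforce/ctrl` checkpoint; the prompt and the "Opinion" control code are illustrative only:

```py
# Hedged sketch: reuse past_key_values so the second call only processes the new token.
import torch
from transformers import CTRLLMHeadModel, CTRLTokenizer

tokenizer = CTRLTokenizer.from_pretrained("Salesforce/ctrl")
model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, use_cache=True)

next_token = out.logits[:, -1:].argmax(-1)
with torch.no_grad():
    # Only the newly generated token is passed; earlier positions come from the cache.
    out = model(input_ids=next_token, past_key_values=out.past_key_values, use_cache=True)
```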
@@ -71,9 +70,6 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis
[[autodoc]] CTRLTokenizer
- save_vocabulary
<frameworkcontent>
<pt>
## CTRLModel
[[autodoc]] CTRLModel
@@ -88,24 +84,3 @@ This model was contributed by [keskarnitishr](https://huggingface.co/keskarnitis
[[autodoc]] CTRLForSequenceClassification
- forward
</pt>
<tf>
## TFCTRLModel
[[autodoc]] TFCTRLModel
- call
## TFCTRLLMHeadModel
[[autodoc]] TFCTRLLMHeadModel
- call
## TFCTRLForSequenceClassification
[[autodoc]] TFCTRLForSequenceClassification
- call
</tf>
</frameworkcontent>


@@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
</div>
@@ -30,7 +29,7 @@ You can find all the CvT checkpoints under the [Microsoft](https://huggingface.c
> [!TIP]
> This model was contributed by [anujunj](https://huggingface.co/anugunj).
>
> Click on the CvT models in the right sidebar for more examples of how to apply CvT to different computer vision tasks.
The example below demonstrates how to classify an image with [`Pipeline`] or the [`AutoModel`] class.
@@ -45,8 +44,8 @@ from transformers import pipeline
pipeline = pipeline(
task="image-classification",
model="microsoft/cvt-13",
torch_dtype=torch.float16,
device=0
dtype=torch.float16,
device=0
)
pipeline("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg")
```
@@ -63,7 +62,7 @@ from transformers import AutoModelForImageClassification, AutoImageProcessor
image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
model = AutoModelForImageClassification.from_pretrained(
"microsoft/cvt-13",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto"
)
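To round out the `AutoModel` example above, a short sketch of the remaining preprocessing and prediction step. It reuses the same image URL as the pipeline snippet; the explicit float16 cast on the inputs is an assumption made to match the model dtype.

```py
# Hedged sketch: run the CvT classifier loaded above on a single image.
import requests
import torch
from PIL import Image

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(image, return_tensors="pt").to(model.device, dtype=torch.float16)
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```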
@@ -91,9 +90,6 @@ Refer to this set of ViT [notebooks](https://github.com/NielsRogge/Transformers-
[[autodoc]] CvtConfig
<frameworkcontent>
<pt>
## CvtModel
[[autodoc]] CvtModel
@@ -103,19 +99,3 @@ Refer to this set of ViT [notebooks](https://github.com/NielsRogge/Transformers-
[[autodoc]] CvtForImageClassification
- forward
</pt>
<tf>
## TFCvtModel
[[autodoc]] TFCvtModel
- call
## TFCvtForImageClassification
[[autodoc]] TFCvtForImageClassification
- call
</tf>
</frameworkcontent>


@@ -43,7 +43,6 @@ natural language understanding demonstrate a new state of the art or competitive
Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec.*
This model was contributed by [edugp](https://huggingface.co/edugp) and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
[sayakpaul](https://github.com/sayakpaul) and [Rocketknight1](https://github.com/Rocketknight1) contributed Data2Vec for vision in TensorFlow.
The original code (for NLP and Speech) can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).
The original code for vision can be found [here](https://github.com/facebookresearch/data2vec_vision/tree/main/beit).
@@ -54,24 +53,24 @@ The original code for vision can be found [here](https://github.com/facebookrese
- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")` (see the sketch below).
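A small sketch of that tip, using `Data2VecVisionModel` and the `facebook/data2vec-vision-base` checkpoint as assumptions:

```py
# Hedged sketch: load with eager attention so that a head_mask is actually applied.
import torch
from transformers import Data2VecVisionModel

model = Data2VecVisionModel.from_pretrained(
    "facebook/data2vec-vision-base", attn_implementation="eager"
)

# head_mask has shape (num_layers, num_heads): 1.0 keeps a head, 0.0 masks it out.
head_mask = torch.ones(model.config.num_hidden_layers, model.config.num_attention_heads)
head_mask[0, 0] = 0.0  # e.g. silence the first head of the first layer
# Pass head_mask=head_mask to the model's forward call along with pixel_values.
```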
### Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
The SDPA implementation is currently available for the Data2VecAudio and Data2VecVision models.
```py
import torch
from transformers import Data2VecVisionForImageClassification
model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", torch_dtype=torch.float16)
model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", dtype=torch.float16)
...
```
@@ -103,7 +102,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
<PipelineTag pipeline="image-classification"/>
- [`Data2VecVisionForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- To fine-tune [`TFData2VecVisionForImageClassification`] on a custom dataset, see [this notebook](https://colab.research.google.com/github/sayakpaul/TF-2.0-Hacks/blob/master/data2vec_vision_image_classification.ipynb).
**Data2VecText documentation resources**
- [Text classification task guide](../tasks/sequence_classification)
@@ -135,9 +133,6 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] Data2VecVisionConfig
<frameworkcontent>
<pt>
## Data2VecAudioModel
[[autodoc]] Data2VecAudioModel
@@ -212,24 +207,3 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] Data2VecVisionForSemanticSegmentation
- forward
</pt>
<tf>
## TFData2VecVisionModel
[[autodoc]] TFData2VecVisionModel
- call
## TFData2VecVisionForImageClassification
[[autodoc]] TFData2VecVisionForImageClassification
- call
## TFData2VecVisionForSemanticSegmentation
[[autodoc]] TFData2VecVisionForSemanticSegmentation
- call
</tf>
</frameworkcontent>


@@ -52,7 +52,7 @@ tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOU
model = DbrxForCausalLM.from_pretrained(
"databricks/dbrx-instruct",
device_map="auto",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
token="YOUR_HF_TOKEN",
)
@@ -73,7 +73,7 @@ tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOU
model = DbrxForCausalLM.from_pretrained(
"databricks/dbrx-instruct",
device_map="auto",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
token="YOUR_HF_TOKEN",
attn_implementation="flash_attention_2",
)
@@ -95,7 +95,7 @@ tokenizer = AutoTokenizer.from_pretrained("databricks/dbrx-instruct", token="YOU
model = DbrxForCausalLM.from_pretrained(
"databricks/dbrx-instruct",
device_map="auto",
torch_dtype=torch.bfloat16,
dtype=torch.bfloat16,
token="YOUR_HF_TOKEN",
attn_implementation="sdpa",
)
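With any of the loading variants above, generation follows the usual causal-LM pattern. A brief sketch, reusing the `tokenizer` and `model` from the snippets above; the prompt and generation settings are illustrative only:

```py
# Hedged sketch: chat-style generation with the DBRX model and tokenizer loaded above.
messages = [{"role": "user", "content": "What does it take to build a great LLM?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```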


@@ -18,7 +18,6 @@ rendered properly in your Markdown viewer.
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
</div>
@@ -48,7 +47,7 @@ pipeline = pipeline(
task="text-classification",
model="microsoft/deberta-v2-xlarge-mnli",
device=0,
torch_dtype=torch.float16
dtype=torch.float16
)
result = pipeline("DeBERTa-v2 is great at understanding context!")
print(result)
@@ -66,11 +65,11 @@ tokenizer = AutoTokenizer.from_pretrained(
)
model = AutoModelForSequenceClassification.from_pretrained(
"microsoft/deberta-v2-xlarge-mnli",
torch_dtype=torch.float16,
dtype=torch.float16,
device_map="auto"
)
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)
logits = outputs.logits
@@ -108,10 +107,10 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSequenceClassification.from_pretrained(
model_id,
quantization_config=quantization_config,
torch_dtype="float16"
dtype="float16"
)
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to("cuda")
inputs = tokenizer("DeBERTa-v2 is great at understanding context!", return_tensors="pt").to(model.device)
outputs = model(**inputs)
logits = outputs.logits
predicted_class_id = logits.argmax().item()
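The quantized snippet above relies on a `quantization_config` and `model_id` defined earlier on the page, outside this hunk. One plausible 4-bit setup, offered only as an assumption and not necessarily the configuration the doc itself uses:

```py
# Hedged sketch: a typical 4-bit bitsandbytes configuration for the quantized example above.
import torch
from transformers import BitsAndBytesConfig

model_id = "microsoft/deberta-v2-xlarge-mnli"  # assumed; matches the earlier examples
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)
```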
@@ -139,9 +138,6 @@ print(f"Predicted label: {predicted_label}")
- build_inputs_with_special_tokens
- create_token_type_ids_from_sequences
<frameworkcontent>
<pt>
## DebertaV2Model
[[autodoc]] DebertaV2Model
@@ -176,44 +172,3 @@ print(f"Predicted label: {predicted_label}")
[[autodoc]] DebertaV2ForMultipleChoice
- forward
</pt>
<tf>
## TFDebertaV2Model
[[autodoc]] TFDebertaV2Model
- call
## TFDebertaV2PreTrainedModel
[[autodoc]] TFDebertaV2PreTrainedModel
- call
## TFDebertaV2ForMaskedLM
[[autodoc]] TFDebertaV2ForMaskedLM
- call
## TFDebertaV2ForSequenceClassification
[[autodoc]] TFDebertaV2ForSequenceClassification
- call
## TFDebertaV2ForTokenClassification
[[autodoc]] TFDebertaV2ForTokenClassification
- call
## TFDebertaV2ForQuestionAnswering
[[autodoc]] TFDebertaV2ForQuestionAnswering
- call
## TFDebertaV2ForMultipleChoice
[[autodoc]] TFDebertaV2ForMultipleChoice
- call
</tf>
</frameworkcontent>
