comment

add a callback hook right before the optimizer step (#33444 )
Return image hidden states (#33426 )
2025-10-21 17:48:57 +08:00 · 2024-09-13 14:59:25 +02:00 · 2024-09-13 10:43:45 +02:00 · 2024-09-13 10:20:03 +02:00 · 2024-09-12 10:16:12 -07:00 · 2024-09-12 18:48:36 +02:00
263 changed files with 6394 additions and 3540 deletions
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -37,17 +37,17 @@ body:
        Models:

          - text models: @ArthurZucker
-          - vision models: @amyeroberts
-          - speech models: @sanchit-gandhi
+          - vision models: @amyeroberts, @qubvel
+          - speech models: @ylacombe, @eustlb
          - graph models: @clefourrier

        Library:

          - flax: @sanchit-gandhi
          - generate: @zucchini-nlp (visual-language models) or @gante (all others)
-          - pipelines: @Narsil
+          - pipelines: @Rocketknight1
          - tensorflow: @gante and @Rocketknight1
-          - tokenizers: @ArthurZucker
+          - tokenizers: @ArthurZucker and @itazap
          - trainer: @muellerzr @SunMarc

        Integrations:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -40,18 +40,19 @@ members/contributors who may be interested in your PR.
 Models:

 - text models: @ArthurZucker
- vision models: @amyeroberts
- speech models: @sanchit-gandhi
+- vision models: @amyeroberts, @qubvel
+- speech models: @ylacombe, @eustlb
 - graph models: @clefourrier

 Library:

 - flax: @sanchit-gandhi
 - generate: @zucchini-nlp (visual-language models) or @gante (all others)
- pipelines: @Narsil
+- pipelines: @Rocketknight1
 - tensorflow: @gante and @Rocketknight1
 - tokenizers: @ArthurZucker
 - trainer: @muellerzr and @SunMarc
+- chat templates: @Rocketknight1

 Integrations:

--- a/.github/workflows/model_jobs_amd.yml
+++ b/.github/workflows/model_jobs_amd.yml
@ -0,0 +1,129 @@
+name: model jobs
+
+on:
+  workflow_call:
+    inputs:
+      folder_slices:
+        required: true
+        type: string
+      machine_type:
+        required: true
+        type: string
+      slice_id:
+        required: true
+        type: number
+      runner:
+        required: true
+        type: string
+      docker:
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
+  # This token is created under the bot `hf-transformers-bot`.
+  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+  TF_FORCE_GPU_ALLOW_GROWTH: true
+  RUN_PT_TF_CROSS_TESTS: 1
+  CUDA_VISIBLE_DEVICES: 0,1
+
+jobs:
+  run_models_gpu:
+    name: " "
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }}
+    runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
+    container:
+      image: ${{ inputs.docker }}
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Echo input and matrix info
+        shell: bash
+        run: |
+          echo "${{ inputs.folder_slices }}"
+          echo "${{ matrix.folders }}"
+          echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}"
+
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install -U datasets
+
+      - name: Update / Install some packages (for Past CI)
+        if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }}
+        working-directory: /transformers
+        run: |
+          python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}  -m "not not_device_test"
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+
+      - name: Run test
+        shell: bash
+        run: |
+          mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
+          echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
+
+      - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
--- a/.github/workflows/remind_slow_ci.yml
+++ b/.github/workflows/remind_slow_ci.yml
@ -0,0 +1,19 @@
+name: Build PR Documentation
+
+on:
+  pull_request_target:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  remind:
+    name: remind
+    runs-on: ubuntu-22.04
+    env:
+      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+    steps:
+    - name: Install requirements
+      run: |
+        echo "Bonjour"
--- a/.github/workflows/self-scheduled-amd-mi210-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@ -10,11 +10,46 @@ on:
      - run_amd_scheduled_ci_caller*

 jobs:
-  run_amd_ci:
-    name: AMD mi210
-    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
+  model-ci:
+    name: Model CI
    uses: ./.github/workflows/self-scheduled-amd.yml
    with:
-      gpu_flavor: mi210
+      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi210
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi210
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi210
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi250-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@ -10,11 +10,46 @@ on:
      - run_amd_scheduled_ci_caller*

 jobs:
-  run_amd_ci:
-    name: AMD mi250
-    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
+  model-ci:
+    name: Model CI
    uses: ./.github/workflows/self-scheduled-amd.yml
    with:
-      gpu_flavor: mi250
+      job: run_models_gpu
      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  torch-pipeline:
+    name: Torch pipeline CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_pipelines_torch_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  example-ci:
+    name: Example CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_examples_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
+    secrets: inherit
+
+  deepspeed-ci:
+    name: DeepSpeed CI
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      job: run_torch_cuda_extensions_gpu
+      slack_report_channel: "#transformers-ci-daily-amd"
+      runner: mi250
+      docker: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      ci_event: Scheduled CI (AMD) - mi250
    secrets: inherit
--- a/.github/workflows/self-scheduled-amd-mi300-caller.yml
+++ b/.github/workflows/self-scheduled-amd-mi300-caller.yml
@ -1,21 +0,0 @@
-name: Self-hosted runner (AMD mi300 scheduled CI caller)
-
-on:
-  workflow_run:
-    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
-    branches: ["main"]
-    types: [completed]
-  push:
-    branches:
-      - run_amd_scheduled_ci_caller*
-
-jobs:
-  run_amd_ci:
-    name: AMD mi300
-    needs: build-docker-containers
-    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci'))))
-    uses: ./.github/workflows/self-scheduled-amd.yml
-    with:
-      gpu_flavor: mi300
-      slack_report_channel: "#transformers-ci-daily-amd"
-    secrets: inherit
--- a/.github/workflows/self-scheduled-amd.yml
+++ b/.github/workflows/self-scheduled-amd.yml
@ -3,10 +3,23 @@ name: Self-hosted runner (scheduled-amd)
 # Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger the
 # CI in order to run it on both MI210 and MI250, without having to use matrix here which pushes
 # us towards the limit of allowed jobs on GitHub Actions.
+
 on:
  workflow_call:
    inputs:
-      gpu_flavor:
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
+      runner:
+        required: true
+        type: string
+      docker:
+        required: true
+        type: string
+      ci_event:
        required: true
        type: string

@ -18,7 +31,7 @@ env:
  RUN_SLOW: yes
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
-
+  NUM_SLICES: 2

 # Important note: each job (run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu) requires all the previous jobs before running.
 # This is done so that we avoid parallelizing the scheduled tests, to leave available
@ -42,7 +55,7 @@ jobs:
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -50,25 +63,29 @@ jobs:
      - name: ROCM-SMI
        run: |
          rocm-smi
+
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
+
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"

  setup:
+    if: contains(fromJSON('["run_models_gpu"]'), inputs.job)
    name: Setup
    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
      image: huggingface/transformers-pytorch-amd-gpu
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    outputs:
-      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      folder_slices: ${{ steps.set-matrix.outputs.folder_slices }}
+      slice_ids: ${{ steps.set-matrix.outputs.slice_ids }}
    steps:
      - name: Update clone
        working-directory: /transformers
@ -90,7 +107,8 @@ jobs:
        name: Identify models to test
        working-directory: /transformers/tests
        run: |
-          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
+          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
+          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT

      - name: ROCM-SMI
        run: |
@ -99,6 +117,7 @@ jobs:
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
+
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
@ -108,99 +127,38 @@ jobs:
        run: |
          python3 utils/print_env.py

-  run_models_gpu_single_gpu:
+  run_models_gpu:
+    if: ${{ inputs.job == 'run_models_gpu' }}
    name: Single GPU tests
+    needs: setup
    strategy:
      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
      fail-fast: false
      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [single-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+        machine_type: [single-gpu, multi-gpu]
+        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
+    uses: ./.github/workflows/model_jobs_amd.yml
+    with:
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      machine_type: ${{ matrix.machine_type }}
+      slice_id: ${{ matrix.slice_id }}
+      runner: ${{ inputs.runner }}
+      docker: ${{ inputs.docker }}
+    secrets: inherit

-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all tests on GPU
-        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
-
-  run_models_gpu_multi_gpu:
-    name: Multi GPU tests
+  run_pipelines_torch_gpu:
+    if: ${{ inputs.job == 'run_pipelines_torch_gpu' }}
+    name: PyTorch pipelines
+    needs: check_runners
    strategy:
-      max-parallel: 1
      fail-fast: false
      matrix:
-        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
-        machine_type: [multi-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
-      image: huggingface/transformers-pytorch-amd-gpu
+      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
    steps:
-      - name: Echo folder ${{ matrix.folders }}
-        shell: bash
-        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
-        # set the artifact folder names (because the character `/` is not allowed).
-        run: |
-          echo "${{ matrix.folders }}"
-          matrix_folders=${{ matrix.folders }}
-          matrix_folders=${matrix_folders/'models/'/'models_'}
-          echo "$matrix_folders"
-          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
-
      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}
@ -212,9 +170,11 @@ jobs:
      - name: ROCM-SMI
        run: |
          rocm-smi
+
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
+
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
@ -228,33 +188,35 @@ jobs:
        working-directory: /transformers
        run: pip freeze

-      - name: Run all tests on GPU
+      - name: Run all pipeline tests on GPU
        working-directory: /transformers
-        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test"
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"

      - name: Failure short reports
        if: ${{ failure() }}
        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
+        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt

-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
+      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
        if: ${{ always() }}
        uses: actions/upload-artifact@v4
        with:
-          name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
+          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports

  run_examples_gpu:
-    name: Examples tests
+    if: ${{ inputs.job == 'run_examples_gpu' }}
+    name: Examples directory
+    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
-      image: huggingface/transformers-pytorch-amd-gpu
+      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
    steps:
      - name: Update clone
        working-directory: /transformers
@ -267,9 +229,11 @@ jobs:
      - name: ROCM-SMI
        run: |
          rocm-smi
+
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
+
      - name: Show ROCR environment
        run: |
          echo "ROCR: $ROCR_VISIBLE_DEVICES"
@ -301,73 +265,17 @@ jobs:
          name: ${{ matrix.machine_type }}_run_examples_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_examples_gpu_test_reports

-  run_pipelines_torch_gpu:
-    name: PyTorch pipelines tests
-    strategy:
-      fail-fast: false
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    container:
-      image: huggingface/transformers-pytorch-amd-gpu
-      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    needs: setup
-    steps:
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ github.sha }}
-
-      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
-        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
-
-      - name: ROCM-SMI
-        run: |
-          rocm-smi
-      - name: ROCM-INFO
-        run: |
-          rocminfo  | grep "Agent" -A 14
-      - name: Show ROCR environment
-        run: |
-          echo "ROCR: $ROCR_VISIBLE_DEVICES"
-
-      - name: Environment
-        working-directory: /transformers
-        run: |
-          python3 utils/print_env.py
-
-      - name: Show installed libraries and their versions
-        working-directory: /transformers
-        run: pip freeze
-
-      - name: Run all pipeline tests on GPU
-        working-directory: /transformers
-        run: |
-          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines -m "not not_device_test"
-
-      - name: Failure short reports
-        if: ${{ failure() }}
-        continue-on-error: true
-        run: cat /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt
-
-      - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports"
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: ${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
-          path: /transformers/reports/${{ matrix.machine_type }}_run_pipelines_torch_gpu_test_reports
-
  run_torch_cuda_extensions_gpu:
+    if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }}
    name: Torch ROCm deepspeed tests
+    needs: check_runners
    strategy:
      fail-fast: false
      matrix:
        machine_type: [single-gpu, multi-gpu]
-
-    runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
-    needs: setup
+    runs-on: ['${{ matrix.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}']
    container:
-      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      image: ${{ inputs.docker }}
      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - name: Update clone
@ -381,6 +289,7 @@ jobs:
      - name: ROCM-SMI
        run: |
          rocm-smi
+
      - name: ROCM-INFO
        run: |
          rocminfo  | grep "Agent" -A 14
@ -414,107 +323,27 @@ jobs:
          name: ${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports
          path: /transformers/reports/${{ matrix.machine_type }}_run_torch_cuda_extensions_gpu_test_reports

-  run_extract_warnings:
-    name: Extract warnings in CI artifacts
-    runs-on: ubuntu-22.04
-    if: always()
+  send_results:
+    name: Slack Report
    needs: [
      check_runner_status,
      check_runners,
      setup,
-      run_models_gpu_single_gpu,
-      run_models_gpu_multi_gpu,
-      run_examples_gpu,
+      run_models_gpu,
      run_pipelines_torch_gpu,
+      run_examples_gpu,
      run_torch_cuda_extensions_gpu
    ]
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v4
-        with:
-          fetch-depth: 2
+    if: ${{ always() }}
+    uses: ./.github/workflows/slack-report.yml
+    with:
+      job: ${{ inputs.job }}
+      # This would be `skipped` if `setup` is skipped.
+      setup_status: ${{ needs.setup.result }}
+      slack_report_channel: ${{ inputs.slack_report_channel }}
+      # This would be an empty string if `setup` is skipped.
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+      quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }}
+      ci_event: ${{ inputs.ci_event }}

-      - name: Install transformers
-        run: pip install transformers
-
-      - name: Show installed libraries and their versions
-        run: pip freeze
-
-      - name: Create output directory
-        run: mkdir warnings_in_ci
-
-      - uses: actions/download-artifact@v4
-        with:
-          path: warnings_in_ci
-
-      - name: Show artifacts
-        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
-        working-directory: warnings_in_ci
-
-      - name: Extract warnings in CI artifacts
-        run: |
-          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
-          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
-
-      - name: Upload artifact
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: warnings_in_ci
-          path: warnings_in_ci/selected_warnings.json
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-22.04
-    if: always()
-    needs: [
-      check_runner_status,
-      check_runners,
-      setup,
-      run_models_gpu_single_gpu,
-      run_models_gpu_multi_gpu,
-      run_examples_gpu,
-      run_pipelines_torch_gpu,
-      run_torch_cuda_extensions_gpu,
-      run_extract_warnings
-    ]
-    steps:
-      - name: Preliminary job status
-        shell: bash
-        # For the meaning of these environment variables, see the job `Setup`
-        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
-          echo "Runner status: ${{ needs.check_runners.result }}"
-          echo "Setup status: ${{ needs.setup.result }}"
-
-      - uses: actions/checkout@v4
-      - uses: actions/download-artifact@v4
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID_DAILY_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_AMD }}
-          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
-          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_AMD }}
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: Scheduled CI (AMD) - ${{ inputs.gpu_flavor }}
-          CI_SHA: ${{ github.sha }}
-          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
-          SETUP_STATUS: ${{ needs.setup.result }}
-        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
-        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
-        run: |
-          sudo apt-get install -y curl
-          pip install huggingface_hub
-          pip install slack_sdk
-          pip show slack_sdk
-          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
-
-      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
-      - name: Failure table artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: test_failure_tables
-          path: test_failure_tables
+    secrets: inherit
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -83,7 +83,7 @@ jobs:
        run: |
          echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT
          echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT
-      
+
      - id: set-matrix-quantization
        if: ${{ inputs.job == 'run_quantization_torch_gpu' }}
        name: Identify quantization method to test
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -1,9 +1,17 @@
 name: SSH into our runners

 on:
-  push:
-    branches:
-      - ssh_new_cluster
+  workflow_dispatch:
+    inputs:
+      runner_type:
+        description: 'Type of runner to test (a10 or t4)'
+        required: true 
+      docker_image:
+        description: 'Name of the Docker image'
+        required: true
+      num_gpus:
+        description: 'Type of the number of gpus to use (`single` or `multi`)'
+        required: true

 env:
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
@ -20,10 +28,9 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on:
-      group: aws-g4dn-2xlarge-cache-test
+    runs-on: ["${{ github.event.inputs.num_gpus }}-gpu", nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
    container:
-      image: huggingface/transformers-all-latest-gpu
+      image: ${{ github.event.inputs.docker_image }}
      options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
@ -46,12 +53,22 @@ jobs:
      - name: NVIDIA-SMI
        run: |
          nvidia-smi
-      
+
+      - name: Store Slack infos
+        #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step
+        shell: bash
+        run: |
+          if [ "${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" != "" ]; then
+            echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', github.actor, 'SLACK_ID')] }}" >> $GITHUB_ENV
+          else
+            echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV
+          fi
+
      - name: Tailscale # In order to be able to SSH when a test fails
        uses: huggingface/tailscale-action@main
        with:
          authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }}
-          slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
+          slackChannel: ${{ env.SLACKCHANNEL }}
          slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
          waitForSSH: true
-          sshTimeout: 30m
+          sshTimeout: 15m
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -9,6 +9,8 @@ jobs:
    name: Close Stale Issues
    if: github.repository == 'huggingface/transformers'
    runs-on: ubuntu-22.04
+    permissions:
+      issues: write
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
    steps:
--- a/2
+++ b/2
@ -53,7 +53,6 @@ quality:
 	@python -c "from transformers import *" || (echo '🚨 import failed, this means you introduced unprotected imports! 🚨'; exit 1)
 	ruff check $(check_dirs) setup.py conftest.py
 	ruff format --check $(check_dirs) setup.py conftest.py
-	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
 	python utils/check_doc_toc.py
 	python utils/check_docstrings.py --check_all
@ -62,7 +61,6 @@ quality:
 # Format source code automatically and check is there are any problems left that need manual fixing

 extra_style_checks:
-	python utils/custom_init_isort.py
 	python utils/sort_auto_mappings.py
 	python utils/check_doc_toc.py --fix_and_overwrite

--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -13,4 +13,4 @@ RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transforme
 RUN git lfs install

 RUN pip uninstall -y transformers
-RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
+RUN apt-get clean && rm -rf /var/lib/apt/lists/* && apt-get autoremove && apt-get autoclean
--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -6,6 +6,6 @@ RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-de
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
-RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu 
-RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
+RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken]"
 RUN pip uninstall -y transformers
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -145,6 +145,8 @@
    title: Troubleshoot
  - local: gguf
    title: Interoperability with GGUF files
+  - local: tiktoken
+    title: Interoperability with TikToken files
  title: Developer guides
 - sections:
  - local: quantization/overview
@ -294,6 +296,8 @@
      title: Trainer
    - local: main_classes/deepspeed
      title: DeepSpeed
+    - local: main_classes/executorch
+      title: ExecuTorch
    - local: main_classes/feature_extractor
      title: Feature Extractor
    - local: main_classes/image_processor
--- a/docs/source/en/accelerate.md
+++ b/docs/source/en/accelerate.md
@ -46,7 +46,7 @@ The next step is to pass all the relevant training objects to the [`~accelerate.

 ## Backward

-The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`]method:
+The last addition is to replace the typical `loss.backward()` in your training loop with 🤗 Accelerate's [`~accelerate.Accelerator.backward`] method:

 ```py
 >>> for epoch in range(num_epochs):
--- a/docs/source/en/autoclass_tutorial.md
+++ b/docs/source/en/autoclass_tutorial.md
@ -110,7 +110,7 @@ Now you can access the `feature_maps` object from the first stage of the backbon

 ## AutoFeatureExtractor

-For audio tasks, a feature extractor processes the audio signal the correct input format.
+For audio tasks, a feature extractor processes the audio signal into the correct input format.

 Load a feature extractor with [`AutoFeatureExtractor.from_pretrained`]:

--- a/docs/source/en/benchmarks.md
+++ b/docs/source/en/benchmarks.md
@ -35,7 +35,7 @@ The classes [`PyTorchBenchmark`] and [`TensorFlowBenchmark`] allow to flexibly b

 <Tip>

-Hereby, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and
+Here, _inference_ is defined by a single forward pass, and _training_ is defined by a single forward pass and
 backward pass.

 </Tip>
@ -368,7 +368,7 @@ This section lists a couple of best practices one should be aware of when benchm
  memory measurement it is recommended to run each memory benchmark in a separate process by making sure
  `no_multi_processing` is set to `True`.
 - One should always state the environment information when sharing the results of a model benchmark. Results can vary
-  heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very
+  heavily between different GPU devices, library versions, etc., as a consequence, benchmark results on their own are not very
  useful for the community.


--- a/docs/source/en/bertology.md
+++ b/docs/source/en/bertology.md
@ -37,5 +37,5 @@ help people access the inner representations, mainly adapted from the great work
 - retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
  in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) while extract information and prune a model pre-trained on
+To help you understand and use these features, we have added a specific example script: [bertology.py](https://github.com/huggingface/transformers/tree/main/examples/research_projects/bertology/run_bertology.py) which extracts information and prune a model pre-trained on
 GLUE.
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@ -196,7 +196,7 @@ Not all models require generation prompts. Some models, like LLaMA, don't have a
 special tokens before bot responses. In these cases, the `add_generation_prompt` argument will have no effect. The exact
 effect that `add_generation_prompt` has will depend on the template being used.

-## What does "continue_last_message" do?
+## What does "continue_final_message" do?

 When passing a list of messages to `apply_chat_template` or `TextGenerationPipeline`, you can choose
 to format the chat so the model will continue the final message in the chat instead of starting a new one. This is done
@ -211,7 +211,7 @@ chat = [
    {"role": "assistant", "content": '{"name": "'},
 ]

-formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_last_message=True)
+formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True)
 model.generate(**formatted_chat)
 ```

@ -219,7 +219,7 @@ The model will generate text that continues the JSON string, rather than startin
 can be very useful for improving the accuracy of the model's instruction-following when you know how you want
 it to start its replies.

-Because `add_generation_prompt` adds the tokens that start a new message, and `continue_last_message` removes any
+Because `add_generation_prompt` adds the tokens that start a new message, and `continue_final_message` removes any
 end-of-message tokens from the final message, it does not make sense to use them together. As a result, you'll
 get an error if you try!

@ -228,7 +228,7 @@ get an error if you try!
 The default behaviour of `TextGenerationPipeline` is to set `add_generation_prompt=True` so that it starts a new
 message. However, if the final message in the input chat has the "assistant" role, it will assume that this message is 
 a prefill and switch to `continue_final_message=True` instead, because most models do not support multiple 
-consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_last_message` 
+consecutive assistant messages. You can override this behaviour by explicitly passing the `continue_final_message` 
 argument when calling the pipeline.

 </Tip>
--- a/docs/source/en/community.md
+++ b/docs/source/en/community.md
@ -63,8 +63,8 @@ This page regroups resources around 🤗 Transformers developed by the community
 | [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) |
 | [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) |
 | [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) |
-| [Speech Emotion Classification with Wav2Vec2](https://github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
+| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
 | [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
 | [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
 | [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) |
-| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | How to use [QLoRA](https://github.com/artidoro/qlora) and [PEFT](https://huggingface.co/docs/peft/en/index) to fine-tune an LLM in a memory-efficient way, while using [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) to manage experiment tracking | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) |
+| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | How to use [QLoRA](https://github.com/artidoro/qlora) and [PEFT](https://huggingface.co/docs/peft/en/index) to fine-tune an LLM in a memory-efficient way, while using [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) to manage experiment tracking | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) |
--- a/docs/source/en/debugging.md
+++ b/docs/source/en/debugging.md
@ -203,7 +203,7 @@ This feature can be used with any `nn.Module`-based model.

 </Tip>

-If you start getting `loss=NaN` or the model inhibits some other abnormal behavior due to `inf` or `nan` in
+If you start getting `loss=NaN` or the model exhibits some other abnormal behavior due to `inf` or `nan` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
 you can accomplish that easily by activating a special module that will do the detection automatically.

--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -456,7 +456,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t
 ['Alice and Bob, a couple of friends of mine, who are both in the same office as']
 ```

-Alternativelly, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
+Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
 to model based assisted decoding. You can read more about it [here](https://twitter.com/joao_gante/status/1747322413006643259).
 ### DoLa Decoding

--- a/docs/source/en/gguf.md
+++ b/docs/source/en/gguf.md
@ -79,6 +79,7 @@ For now the supported model architectures are the architectures that have been v
 - Mistral
 - Qwen2
 - Qwen2Moe
+- Phi3

 ## Example usage

--- a/docs/source/en/installation.md
+++ b/docs/source/en/installation.md
@ -71,7 +71,7 @@ pip install 'transformers[tf-cpu]'

 M1 / ARM Users

-You will need to install the following before installing TensorFLow 2.0
+You will need to install the following before installing TensorFlow 2.0
 ```bash
 brew install cmake
 brew install pkg-config
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@ -50,6 +50,10 @@ We provide two types of agents, based on the main [`Agent`] class:

 [[autodoc]] ReactCodeAgent

+### ManagedAgent
+
+[[autodoc]] ManagedAgent
+
 ## Tools

 ### load_tool
--- a/docs/source/en/main_classes/executorch.md
+++ b/docs/source/en/main_classes/executorch.md
@ -0,0 +1,33 @@
+<!--Copyright (c) Meta Platforms, Inc. and affiliates.
+All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# ExecuTorch
+
+[`ExecuTorch`](https://github.com/pytorch/executorch) is an end-to-end solution for enabling on-device inference capabilities across mobile and edge devices including wearables, embedded devices and microcontrollers. It is part of the PyTorch ecosystem and supports the deployment of PyTorch models with a focus on portability, productivity, and performance.
+
+ExecuTorch introduces well defined entry points to perform model, device, and/or use-case specific optimizations such as backend delegation, user-defined compiler transformations, memory planning, and more. The first step in preparing a PyTorch model for execution on an edge device using ExecuTorch is to export the model. This is achieved through the use of a PyTorch API called [`torch.export`](https://pytorch.org/docs/stable/export.html).
+
+
+## ExecuTorch Integration
+
+An integration point is being developed to ensure that 🤗 Transformers can be exported using `torch.export`. The goal of this integration is not only to enable export but also to ensure that the exported artifact can be further lowered and optimized to run efficiently in `ExecuTorch`, particularly for mobile and edge use cases.
+
+[[autodoc]] integrations.executorch.TorchExportableModuleWithStaticCache
+    - forward
+
+[[autodoc]] integrations.executorch.convert_and_export_with_cache
--- a/docs/source/en/main_classes/logging.md
+++ b/docs/source/en/main_classes/logging.md
@ -30,7 +30,7 @@ transformers.logging.set_verbosity_info()
 ```

 You can also use the environment variable `TRANSFORMERS_VERBOSITY` to override the default verbosity. You can set it
-to one of the following: `debug`, `info`, `warning`, `error`, `critical`. For example:
+to one of the following: `debug`, `info`, `warning`, `error`, `critical`, `fatal`. For example:

 ```bash
 TRANSFORMERS_VERBOSITY=error ./myprogram.py
@ -65,7 +65,7 @@ verbose to the most verbose), those levels (with their corresponding int values
  critical errors.
 - `transformers.logging.ERROR` (int value, 40): only report errors.
 - `transformers.logging.WARNING` or `transformers.logging.WARN` (int value, 30): only reports error and
-  warnings. This the default level used by the library.
+  warnings. This is the default level used by the library.
 - `transformers.logging.INFO` (int value, 20): reports error, warnings and basic information.
 - `transformers.logging.DEBUG` (int value, 10): report all information.

@ -77,10 +77,10 @@ Python has two logging systems that are often used in conjunction: `logging`, wh
 which allows further classification of warnings in specific buckets, e.g., `FutureWarning` for a feature or path
 that has already been deprecated and `DeprecationWarning` to indicate an upcoming deprecation.

-We use both in the `transformers` library. We leverage and adapt `logging`'s `captureWarning` method to allow
+We use both in the `transformers` library. We leverage and adapt `logging`'s `captureWarnings` method to allow
 management of these warning messages by the verbosity setters above.

-What does that mean for developers of the library? We should respect the following heuristic:
+What does that mean for developers of the library? We should respect the following heuristics:
 - `warnings` should be favored for developers of the library and libraries dependent on `transformers`
 - `logging` should be used for end-users of the library using it in every-day projects

--- a/docs/source/en/main_classes/optimizer_schedules.md
+++ b/docs/source/en/main_classes/optimizer_schedules.md
@ -38,7 +38,7 @@ The `.optimization` module provides:

 ## Schedules

-### Learning Rate Schedules (Pytorch)
+### Learning Rate Schedules (PyTorch)

 [[autodoc]] SchedulerType

--- a/docs/source/en/main_classes/output.md
+++ b/docs/source/en/main_classes/output.md
@ -42,7 +42,7 @@ an optional `attentions` attribute. Here we have the `loss` since we passed alon

 <Tip>

-When passing `output_hidden_states=True` you may expect the `outputs.hidden_states[-1]` to match `outputs.last_hidden_states` exactly.
+When passing `output_hidden_states=True` you may expect the `outputs.hidden_states[-1]` to match `outputs.last_hidden_state` exactly.
 However, this is not always the case. Some models apply normalization or subsequent process to the last hidden state when it's returned.

 </Tip>
--- a/docs/source/en/main_classes/trainer.md
+++ b/docs/source/en/main_classes/trainer.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 The [`Trainer`] class provides an API for feature-complete training in PyTorch, and it supports distributed training on multiple GPUs/TPUs, mixed precision for [NVIDIA GPUs](https://nvidia.github.io/apex/), [AMD GPUs](https://rocm.docs.amd.com/en/latest/rocm.html), and [`torch.amp`](https://pytorch.org/docs/stable/amp.html) for PyTorch. [`Trainer`] goes hand-in-hand with the [`TrainingArguments`] class, which offers a wide range of options to customize how a model is trained. Together, these two classes provide a complete training API.

-[`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] inherit from the [`Trainer`] and [`TrainingArgument`] classes and they're adapted for training models for sequence-to-sequence tasks such as summarization or translation.
+[`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] inherit from the [`Trainer`] and [`TrainingArguments`] classes and they're adapted for training models for sequence-to-sequence tasks such as summarization or translation.

 <Tip warning={true}>

--- a/docs/source/en/model_doc/camembert.md
+++ b/docs/source/en/model_doc/camembert.md
@ -106,7 +106,7 @@ as the information relative to the inputs and outputs.

 [[autodoc]] TFCamembertModel

-## TFCamembertForCasualLM
+## TFCamembertForCausalLM

 [[autodoc]] TFCamembertForCausalLM

--- a/docs/source/en/model_doc/clipseg.md
+++ b/docs/source/en/model_doc/clipseg.md
@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 ## Overview

 The CLIPSeg model was proposed in [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke
-and Alexander Ecker. CLIPSeg adds a minimal decoder on top of a frozen [CLIP](clip) model for zero- and one-shot image segmentation.
+and Alexander Ecker. CLIPSeg adds a minimal decoder on top of a frozen [CLIP](clip) model for zero-shot and one-shot image segmentation.

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -34,7 +34,7 @@ This model was contributed by [ArthurZucker](https://huggingface.co/ArthurZ). Th

 The `Llama2` family models, on which Code Llama is based, were trained using `bfloat16`, but the original inference uses `float16`. Let's look at the different precisions:

-* `float32`: PyTorch convention on model initialization is to load models in `float32`, no matter with which `dtype` the model weights were stored. `transformers` also follows this convention for consistency with PyTorch. This will be picked by default. If you want the `AutoModel` API to cast the load the checkpoints with the storage weights type, you must specify `torch_dtype="auto"`, e.g. `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`.
+* `float32`: PyTorch convention on model initialization is to load models in `float32`, no matter with which `dtype` the model weights were stored. `transformers` also follows this convention for consistency with PyTorch. This will be picked by default. If you want the `AutoModel` API to load the checkpoints with the storage weights type, you must specify `torch_dtype="auto"`, e.g. `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`.
 * `bfloat16`: Code Llama was trained with this precision, so we recommend using it for further training or fine-tuning.
 * `float16`: We recommend running inference using this precision, as it's usually faster than `bfloat16`, and evaluation metrics show no discernible degradation with respect to `bfloat16`. You can also run inference using `bfloat16`, and we recommend you check inference results with both `float16` and `bfloat16` after fine-tuning.

--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -27,7 +27,7 @@ Due to its architecture, FalconMamba is significantly faster at inference and re

 Tips:

- FalconMamba is mostly based on Mamba architecutre, the same [tips and best practices](./mamba) would be relevant here.
+- FalconMamba is mostly based on Mamba architecture, the same [tips and best practices](./mamba) would be relevant here.

 The model has been trained on approximtely 6T tokens consisting a mixture of many data sources such as RefineWeb, Cosmopedia and Math data.

--- a/docs/source/en/model_doc/hiera.md
+++ b/docs/source/en/model_doc/hiera.md
@ -31,7 +31,7 @@ alt="drawing" width="600"/>

 <small> Hiera architecture. Taken from the <a href="https://arxiv.org/abs/2306.00989">original paper.</a> </small>

-This model was a joint contibution by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [namangarg110](https://huggingface.co/namangarg110). The original code can be found [here] (https://github.com/facebookresearch/hiera).
+This model was a joint contribution by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [namangarg110](https://huggingface.co/namangarg110). The original code can be found [here] (https://github.com/facebookresearch/hiera).

 ## Resources

--- a/docs/source/en/model_doc/jamba.md
+++ b/docs/source/en/model_doc/jamba.md
@ -33,7 +33,7 @@ alt="drawing" width="600"/>

 ## Usage

-### Presequities
+### Prerequisites

 Jamba requires you use `transformers` version 4.39.0 or higher:
 ```bash
--- a/docs/source/en/model_doc/llama3.md
+++ b/docs/source/en/model_doc/llama3.md
@ -57,25 +57,26 @@ Tips:
 - The tokenizer is a BPE model based on [tiktoken](https://github.com/openai/tiktoken) (vs the one based on sentencepiece implementation for Llama2). The main difference that it ignores BPE merge rules when an input token is part of the vocab. This means that if no merge exist to produce `"hugging"`, instead of having the smallest units, like `["hug","ging"] form 2 tokens, if `"hugging"` is part of the vocab, it will be automatically returned as a token.
 - The original model uses `pad_id = -1` which means that there is no padding token. We can't have the same logic, make sure to add a padding token using `tokenizer.add_special_tokens({"pad_token":"<pad>"})` and resize the token embedding accordingly. You should also set the `model.config.pad_token_id`. The `embed_tokens` layer of the model is initialized with `self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx)`, which makes sure that encoding the padding token will output zeros, so passing it when initializing is recommended.
 - The original checkpoint can be converted using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py). The script can be called with the following (example) command:
-
-```bash
-python src/transformers/models/llama/convert_llama_weights_to_hf.py \
-    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path --llama_version 3
-```
+    
+    ```bash
+    python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+        --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path --llama_version 3
+    ```

 - After conversion, the model and tokenizer can be loaded via:

-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
+    ```python
+    from transformers import AutoModelForCausalLM, AutoTokenizer
+    
+    tokenizer = AutoTokenizer.from_pretrained("/output/path")
+    model = AutoModelForCausalLM.from_pretrained("/output/path")
+    ```

-tokenizer = AutoTokenizer.from_pretrained("/output/path")
-model = AutoModelForCausalLM.from_pretrained("/output/path")
-```
-
-Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
-come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.
+    Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions
+    come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). For the 75B model, it's thus 145GB of RAM needed.

 - When using Flash Attention 2 via `attn_implementation="flash_attention_2"`, don't pass `torch_dtype` to the `from_pretrained` class method and use Automatic Mixed-Precision training. When using `Trainer`, it is simply specifying either `fp16` or `bf16` to `True`. Otherwise, make sure you are using `torch.autocast`. This is required because the Flash Attention only support `fp16` and `bf16` data type.

 ## Resources
+
 A ton of cool resources are already available on the documentation page of [Llama2](./llama2), inviting contributors to add new resources curated for Llama3 here! 🤗
--- a/docs/source/en/model_doc/matcha.md
+++ b/docs/source/en/model_doc/matcha.md
@ -61,7 +61,7 @@ print(processor.decode(predictions[0], skip_special_tokens=True))

 ## Fine-tuning

-To fine-tune MatCha, refer to the pix2struct [fine-tuning notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb). For `Pix2Struct` models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faste convergence:
+To fine-tune MatCha, refer to the pix2struct [fine-tuning notebook](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_pix2struct.ipynb). For `Pix2Struct` models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faster convergence:
 ```python
 from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup

--- a/docs/source/en/model_doc/mbart.md
+++ b/docs/source/en/model_doc/mbart.md
@ -83,7 +83,7 @@ keyword, and target text format passed with the `text_label` keyword argument.
 ## Overview of MBart-50

 MBart-50 was introduced in the [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
-Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original *mbart-large-cc25* checkpoint by extendeding
+Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original *mbart-large-cc25* checkpoint by extending
 its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
 languages.

--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -31,7 +31,7 @@ Mixtral-8x7B is the second large language model (LLM) released by [mistral.ai](h
 Mixtral-8x7B is a decoder-only Transformer with the following architectural choices:

 - Mixtral is a Mixture of Experts (MoE) model with 8 experts per MLP, with a total of 45 billion parameters. To learn more about mixture-of-experts, refer to the [blog post](https://huggingface.co/blog/moe).
- Despite the model having 45 billion parameters,, the compute required for a single forward pass is the same as that of a 14 billion parameter model. This is because even though each of the experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dispatched twice (top 2 routing) and thus the compute (the operation required at each forward computation) is just 2 X sequence_length. 
+- Despite the model having 45 billion parameters, the compute required for a single forward pass is the same as that of a 14 billion parameter model. This is because even though each of the experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dispatched twice (top 2 routing) and thus the compute (the operation required at each forward computation) is just 2 X sequence_length. 

 The following implementation details are shared with Mistral AI's first model [Mistral-7B](mistral):
 - Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens
--- a/docs/source/en/model_doc/mms.md
+++ b/docs/source/en/model_doc/mms.md
@ -242,7 +242,7 @@ export UROMAN=$(pwd)
 ```

 You can then pre-process the text input using the following code snippet. You can either rely on using the bash variable 
-`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromaize` function:
+`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromanize` function:

 ```python
 import torch
@ -270,9 +270,9 @@ def uromanize(input_string, uroman_path):
    return stdout.decode()[:-1]

 text = "이봐 무슨 일이야"
-uromaized_text = uromanize(text, uroman_path=os.environ["UROMAN"])
+uromanized_text = uromanize(text, uroman_path=os.environ["UROMAN"])

-inputs = tokenizer(text=uromaized_text, return_tensors="pt")
+inputs = tokenizer(text=uromanized_text, return_tensors="pt")

 set_seed(555)  # make deterministic
 with torch.no_grad():
--- a/docs/source/en/model_doc/mpt.md
+++ b/docs/source/en/model_doc/mpt.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 ## Overview

-The MPT model was proposed by the [MosaicML](https://www.mosaicml.com/) team and released with multiple sizes and finetuned variants. The MPT models is a series of open source and commercially usable LLMs pre-trained on 1T tokens. 
+The MPT model was proposed by the [MosaicML](https://www.mosaicml.com/) team and released with multiple sizes and finetuned variants. The MPT models are a series of open source and commercially usable LLMs pre-trained on 1T tokens. 

 MPT models are GPT-style decoder-only transformers with several improvements: performance-optimized layer implementations, architecture changes that provide greater training stability, and the elimination of context length limits by replacing positional embeddings with ALiBi. 

--- a/docs/source/en/model_doc/oneformer.md
+++ b/docs/source/en/model_doc/oneformer.md
@ -39,7 +39,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3
 - If you want to train the model in a distributed environment across multiple nodes, then one should update the
  `get_num_masks` function inside in the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be
  set to the average number of target masks across all nodes, as can be seen in the original implementation [here](https://github.com/SHI-Labs/OneFormer/blob/33ebb56ed34f970a30ae103e786c0cb64c653d9a/oneformer/modeling/criterion.py#L287).
- One can use [`OneFormerProcessor`] to prepare input images and task inputs for the model and optional targets for the model. [`OneformerProcessor`] wraps [`OneFormerImageProcessor`] and [`CLIPTokenizer`] into a single instance to both prepare the images and encode the task inputs.
+- One can use [`OneFormerProcessor`] to prepare input images and task inputs for the model and optional targets for the model. [`OneFormerProcessor`] wraps [`OneFormerImageProcessor`] and [`CLIPTokenizer`] into a single instance to both prepare the images and encode the task inputs.
 - To get the final segmentation, depending on the task, you can call [`~OneFormerProcessor.post_process_semantic_segmentation`] or [`~OneFormerImageProcessor.post_process_instance_segmentation`] or [`~OneFormerImageProcessor.post_process_panoptic_segmentation`]. All three tasks can be solved using [`OneFormerForUniversalSegmentation`] output, panoptic segmentation accepts an optional `label_ids_to_fuse` argument to fuse instances of the target object/s (e.g. sky) together.

 ## Resources
--- a/docs/source/en/model_doc/openai-gpt.md
+++ b/docs/source/en/model_doc/openai-gpt.md
@ -29,7 +29,7 @@ rendered properly in your Markdown viewer.

 OpenAI GPT model was proposed in [Improving Language Understanding by Generative Pre-Training](https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf)
 by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional) transformer
-pre-trained using language modeling on a large corpus will long range dependencies, the Toronto Book Corpus.
+pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/phobert.md
+++ b/docs/source/en/model_doc/phobert.md
@ -54,7 +54,7 @@ This model was contributed by [dqnguyen](https://huggingface.co/dqnguyen). The o

 <Tip> 

-PhoBERT implementation is the same as BERT, except for tokenization. Refer to [EART documentation](bert) for information on 
+PhoBERT implementation is the same as BERT, except for tokenization. Refer to [BERT documentation](bert) for information on 
 configuration classes and their parameters. PhoBERT-specific tokenizer is documented below.  

 </Tip>
--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -229,8 +229,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", min_pixel

 ```

-
-
 #### Multiple Image Inputs

 By default, images and video content are directly included in the conversation. When handling multiple images, it's helpful to add labels to the images and videos for better reference. Users can control this behavior with the following settings:
--- a/docs/source/en/model_doc/seggpt.md
+++ b/docs/source/en/model_doc/seggpt.md
@ -27,7 +27,7 @@ The abstract from the paper is the following:
 Tips:
 - One can use [`SegGptImageProcessor`] to prepare image input, prompt and mask to the model.
 - One can either use segmentation maps or RGB images as prompt masks. If using the latter make sure to set `do_convert_rgb=False` in the `preprocess` method.
- It's highly advisable to pass `num_labels` when using `segmetantion_maps` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case.
+- It's highly advisable to pass `num_labels` when using `segmentation_maps` (not considering background) during preprocessing and postprocessing with [`SegGptImageProcessor`] for your use case.
 - When doing inference with [`SegGptForImageSegmentation`] if your `batch_size` is greater than 1 you can use feature ensemble across your images by passing `feature_ensemble=True` in the forward method.

 Here's how to use the model for one-shot semantic segmentation:
--- a/docs/source/en/model_doc/swin2sr.md
+++ b/docs/source/en/model_doc/swin2sr.md
@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 ## Overview

 The Swin2SR model was proposed in [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
-Swin2R improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training
+Swin2SR improves the [SwinIR](https://github.com/JingyunLiang/SwinIR/) model by incorporating [Swin Transformer v2](swinv2) layers which mitigates issues such as training instability, resolution gaps between pre-training
 and fine-tuning, and hunger on data.

 The abstract from the paper is the following:
--- a/docs/source/en/model_doc/vits.md
+++ b/docs/source/en/model_doc/vits.md
@ -127,7 +127,7 @@ export UROMAN=$(pwd)
 ```

 You can then pre-process the text input using the following code snippet. You can either rely on using the bash variable 
-`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromaize` function:
+`UROMAN` to point to the uroman repository, or you can pass the uroman directory as an argument to the `uromanize` function:

 ```python
 import torch
@ -155,9 +155,9 @@ def uromanize(input_string, uroman_path):
    return stdout.decode()[:-1]

 text = "이봐 무슨 일이야"
-uromaized_text = uromanize(text, uroman_path=os.environ["UROMAN"])
+uromanized_text = uromanize(text, uroman_path=os.environ["UROMAN"])

-inputs = tokenizer(text=uromaized_text, return_tensors="pt")
+inputs = tokenizer(text=uromanized_text, return_tensors="pt")

 set_seed(555)  # make deterministic
 with torch.no_grad():
--- a/docs/source/en/model_doc/xlm-roberta.md
+++ b/docs/source/en/model_doc/xlm-roberta.md
@ -43,7 +43,7 @@ low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% fo
 also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
 trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
 languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
-per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
+per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
 will make XLM-R code, data, and models publicly available.*

 This model was contributed by [stefan-it](https://huggingface.co/stefan-it). The original code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/xlmr).
--- a/docs/source/en/model_doc/xlnet.md
+++ b/docs/source/en/model_doc/xlnet.md
@ -166,7 +166,7 @@ This model was contributed by [thomwolf](https://huggingface.co/thomwolf). The o
 [[autodoc]] TFXLNetForSequenceClassification
    - call

-## TFLNetForMultipleChoice
+## TFXLNetForMultipleChoice

 [[autodoc]] TFXLNetForMultipleChoice
    - call
--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@ -47,7 +47,7 @@ As a result, you can load a specific model version with the `revision` parameter
 ... )
 ```

-Files are also easily edited in a repository, and you can view the commit history as well as the difference:
+Files are also easily edited in a repository, and you can view the commit history as well as the differences:

 ![vis_diff](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vis_diff.png)

--- a/docs/source/en/pad_truncation.md
+++ b/docs/source/en/pad_truncation.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. Padding adds a special **padding token** to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. Truncation works in the other direction by truncating long sequences.

-In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to are: `padding`, `truncation` and `max_length`.
+In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. However, the API supports more strategies if you need them. The three arguments you need to know are: `padding`, `truncation` and `max_length`.

 The `padding` argument controls padding. It can be a boolean or a string:

--- a/docs/source/en/peft.md
+++ b/docs/source/en/peft.md
@ -46,7 +46,7 @@ pip install git+https://github.com/huggingface/peft.git
 - [IA3](https://huggingface.co/docs/peft/conceptual_guides/ia3)
 - [AdaLoRA](https://arxiv.org/abs/2303.10512)

-If you want to use other PEFT methods, such as prompt learning or prompt tuning, or about the 🤗 PEFT library in general, please refer to the [documentation](https://huggingface.co/docs/peft/index).
+If you want to use other PEFT methods, such as prompt learning or prompt tuning, or learn about the 🤗 PEFT library in general, please refer to the [documentation](https://huggingface.co/docs/peft/index).


 ## Load a PEFT adapter
@ -125,7 +125,7 @@ Now you can use [`~peft.PeftModel.set_adapter`] to set which adapter to use:
 ```py
 # use adapter_1
 model.set_adapter("adapter_1")
-output = model.generate(**inputs)
+output_disabled = model.generate(**inputs)
 print(tokenizer.decode(output_disabled[0], skip_special_tokens=True))

 # use adapter_2
--- a/docs/source/en/perf_hardware.md
+++ b/docs/source/en/perf_hardware.md
@ -116,7 +116,7 @@ Each new generation provides a faster bandwidth, e.g. here is a quote from [Nvid

 So the higher `X` you get in the report of `NVX` in the output of `nvidia-smi topo -m` the better. The generation will depend on your GPU architecture.

-Let's compare the execution of an openai-community/gpt2 language model training over a small sample of wikitext.
+Let's compare the execution of an `openai-community/gpt2` language model training over a small sample of wikitext.

 The results are:

--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -239,6 +239,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
 * [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
 * [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
+* [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
 * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
 * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
--- a/docs/source/en/perf_train_gpu_one.md
+++ b/docs/source/en/perf_train_gpu_one.md
@ -395,7 +395,7 @@ Choose which backend to use by specifying it via `torch_compile_backend` in the
 * `dynamo.optimize("aot_cudagraphs")` - cudagraphs with AotAutograd. [Read more](https://github.com/pytorch/torchdynamo/pull/757)

 **Inference-only backend**s:
-* `dynamo.optimize("ofi")` -  Uses Torchscript optimize_for_inference.  [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html)
+* `dynamo.optimize("ofi")` -  Uses TorchScript optimize_for_inference.  [Read more](https://pytorch.org/docs/stable/generated/torch.jit.optimize_for_inference.html)
 * `dynamo.optimize("fx2trt")` -  Uses NVIDIA TensorRT for inference optimizations.  [Read more](https://pytorch.org/TensorRT/tutorials/getting_started_with_fx_path.html)
 * `dynamo.optimize("onnxrt")` -  Uses ONNXRT for inference on CPU/GPU.  [Read more](https://onnxruntime.ai/)
 * `dynamo.optimize("ipex")` -  Uses IPEX for inference on CPU.  [Read more](https://github.com/intel/intel-extension-for-pytorch)
@ -413,7 +413,7 @@ For example with a vanilla AdamW, the memory requirement for the optimizer state
 * Momentum: 4 bytes/param
 * Variance: 4 bytes/param

-Suppose a model with 7B parameters and 200 millions parameters injected with [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora).
+Suppose a model with 7B parameters and 200 million parameters injected with [Low Rank Adapters](https://huggingface.co/docs/peft/conceptual_guides/lora).

 The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters).

--- a/docs/source/en/perf_train_tpu_tf.md
+++ b/docs/source/en/perf_train_tpu_tf.md
@ -158,5 +158,5 @@ There was a lot in here, so let’s summarize with a quick checklist you can fol
 - Create your `TPUStrategy` and make sure dataset loading and model creation are inside the `strategy.scope()` (see [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/tpu_training-tf.ipynb))
 - Don’t forget to take `jit_compile=True` out again when you move to TPU!
 - 🙏🙏🙏🥺🥺🥺
- Call model.fit()
+- Call `model.fit()`
 - You did it!
--- a/docs/source/en/performance.md
+++ b/docs/source/en/performance.md
@ -24,7 +24,7 @@ Training large transformer models and deploying them to production present vario
 During training, the model may require more GPU memory than available or exhibit slow training speed. In the deployment 
 phase, the model can struggle to handle the required throughput in a production environment.

-This documentation aims to assist you in overcoming these challenges and finding the optimal setting for your use-case. 
+This documentation aims to assist you in overcoming these challenges and finding the optimal settings for your use-case. 
 The guides are divided into training and inference sections, as each comes with different challenges and solutions. 
 Within each section you'll find separate guides for different hardware configurations, such as single GPU vs. multi-GPU 
 for training or CPU vs. GPU for inference.
--- a/docs/source/en/pr_checks.md
+++ b/docs/source/en/pr_checks.md
@ -166,7 +166,7 @@ Note that instead of applying this to a whole class, you can apply it to the rel
 # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
 ```

-Sometimes the copy is exactly the same except for names: for instance in `RobertaAttention`, we use `RobertaSelfAttention` insted of `BertSelfAttention` but other than that, the code is exactly the same. This is why `# Copied from` supports simple string replacements with the following syntax: `Copied from xxx with foo->bar`. This means the code is copied with all instances of `foo` being replaced by `bar`. You can see how it used [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` with the comment:
+Sometimes the copy is exactly the same except for names: for instance in `RobertaAttention`, we use `RobertaSelfAttention` instead of `BertSelfAttention` but other than that, the code is exactly the same. This is why `# Copied from` supports simple string replacements with the following syntax: `Copied from xxx with foo->bar`. This means the code is copied with all instances of `foo` being replaced by `bar`. You can see how it used [here](https://github.com/huggingface/transformers/blob/2bd7a27a671fd1d98059124024f580f8f5c0f3b5/src/transformers/models/roberta/modeling_roberta.py#L304C1-L304C86) in `RobertaAttention` with the comment:

 ```py
 # Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta
--- a/docs/source/en/preprocessing.md
+++ b/docs/source/en/preprocessing.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 [[open-in-colab]]

-Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, they need to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for:
+Before you can train a model on a dataset, it needs to be preprocessed into the expected model input format. Whether your data is text, images, or audio, it needs to be converted and assembled into batches of tensors. 🤗 Transformers provides a set of preprocessing classes to help prepare your data for the model. In this tutorial, you'll learn that for:

 * Text, use a [Tokenizer](./main_classes/tokenizer) to convert text into a sequence of tokens, create a numerical representation of the tokens, and assemble them into tensors.
 * Speech and audio, use a [Feature extractor](./main_classes/feature_extractor) to extract sequential features from audio waveforms and convert them into tensors.
--- a/docs/source/en/quantization/aqlm.md
+++ b/docs/source/en/quantization/aqlm.md
@ -19,7 +19,7 @@ rendered properly in your Markdown viewer.
 > [!TIP]
 > Try AQLM on [Google Colab](https://colab.research.google.com/drive/1-xZmBRXT5Fm3Ghn4Mwa2KRypORXb855X?usp=sharing)!

-Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and take advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes.
+Additive Quantization of Language Models ([AQLM](https://arxiv.org/abs/2401.06118)) is a Large Language Models compression method. It quantizes multiple weights together and takes advantage of interdependencies between them. AQLM represents groups of 8-16 weights as a sum of multiple vector codes.

 Inference support for AQLM is realised in the `aqlm` library. Make sure to install it to run the models (note aqlm works only with python>=3.10):
 ```bash
--- a/docs/source/en/quantization/bitsandbytes.md
+++ b/docs/source/en/quantization/bitsandbytes.md
@ -274,7 +274,7 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa

 ### Nested quantization

-Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an addition 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
+Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.

 ```py
 from transformers import BitsAndBytesConfig
--- a/docs/source/en/quantization/eetq.md
+++ b/docs/source/en/quantization/eetq.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 The [EETQ](https://github.com/NetEase-FuXi/EETQ) library supports int8 per-channel weight-only quantization for NVIDIA GPUS. The high-performance GEMM and GEMV kernels are from FasterTransformer and TensorRT-LLM. It requires no calibration dataset and does not need to pre-quantize your model. Moreover, the accuracy degradation is negligible owing to the per-channel quantization. 

-Make sure you have eetq installed from the [relase page](https://github.com/NetEase-FuXi/EETQ/releases)
+Make sure you have eetq installed from the [release page](https://github.com/NetEase-FuXi/EETQ/releases)
 ```
 pip install --no-cache-dir https://github.com/NetEase-FuXi/EETQ/releases/download/v1.0.0/EETQ-1.0.0+cu121+torch2.1.2-cp310-cp310-linux_x86_64.whl
 ```
--- a/docs/source/en/quantization/fbgemm_fp8.md
+++ b/docs/source/en/quantization/fbgemm_fp8.md
@ -31,7 +31,7 @@ Before you begin, make sure the following libraries are installed with their lat
 pip install --upgrade accelerate fbgemm-gpu torch
 ```

-If you are having issues with fbgemm-gpu and torch library, you might need to install the nighlty release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)
+If you are having issues with fbgemm-gpu and torch library, you might need to install the nightly release. You can follow the instruction [here](https://pytorch.org/FBGEMM/fbgemm_gpu-development/InstallationInstructions.html#fbgemm-gpu-install-libraries:~:text=found%20here.-,Install%20the%20FBGEMM_GPU%20Package,-Install%20through%20PyTorch)


 ```py
--- a/docs/source/en/quantization/hqq.md
+++ b/docs/source/en/quantization/hqq.md
@ -64,6 +64,6 @@ model = transformers.AutoModelForCausalLM.from_pretrained(

 ## Optimized Runtime

-HQQ supports various backends, including pure Pytorch and custom dequantization CUDA kernels. These backends are suitable for older gpus and peft/QLoRA training.
+HQQ supports various backends, including pure PyTorch and custom dequantization CUDA kernels. These backends are suitable for older gpus and peft/QLoRA training.
 For faster inference, HQQ supports 4-bit fused kernels (TorchAO and Marlin), reaching up to 200 tokens/sec on a single 4090.
 For more details on how to use the backends, please refer to https://github.com/mobiusml/hqq/?tab=readme-ov-file#backend
--- a/docs/source/en/quantization/quanto.md
+++ b/docs/source/en/quantization/quanto.md
@ -55,7 +55,7 @@ quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cud

 Note that serialization is not supported yet with transformers but it is coming soon! If you want to save the model, you can use quanto library instead.

-Quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following becnhmark (llama-2-7b on perplexity metric). You can find more benchamarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)
+Quanto library uses linear quantization algorithm for quantization. Even though this is a basic quantization technique, we get very good results! Have a look at the following benchmark (llama-2-7b on perplexity metric). You can find more benchmarks [here](https://github.com/huggingface/quanto/tree/main/bench/generation)

 <div class="flex gap-4">
  <div>
@ -63,4 +63,4 @@ Quanto library uses linear quantization algorithm for quantization. Even though
  </div>
 </div>

-The library is versatible enough to be compatible with most PTQ optimization algorithms. The plan in the future is to integrate the most popular algorithms in the most seamless possible way (AWQ, Smoothquant).
+The library is versatile enough to be compatible with most PTQ optimization algorithms. The plan in the future is to integrate the most popular algorithms in the most seamless possible way (AWQ, Smoothquant).
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -33,7 +33,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
 input_text = "What are we having for dinner?"
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-# compile the quantizd model to get speedup
+# compile the quantized model to get speedup
 import torchao
 torchao.quantization.utils.recommended_inductor_config_setter()
 quantized_model = torch.compile(quantized_model, mode="max-autotune")
--- a/docs/source/en/run_scripts.md
+++ b/docs/source/en/run_scripts.md
@ -126,7 +126,7 @@ python examples/tensorflow/summarization/run_summarization.py  \

 The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supports distributed training and mixed precision, which means you can also use it in a script. To enable both of these features:

- Add the `fp16` argument to enable mixed precision.
+- Add the `fp16` or `bf16` argument to enable mixed precision. XPU devices only supports `bf16` for mixed precision training.
 - Set the number of GPUs to use with the `nproc_per_node` argument.

 ```bash
@ -287,7 +287,7 @@ Another helpful option to enable is resuming training from a previous checkpoint
 The first method uses the `output_dir previous_output_dir` argument to resume training from the latest checkpoint stored in `output_dir`. In this case, you should remove `overwrite_output_dir`:

 ```bash
-python examples/pytorch/summarization/run_summarization.py
+python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
@ -304,7 +304,7 @@ python examples/pytorch/summarization/run_summarization.py
 The second method uses the `resume_from_checkpoint path_to_specific_checkpoint` argument to resume training from a specific checkpoint folder.

 ```bash
-python examples/pytorch/summarization/run_summarization.py
+python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
@ -334,7 +334,7 @@ To give your repository a specific name, use the `push_to_hub_model_id` argument
 The following example shows how to upload a model with a specific repository name:

 ```bash
-python examples/pytorch/summarization/run_summarization.py
+python examples/pytorch/summarization/run_summarization.py \
    --model_name_or_path google-t5/t5-small \
    --do_train \
    --do_eval \
--- a/docs/source/en/sagemaker.md
+++ b/docs/source/en/sagemaker.md
@ -22,7 +22,7 @@ rendered properly in your Markdown viewer.

 The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. 

-### Table of Content
+### Table of Contents

 - [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train)
 - [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference)
--- a/docs/source/en/serialization.md
+++ b/docs/source/en/serialization.md
@ -153,11 +153,11 @@ directly.

 <Tip warning={true}>

-`tranformers.onnx` is no longer maintained, please export models with 🤗 Optimum as described above. This section will be removed in the future versions.
+`transformers.onnx` is no longer maintained, please export models with 🤗 Optimum as described above. This section will be removed in the future versions.

 </Tip>

-To export a 🤗 Transformers model to ONNX with `tranformers.onnx`, install extra dependencies:
+To export a 🤗 Transformers model to ONNX with `transformers.onnx`, install extra dependencies:

 ```bash
 pip install transformers[onnx]
--- a/docs/source/en/tasks/asr.md
+++ b/docs/source/en/tasks/asr.md
@ -196,7 +196,7 @@ Now instantiate your `DataCollatorForCTCWithPadding`:

 ## Evaluate

-Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
+Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

 ```py
 >>> import evaluate
--- a/docs/source/en/tasks/audio_classification.md
+++ b/docs/source/en/tasks/audio_classification.md
@ -164,7 +164,7 @@ To apply the preprocessing function over the entire dataset, use 🤗 Datasets [

 ## Evaluate

-Including a metric during training is often helpful for evaluating your model's performance. You can quickly load a evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
+Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

 ```py
 >>> import evaluate
--- a/docs/source/en/tasks/image_text_to_text.md
+++ b/docs/source/en/tasks/image_text_to_text.md
@ -204,7 +204,7 @@ for value in generator:

 ## Fit models in smaller hardware

-VLMs are often large and need to be optimized to fit in smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. 
+VLMs are often large and need to be optimized to fit on smaller hardware. Transformers supports many model quantization libraries, and here we will only show int8 quantization with [Quanto](./quantization/quanto#quanto). int8 quantization offers memory improvements up to 75 percent (if all weights are quantized). However it is no free lunch, since 8-bit is not a CUDA-native precision, the weights are quantized back and forth on the fly, which adds up to latency. 

 First, install dependencies.

--- a/docs/source/en/tasks/image_to_image.md
+++ b/docs/source/en/tasks/image_to_image.md
@ -36,6 +36,7 @@ We can now initialize the pipeline with a [Swin2SR model](https://huggingface.co

 ```python
 from transformers import pipeline
+import torch

 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)
--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@ -253,6 +253,7 @@ At this point, only three steps remain:
 ...     train_dataset=lm_dataset["train"],
 ...     eval_dataset=lm_dataset["test"],
 ...     data_collator=data_collator,
+...     tokenizer=tokenizer,
 ... )

 >>> trainer.train()
--- a/docs/source/en/tasks/masked_language_modeling.md
+++ b/docs/source/en/tasks/masked_language_modeling.md
@ -245,6 +245,7 @@ At this point, only three steps remain:
 ...     train_dataset=lm_dataset["train"],
 ...     eval_dataset=lm_dataset["test"],
 ...     data_collator=data_collator,
+...     tokenizer=tokenizer,
 ... )

 >>> trainer.train()
--- a/docs/source/en/tasks/monocular_depth_estimation.md
+++ b/docs/source/en/tasks/monocular_depth_estimation.md
@ -159,7 +159,7 @@ def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invali
    """Converts a depth map to a color image.

    Args:
-        value (torch.Tensor, numpy.ndarry): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
+        value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
        vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None.
        vmax (float, optional):  vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None.
        cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'.
--- a/docs/source/en/tasks/multiple_choice.md
+++ b/docs/source/en/tasks/multiple_choice.md
@ -399,7 +399,7 @@ Tokenize each prompt and candidate answer pair and return PyTorch tensors. You s
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
 >>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
 >>> labels = torch.tensor(0).unsqueeze(0)
 ```
@ -409,7 +409,7 @@ Pass your inputs and labels to the model and return the `logits`:
 ```py
 >>> from transformers import AutoModelForMultipleChoice

->>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
 >>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
 >>> logits = outputs.logits
 ```
@ -428,7 +428,7 @@ Tokenize each prompt and candidate answer pair and return TensorFlow tensors:
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
 >>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
 ```

@ -437,7 +437,7 @@ Pass your inputs to the model and return the `logits`:
 ```py
 >>> from transformers import TFAutoModelForMultipleChoice

->>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
 >>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
 >>> outputs = model(inputs)
 >>> logits = outputs.logits
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@ -290,7 +290,7 @@ Result: Modern tools often used to make gazpacho include
 #### Reasoning

 Reasoning is one of the most difficult tasks for LLMs, and achieving good results often requires applying advanced prompting techniques, like 
-[Chain-of-though](#chain-of-thought).
+[Chain-of-thought](#chain-of-thought).

 Let's try if we can make a model reason about a simple arithmetics task with a basic prompt: 

--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@ -205,7 +205,7 @@ At this point, only three steps remain:
 ...     save_total_limit=3,
 ...     num_train_epochs=4,
 ...     predict_with_generate=True,
-...     fp16=True,
+...     fp16=True, #change to bf16=True for XPU
 ...     push_to_hub=True,
 ... )

@ -336,7 +336,7 @@ The simplest way to try out your finetuned model for inference is to use it in a
 ```py
 >>> from transformers import pipeline

->>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model")
+>>> summarizer = pipeline("summarization", model="username/my_awesome_billsum_model")
 >>> summarizer(text)
 [{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]
 ```
@ -351,7 +351,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

@ -360,7 +360,7 @@ Use the [`~generation.GenerationMixin.generate`] method to create the summarizat
 ```py
 >>> from transformers import AutoModelForSeq2SeqLM

->>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
 >>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
 ```

@ -377,7 +377,7 @@ Tokenize the text and return the `input_ids` as TensorFlow tensors:
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
 >>> inputs = tokenizer(text, return_tensors="tf").input_ids
 ```

@ -386,7 +386,7 @@ Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method
 ```py
 >>> from transformers import TFAutoModelForSeq2SeqLM

->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
 >>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
 ```

--- a/docs/source/en/tasks/text-to-speech.md
+++ b/docs/source/en/tasks/text-to-speech.md
@ -580,7 +580,7 @@ Load the model from the 🤗 Hub:
 >>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl")
 ```

-Pick an example from the test dataset obtain a speaker embedding. 
+Pick an example from the test dataset to obtain a speaker embedding. 

 ```py 
 >>> example = dataset["test"][304]
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@ -212,7 +212,7 @@ At this point, only three steps remain:
 ...     save_total_limit=3,
 ...     num_train_epochs=2,
 ...     predict_with_generate=True,
-...     fp16=True,
+...     fp16=True, #change to bf16=True for XPU
 ...     push_to_hub=True,
 ... )

@ -346,7 +346,7 @@ The simplest way to try out your finetuned model for inference is to use it in a
 # Change `xx` to the language of the input and `yy` to the language of the desired output.
 # Examples: "en" for English, "fr" for French, "de" for German, "es" for Spanish, "zh" for Chinese, etc; translation_en_to_fr translates English to French
 # You can view all the lists of languages here - https://huggingface.co/languages
->>> translator = pipeline("translation_xx_to_yy", model="my_awesome_opus_books_model")
+>>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model")
 >>> translator(text)
 [{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
 ```
@ -360,7 +360,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors:
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
 >>> inputs = tokenizer(text, return_tensors="pt").input_ids
 ```

@ -369,7 +369,7 @@ Use the [`~generation.GenerationMixin.generate`] method to create the translatio
 ```py
 >>> from transformers import AutoModelForSeq2SeqLM

->>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
 >>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
 ```

@ -386,7 +386,7 @@ Tokenize the text and return the `input_ids` as TensorFlow tensors:
 ```py
 >>> from transformers import AutoTokenizer

->>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
 >>> inputs = tokenizer(text, return_tensors="tf").input_ids
 ```

@ -395,7 +395,7 @@ Use the [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] method
 ```py
 >>> from transformers import TFAutoModelForSeq2SeqLM

->>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
 >>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
 ```

--- a/docs/source/en/tasks/video_classification.md
+++ b/docs/source/en/tasks/video_classification.md
@ -191,7 +191,7 @@ You should probably TRAIN this model on a down-stream task to be able to use it

 The warning is telling us we are throwing away some weights (e.g. the weights and bias of the `classifier` layer) and randomly initializing some others (the weights and bias of a new `classifier` layer). This is expected in this case, because we are adding a new head for which we don't have pretrained weights, so the library warns us we should fine-tune this model before using it for inference, which is exactly what we are going to do.

-**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`.  
+**Note** that [this checkpoint](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) leads to better performance on this task as the checkpoint was obtained by fine-tuning on a similar downstream task having considerable domain overlap. You can check out [this checkpoint](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) which was obtained by fine-tuning `MCG-NJU/videomae-base-finetuned-kinetics`.  

 ## Prepare the datasets for training

--- a/docs/source/en/tasks/zero_shot_object_detection.md
+++ b/docs/source/en/tasks/zero_shot_object_detection.md
@ -26,8 +26,8 @@ is an open-vocabulary object detector. It means that it can detect objects in im
 the need to fine-tune the model on labeled datasets.

 OWL-ViT leverages multi-modal representations to perform open-vocabulary detection. It combines [CLIP](../model_doc/clip) with
-lightweight object classification and localization heads. Open-vocabulary detection is achieved by embedding free-text queries with the text encoder of CLIP and using them as input to the object classification and localization heads.
-associate images and their corresponding textual descriptions, and ViT processes image patches as inputs. The authors
+lightweight object classification and localization heads. Open-vocabulary detection is achieved by embedding free-text queries with the text encoder of CLIP and using them as input to the object classification and localization heads,
+which associate images with their corresponding textual descriptions, while ViT processes image patches as inputs. The authors
 of OWL-ViT first trained CLIP from scratch and then fine-tuned OWL-ViT end to end on standard object detection datasets using
 a bipartite matching loss.

--- a/docs/source/en/tasks_explained.md
+++ b/docs/source/en/tasks_explained.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # How 🤗 Transformers solve tasks

-In [What 🤗 Transformers can do](task_summary), you learned about natural language processing (NLP), speech and audio, computer vision tasks, and some important applications of them. This page will look closely at how models solve these tasks and explain what's happening under the hood. There are many ways to solve a given task, some models may implement certain techniques or even approach the task from a new angle, but for Transformer models, the general idea is the same. Owing to its flexible architecture, most models are a variant of an encoder, decoder, or encoder-decoder structure. In addition to Transformer models, our library also has several convolutional neural networks (CNNs), which are still used today for computer vision tasks. We'll also explain how a modern CNN works.
+In [What 🤗 Transformers can do](task_summary), you learned about natural language processing (NLP), speech and audio, computer vision tasks, and some important applications of them. This page will look closely at how models solve these tasks and explain what's happening under the hood. There are many ways to solve a given task, some models may implement certain techniques or even approach the task from a new angle, but for Transformer models, the general idea is the same. Owing to its flexible architecture, most models are a variant of an encoder, a decoder, or an encoder-decoder structure. In addition to Transformer models, our library also has several convolutional neural networks (CNNs), which are still used today for computer vision tasks. We'll also explain how a modern CNN works.

 To explain how tasks are solved, we'll walk through what goes on inside the model to output useful predictions.

--- a/docs/source/en/testing.md
+++ b/docs/source/en/testing.md
@ -1226,6 +1226,8 @@ import numpy as np
 np.random.seed(seed)

 # tf RNG
+import tensorflow as tf 
+
 tf.random.set_seed(seed)
 ```

--- a/docs/source/en/tiktoken.md
+++ b/docs/source/en/tiktoken.md
@ -0,0 +1,38 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+``
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Tiktoken and interaction with Transformers
+
+Support for tiktoken model files is seamlessly integrated in 🤗 transformers when loading models 
+`from_pretrained` with a `tokenizer.model` tiktoken file on the Hub, which is automatically converted into our 
+[fast tokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). 
+
+### Known models that were released with a `tiktoken.model`:
+	- gpt2
+	- llama3
+
+## Example usage
+ 
+In order to load `tiktoken` files in `transformers`, ensure that the `tokenizer.model` file is a tiktoken file and it 
+will automatically be loaded when loading `from_pretrained`. Here is how one would load a tokenizer and a model, which 
+ can be loaded from the exact same file:
+
+```py
+from transformers import AutoTokenizer
+
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") 
+```
--- a/docs/source/en/torchscript.md
+++ b/docs/source/en/torchscript.md
@ -219,7 +219,7 @@ You only need to modify the following line:

 ```diff
 - torch.jit.trace(model, [tokens_tensor, segments_tensors])
-+ torch.neuron.trace(model, [token_tensor, segments_tensors])
+ torch.neuron.trace(model, [tokens_tensor, segments_tensors])
 ```

 This enables the Neuron SDK to trace the model and optimize it for Inf1 instances.
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -299,7 +299,7 @@ trainer = trl.SFTTrainer(
 trainer.train()
 ```

-To pass extra arguments supports by GaLore, you should pass correctly `optim_args`, for example:
+To pass extra arguments supported by GaLore, you should pass correctly `optim_args`, for example:

 ```python
 import torch
@ -518,6 +518,51 @@ trainer.train()

 This script demonstrates how to fine-tune the `google/gemma-2b` model on the IMDB dataset using the GrokAdamW optimizer. The `TrainingArguments` are configured to use GrokAdamW, and the dataset is passed to the `Trainer` for training.

+## Schedule Free Optimizer
+
+The Schedule Free optimizers have been introduced in [The Road Less Scheduled](https://hf.co/papers/2405.15682).
+Schedule-Free learning replaces the momentum of the base optimizer with a combination of averaging and interpolation, to completely remove the need to anneal the learning rate with a traditional schedule.
+Supported optimizers for SFO are `"schedule_free_adamw"` and `"schedule_free_sgd"`. First install schedulefree from pypi `pip install schedulefree`.
+
+Below is a simple script to demonstrate how to fine-tune [google/gemma-2b](https://huggingface.co/google/gemma-2b) on IMDB dataset in full precision:
+
+```python
+import torch
+import datasets
+from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
+import trl
+
+train_dataset = datasets.load_dataset('imdb', split='train')
+
+args = TrainingArguments(
+    output_dir="./test-schedulefree",
+    max_steps=1000,
+    per_device_train_batch_size=4,
+    optim="schedule_free_adamw",
+    gradient_checkpointing=True,
+    logging_strategy="steps",
+    logging_steps=1,
+    learning_rate=2e-6,
+    save_strategy="no",
+    run_name="sfo-imdb",
+)
+
+model_id = "google/gemma-2b"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True).to(0)
+
+trainer = trl.SFTTrainer(
+    model=model, 
+    args=args,
+    train_dataset=train_dataset,
+    dataset_text_field='text',
+    max_seq_length=1024,
+)
+
+trainer.train()
+```
+
 ## Accelerate and Trainer

 The [`Trainer`] class is powered by [Accelerate](https://hf.co/docs/accelerate), a library for easily training PyTorch models in distributed environments with support for integrations such as [FullyShardedDataParallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/).
--- a/examples/tensorflow/image-classification/run_image_classification.py
+++ b/examples/tensorflow/image-classification/run_image_classification.py
@ -282,7 +282,6 @@ def main():
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
-            task="image-classification",
            token=model_args.token,
            trust_remote_code=model_args.trust_remote_code,
        )
@ -296,7 +295,6 @@ def main():
            "imagefolder",
            data_files=data_files,
            cache_dir=model_args.cache_dir,
-            task="image-classification",
        )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.
--- a/setup.py
+++ b/setup.py
@ -99,6 +99,7 @@ _deps = [
    "accelerate>=0.26.0",
    "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
    "beautifulsoup4",
+    "blobfile",
    "codecarbon==1.2.0",
    "cookiecutter==1.7.3",
    "dataclasses",
@ -162,6 +163,7 @@ _deps = [
    "sacremoses",
    "safetensors>=0.4.1",
    "sagemaker>=2.31.0",
+    "schedulefree>=1.2.6",
    "scikit-learn",
    "scipy<1.13.0",  # SciPy >= 1.13.0 is not supported with the current jax pin (`jax>=0.4.1,<=0.4.13`)
    "sentencepiece>=0.1.91,!=0.1.92",
@ -177,6 +179,7 @@ _deps = [
    "tensorflow-probability<0.24",
    "tf2onnx",
    "timeout-decorator",
+    "tiktoken",
    "timm<=0.9.16",
    "tokenizers>=0.19,<0.20",
    "torch",
@ -311,6 +314,7 @@ extras["codecarbon"] = deps_list("codecarbon")
 extras["video"] = deps_list("decord", "av")

 extras["sentencepiece"] = deps_list("sentencepiece", "protobuf")
+extras["tiktoken"] = deps_list("tiktoken", "blobfile")
 extras["testing"] = (
    deps_list(
        "pytest",
--- a/src/transformers/init.py
+++ b/src/transformers/init.py
@ -58,6 +58,7 @@ _import_structure = {
        "Agent",
        "CodeAgent",
        "HfApiEngine",
+        "ManagedAgent",
        "PipelineTool",
        "ReactAgent",
        "ReactCodeAgent",
@ -1322,6 +1323,13 @@ else:
            "WhisperTimeStampLogitsProcessor",
        ]
    )
+
+    # PyTorch domain libraries integration
+    _import_structure["integrations.executorch"] = [
+        "TorchExportableModuleWithStaticCache",
+        "convert_and_export_with_cache",
+    ]
+
    _import_structure["modeling_flash_attention_utils"] = []
    _import_structure["modeling_outputs"] = []
    _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"]
@ -1499,7 +1507,6 @@ else:
            "BertForQuestionAnswering",
            "BertForSequenceClassification",
            "BertForTokenClassification",
-            "BertLayer",
            "BertLMHeadModel",
            "BertModel",
            "BertPreTrainedModel",
@ -1523,7 +1530,6 @@ else:
            "BigBirdForQuestionAnswering",
            "BigBirdForSequenceClassification",
            "BigBirdForTokenClassification",
-            "BigBirdLayer",
            "BigBirdModel",
            "BigBirdPreTrainedModel",
            "load_tf_weights_in_big_bird",
@ -1642,7 +1648,6 @@ else:
            "CanineForQuestionAnswering",
            "CanineForSequenceClassification",
            "CanineForTokenClassification",
-            "CanineLayer",
            "CanineModel",
            "CaninePreTrainedModel",
            "load_tf_weights_in_canine",
@ -1729,7 +1734,6 @@ else:
            "ConvBertForQuestionAnswering",
            "ConvBertForSequenceClassification",
            "ConvBertForTokenClassification",
-            "ConvBertLayer",
            "ConvBertModel",
            "ConvBertPreTrainedModel",
            "load_tf_weights_in_convbert",
@ -1958,7 +1962,6 @@ else:
            "QDQBertForQuestionAnswering",
            "QDQBertForSequenceClassification",
            "QDQBertForTokenClassification",
-            "QDQBertLayer",
            "QDQBertLMHeadModel",
            "QDQBertModel",
            "QDQBertPreTrainedModel",
@ -2210,7 +2213,6 @@ else:
            "FNetForQuestionAnswering",
            "FNetForSequenceClassification",
            "FNetForTokenClassification",
-            "FNetLayer",
            "FNetModel",
            "FNetPreTrainedModel",
        ]
@ -2311,7 +2313,6 @@ else:
            "GPTNeoXForQuestionAnswering",
            "GPTNeoXForSequenceClassification",
            "GPTNeoXForTokenClassification",
-            "GPTNeoXLayer",
            "GPTNeoXModel",
            "GPTNeoXPreTrainedModel",
        ]
@ -2319,7 +2320,6 @@ else:
    _import_structure["models.gpt_neox_japanese"].extend(
        [
            "GPTNeoXJapaneseForCausalLM",
-            "GPTNeoXJapaneseLayer",
            "GPTNeoXJapaneseModel",
            "GPTNeoXJapanesePreTrainedModel",
        ]
@ -2551,7 +2551,6 @@ else:
            "LongformerForTokenClassification",
            "LongformerModel",
            "LongformerPreTrainedModel",
-            "LongformerSelfAttention",
        ]
    )
    _import_structure["models.longt5"].extend(
@ -2584,7 +2583,6 @@ else:
            "LxmertModel",
            "LxmertPreTrainedModel",
            "LxmertVisualFeatureEncoder",
-            "LxmertXLayer",
        ]
    )
    _import_structure["models.m2m_100"].extend(
@ -2608,7 +2606,9 @@ else:
            "Mamba2PreTrainedModel",
        ]
    )
-    _import_structure["models.marian"].extend(["MarianForCausalLM", "MarianModel", "MarianMTModel"])
+    _import_structure["models.marian"].extend(
+        ["MarianForCausalLM", "MarianModel", "MarianMTModel", "MarianPreTrainedModel"]
+    )
    _import_structure["models.markuplm"].extend(
        [
            "MarkupLMForQuestionAnswering",
@ -2691,7 +2691,6 @@ else:
            "MobileBertForQuestionAnswering",
            "MobileBertForSequenceClassification",
            "MobileBertForTokenClassification",
-            "MobileBertLayer",
            "MobileBertModel",
            "MobileBertPreTrainedModel",
            "load_tf_weights_in_mobilebert",
@ -2737,7 +2736,6 @@ else:
            "MPNetForQuestionAnswering",
            "MPNetForSequenceClassification",
            "MPNetForTokenClassification",
-            "MPNetLayer",
            "MPNetModel",
            "MPNetPreTrainedModel",
        ]
@ -2827,7 +2825,6 @@ else:
            "NystromformerForQuestionAnswering",
            "NystromformerForSequenceClassification",
            "NystromformerForTokenClassification",
-            "NystromformerLayer",
            "NystromformerModel",
            "NystromformerPreTrainedModel",
        ]
@ -2941,7 +2938,6 @@ else:
            "PerceiverForMultimodalAutoencoding",
            "PerceiverForOpticalFlow",
            "PerceiverForSequenceClassification",
-            "PerceiverLayer",
            "PerceiverModel",
            "PerceiverPreTrainedModel",
        ]
@ -3077,11 +3073,9 @@ else:
    )
    _import_structure["models.reformer"].extend(
        [
-            "ReformerAttention",
            "ReformerForMaskedLM",
            "ReformerForQuestionAnswering",
            "ReformerForSequenceClassification",
-            "ReformerLayer",
            "ReformerModel",
            "ReformerModelWithLMHead",
            "ReformerPreTrainedModel",
@ -3102,7 +3096,6 @@ else:
            "RemBertForQuestionAnswering",
            "RemBertForSequenceClassification",
            "RemBertForTokenClassification",
-            "RemBertLayer",
            "RemBertModel",
            "RemBertPreTrainedModel",
            "load_tf_weights_in_rembert",
@ -3149,7 +3142,6 @@ else:
            "RoCBertForQuestionAnswering",
            "RoCBertForSequenceClassification",
            "RoCBertForTokenClassification",
-            "RoCBertLayer",
            "RoCBertModel",
            "RoCBertPreTrainedModel",
            "load_tf_weights_in_roc_bert",
@ -3163,7 +3155,6 @@ else:
            "RoFormerForQuestionAnswering",
            "RoFormerForSequenceClassification",
            "RoFormerForTokenClassification",
-            "RoFormerLayer",
            "RoFormerModel",
            "RoFormerPreTrainedModel",
            "load_tf_weights_in_roformer",
@ -3220,7 +3211,6 @@ else:
            "SegformerDecodeHead",
            "SegformerForImageClassification",
            "SegformerForSemanticSegmentation",
-            "SegformerLayer",
            "SegformerModel",
            "SegformerPreTrainedModel",
        ]
@ -3279,7 +3269,6 @@ else:
        [
            "SplinterForPreTraining",
            "SplinterForQuestionAnswering",
-            "SplinterLayer",
            "SplinterModel",
            "SplinterPreTrainedModel",
        ]
@ -3292,7 +3281,6 @@ else:
            "SqueezeBertForSequenceClassification",
            "SqueezeBertForTokenClassification",
            "SqueezeBertModel",
-            "SqueezeBertModule",
            "SqueezeBertPreTrainedModel",
        ]
    )
@ -3491,7 +3479,6 @@ else:
            "ViltForMaskedLM",
            "ViltForQuestionAnswering",
            "ViltForTokenClassification",
-            "ViltLayer",
            "ViltModel",
            "ViltPreTrainedModel",
        ]
@ -3511,7 +3498,6 @@ else:
            "VisualBertForQuestionAnswering",
            "VisualBertForRegionToPhraseAlignment",
            "VisualBertForVisualReasoning",
-            "VisualBertLayer",
            "VisualBertModel",
            "VisualBertPreTrainedModel",
        ]
@ -3527,7 +3513,6 @@ else:
    _import_structure["models.vit_mae"].extend(
        [
            "ViTMAEForPreTraining",
-            "ViTMAELayer",
            "ViTMAEModel",
            "ViTMAEPreTrainedModel",
        ]
@ -3707,7 +3692,6 @@ else:
            "YosoForQuestionAnswering",
            "YosoForSequenceClassification",
            "YosoForTokenClassification",
-            "YosoLayer",
            "YosoModel",
            "YosoPreTrainedModel",
        ]
@ -3854,7 +3838,6 @@ else:
    )
    _import_structure["models.bert"].extend(
        [
-            "TFBertEmbeddings",
            "TFBertForMaskedLM",
            "TFBertForMultipleChoice",
            "TFBertForNextSentencePrediction",
@ -3920,7 +3903,6 @@ else:
            "TFConvBertForQuestionAnswering",
            "TFConvBertForSequenceClassification",
            "TFConvBertForTokenClassification",
-            "TFConvBertLayer",
            "TFConvBertModel",
            "TFConvBertPreTrainedModel",
        ]
@ -4151,7 +4133,6 @@ else:
            "TFLongformerForTokenClassification",
            "TFLongformerModel",
            "TFLongformerPreTrainedModel",
-            "TFLongformerSelfAttention",
        ]
    )
    _import_structure["models.lxmert"].extend(
@ -4252,7 +4233,6 @@ else:
            "TFRemBertForQuestionAnswering",
            "TFRemBertForSequenceClassification",
            "TFRemBertForTokenClassification",
-            "TFRemBertLayer",
            "TFRemBertModel",
            "TFRemBertPreTrainedModel",
        ]
@ -4298,7 +4278,6 @@ else:
            "TFRoFormerForQuestionAnswering",
            "TFRoFormerForSequenceClassification",
            "TFRoFormerForTokenClassification",
-            "TFRoFormerLayer",
            "TFRoFormerModel",
            "TFRoFormerPreTrainedModel",
        ]
@ -4826,6 +4805,7 @@ if TYPE_CHECKING:
        Agent,
        CodeAgent,
        HfApiEngine,
+        ManagedAgent,
        PipelineTool,
        ReactAgent,
        ReactCodeAgent,
@ -5827,7 +5807,8 @@ if TYPE_CHECKING:
        from .models.llama import LlamaTokenizer
        from .models.m2m_100 import M2M100Tokenizer
        from .models.marian import MarianTokenizer
-        from .models.mbart import MBart50Tokenizer, MBartTokenizer
+        from .models.mbart import MBartTokenizer
+        from .models.mbart50 import MBart50Tokenizer
        from .models.mluke import MLukeTokenizer
        from .models.mt5 import MT5Tokenizer
        from .models.nllb import NllbTokenizer
@ -6147,6 +6128,10 @@ if TYPE_CHECKING:
            WatermarkLogitsProcessor,
            WhisperTimeStampLogitsProcessor,
        )
+        from .integrations.executorch import (
+            TorchExportableModuleWithStaticCache,
+            convert_and_export_with_cache,
+        )
        from .modeling_rope_utils import ROPE_INIT_FUNCTIONS
        from .modeling_utils import PreTrainedModel
        from .models.albert import (
@ -6298,7 +6283,6 @@ if TYPE_CHECKING:
            BertForQuestionAnswering,
            BertForSequenceClassification,
            BertForTokenClassification,
-            BertLayer,
            BertLMHeadModel,
            BertModel,
            BertPreTrainedModel,
@ -6318,7 +6302,6 @@ if TYPE_CHECKING:
            BigBirdForQuestionAnswering,
            BigBirdForSequenceClassification,
            BigBirdForTokenClassification,
-            BigBirdLayer,
            BigBirdModel,
            BigBirdPreTrainedModel,
            load_tf_weights_in_big_bird,
@ -6413,7 +6396,6 @@ if TYPE_CHECKING:
            CanineForQuestionAnswering,
            CanineForSequenceClassification,
            CanineForTokenClassification,
-            CanineLayer,
            CanineModel,
            CaninePreTrainedModel,
            load_tf_weights_in_canine,
@ -6486,7 +6468,6 @@ if TYPE_CHECKING:
            ConvBertForQuestionAnswering,
            ConvBertForSequenceClassification,
            ConvBertForTokenClassification,
-            ConvBertLayer,
            ConvBertModel,
            ConvBertPreTrainedModel,
            load_tf_weights_in_convbert,
@ -6671,7 +6652,6 @@ if TYPE_CHECKING:
            QDQBertForQuestionAnswering,
            QDQBertForSequenceClassification,
            QDQBertForTokenClassification,
-            QDQBertLayer,
            QDQBertLMHeadModel,
            QDQBertModel,
            QDQBertPreTrainedModel,
@ -6870,7 +6850,6 @@ if TYPE_CHECKING:
            FNetForQuestionAnswering,
            FNetForSequenceClassification,
            FNetForTokenClassification,
-            FNetLayer,
            FNetModel,
            FNetPreTrainedModel,
        )
@ -6958,13 +6937,11 @@ if TYPE_CHECKING:
            GPTNeoXForQuestionAnswering,
            GPTNeoXForSequenceClassification,
            GPTNeoXForTokenClassification,
-            GPTNeoXLayer,
            GPTNeoXModel,
            GPTNeoXPreTrainedModel,
        )
        from .models.gpt_neox_japanese import (
            GPTNeoXJapaneseForCausalLM,
-            GPTNeoXJapaneseLayer,
            GPTNeoXJapaneseModel,
            GPTNeoXJapanesePreTrainedModel,
        )
@ -7140,7 +7117,6 @@ if TYPE_CHECKING:
            LongformerForTokenClassification,
            LongformerModel,
            LongformerPreTrainedModel,
-            LongformerSelfAttention,
        )
        from .models.longt5 import (
            LongT5EncoderModel,
@ -7167,7 +7143,6 @@ if TYPE_CHECKING:
            LxmertModel,
            LxmertPreTrainedModel,
            LxmertVisualFeatureEncoder,
-            LxmertXLayer,
        )
        from .models.m2m_100 import (
            M2M100ForConditionalGeneration,
@ -7184,7 +7159,7 @@ if TYPE_CHECKING:
            Mamba2Model,
            Mamba2PreTrainedModel,
        )
-        from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel
+        from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel, MarianPreTrainedModel
        from .models.markuplm import (
            MarkupLMForQuestionAnswering,
            MarkupLMForSequenceClassification,
@ -7250,7 +7225,6 @@ if TYPE_CHECKING:
            MobileBertForQuestionAnswering,
            MobileBertForSequenceClassification,
            MobileBertForTokenClassification,
-            MobileBertLayer,
            MobileBertModel,
            MobileBertPreTrainedModel,
            load_tf_weights_in_mobilebert,
@ -7286,7 +7260,6 @@ if TYPE_CHECKING:
            MPNetForQuestionAnswering,
            MPNetForSequenceClassification,
            MPNetForTokenClassification,
-            MPNetLayer,
            MPNetModel,
            MPNetPreTrainedModel,
        )
@ -7358,7 +7331,6 @@ if TYPE_CHECKING:
            NystromformerForQuestionAnswering,
            NystromformerForSequenceClassification,
            NystromformerForTokenClassification,
-            NystromformerLayer,
            NystromformerModel,
            NystromformerPreTrainedModel,
        )
@ -7446,7 +7418,6 @@ if TYPE_CHECKING:
            PerceiverForMultimodalAutoencoding,
            PerceiverForOpticalFlow,
            PerceiverForSequenceClassification,
-            PerceiverLayer,
            PerceiverModel,
            PerceiverPreTrainedModel,
        )
@ -7548,11 +7519,9 @@ if TYPE_CHECKING:
            RecurrentGemmaPreTrainedModel,
        )
        from .models.reformer import (
-            ReformerAttention,
            ReformerForMaskedLM,
            ReformerForQuestionAnswering,
            ReformerForSequenceClassification,
-            ReformerLayer,
            ReformerModel,
            ReformerModelWithLMHead,
            ReformerPreTrainedModel,
@ -7569,7 +7538,6 @@ if TYPE_CHECKING:
            RemBertForQuestionAnswering,
            RemBertForSequenceClassification,
            RemBertForTokenClassification,
-            RemBertLayer,
            RemBertModel,
            RemBertPreTrainedModel,
            load_tf_weights_in_rembert,
@ -7608,7 +7576,6 @@ if TYPE_CHECKING:
            RoCBertForQuestionAnswering,
            RoCBertForSequenceClassification,
            RoCBertForTokenClassification,
-            RoCBertLayer,
            RoCBertModel,
            RoCBertPreTrainedModel,
            load_tf_weights_in_roc_bert,
@ -7620,7 +7587,6 @@ if TYPE_CHECKING:
            RoFormerForQuestionAnswering,
            RoFormerForSequenceClassification,
            RoFormerForTokenClassification,
-            RoFormerLayer,
            RoFormerModel,
            RoFormerPreTrainedModel,
            load_tf_weights_in_roformer,
@ -7665,7 +7631,6 @@ if TYPE_CHECKING:
            SegformerDecodeHead,
            SegformerForImageClassification,
            SegformerForSemanticSegmentation,
-            SegformerLayer,
            SegformerModel,
            SegformerPreTrainedModel,
        )
@ -7710,7 +7675,6 @@ if TYPE_CHECKING:
        from .models.splinter import (
            SplinterForPreTraining,
            SplinterForQuestionAnswering,
-            SplinterLayer,
            SplinterModel,
            SplinterPreTrainedModel,
        )
@ -7721,7 +7685,6 @@ if TYPE_CHECKING:
            SqueezeBertForSequenceClassification,
            SqueezeBertForTokenClassification,
            SqueezeBertModel,
-            SqueezeBertModule,
            SqueezeBertPreTrainedModel,
        )
        from .models.stablelm import (
@ -7870,7 +7833,6 @@ if TYPE_CHECKING:
            ViltForMaskedLM,
            ViltForQuestionAnswering,
            ViltForTokenClassification,
-            ViltLayer,
            ViltModel,
            ViltPreTrainedModel,
        )
@ -7886,7 +7848,6 @@ if TYPE_CHECKING:
            VisualBertForQuestionAnswering,
            VisualBertForRegionToPhraseAlignment,
            VisualBertForVisualReasoning,
-            VisualBertLayer,
            VisualBertModel,
            VisualBertPreTrainedModel,
        )
@ -7898,7 +7859,6 @@ if TYPE_CHECKING:
        )
        from .models.vit_mae import (
            ViTMAEForPreTraining,
-            ViTMAELayer,
            ViTMAEModel,
            ViTMAEPreTrainedModel,
        )
@ -8040,7 +8000,6 @@ if TYPE_CHECKING:
            YosoForQuestionAnswering,
            YosoForSequenceClassification,
            YosoForTokenClassification,
-            YosoLayer,
            YosoModel,
            YosoPreTrainedModel,
        )
@ -8174,7 +8133,6 @@ if TYPE_CHECKING:
            TFBartPretrainedModel,
        )
        from .models.bert import (
-            TFBertEmbeddings,
            TFBertForMaskedLM,
            TFBertForMultipleChoice,
            TFBertForNextSentencePrediction,
@ -8228,7 +8186,6 @@ if TYPE_CHECKING:
            TFConvBertForQuestionAnswering,
            TFConvBertForSequenceClassification,
            TFConvBertForTokenClassification,
-            TFConvBertLayer,
            TFConvBertModel,
            TFConvBertPreTrainedModel,
        )
@ -8413,7 +8370,6 @@ if TYPE_CHECKING:
            TFLongformerForTokenClassification,
            TFLongformerModel,
            TFLongformerPreTrainedModel,
-            TFLongformerSelfAttention,
        )
        from .models.lxmert import (
            TFLxmertForPreTraining,
@ -8503,7 +8459,6 @@ if TYPE_CHECKING:
            TFRemBertForQuestionAnswering,
            TFRemBertForSequenceClassification,
            TFRemBertForTokenClassification,
-            TFRemBertLayer,
            TFRemBertModel,
            TFRemBertPreTrainedModel,
        )
@ -8541,7 +8496,6 @@ if TYPE_CHECKING:
            TFRoFormerForQuestionAnswering,
            TFRoFormerForSequenceClassification,
            TFRoFormerForTokenClassification,
-            TFRoFormerLayer,
            TFRoFormerModel,
            TFRoFormerPreTrainedModel,
        )
--- a/src/transformers/agents/init.py
+++ b/src/transformers/agents/init.py
@ -39,7 +39,7 @@ else:
    _import_structure["default_tools"] = ["FinalAnswerTool", "PythonInterpreterTool"]
    _import_structure["document_question_answering"] = ["DocumentQuestionAnsweringTool"]
    _import_structure["image_question_answering"] = ["ImageQuestionAnsweringTool"]
-    _import_structure["search"] = ["DuckDuckGoSearchTool"]
+    _import_structure["search"] = ["DuckDuckGoSearchTool", "VisitWebpageTool"]
    _import_structure["speech_to_text"] = ["SpeechToTextTool"]
    _import_structure["text_to_speech"] = ["TextToSpeechTool"]
    _import_structure["translation"] = ["TranslationTool"]
@ -59,7 +59,7 @@ if TYPE_CHECKING:
        from .default_tools import FinalAnswerTool, PythonInterpreterTool
        from .document_question_answering import DocumentQuestionAnsweringTool
        from .image_question_answering import ImageQuestionAnsweringTool
-        from .search import DuckDuckGoSearchTool
+        from .search import DuckDuckGoSearchTool, VisitWebpageTool
        from .speech_to_text import SpeechToTextTool
        from .text_to_speech import TextToSpeechTool
        from .translation import TranslationTool
--- a/src/transformers/agents/search.py
+++ b/src/transformers/agents/search.py
@ -14,6 +14,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
+
+import requests
+from requests.exceptions import RequestException
+
 from .tools import Tool


@ -29,7 +34,44 @@ class DuckDuckGoSearchTool(Tool):
            from duckduckgo_search import DDGS
        except ImportError:
            raise ImportError(
-                "You must install package `duckduckgo_search`: for instance run `pip install duckduckgo-search`."
+                "You must install package `duckduckgo_search` to run this tool: for instance run `pip install duckduckgo-search`."
            )
        results = DDGS().text(query, max_results=7)
        return results
+
+
+class VisitWebpageTool(Tool):
+    name = "visit_webpage"
+    description = "Visits a wbepage at the given url and returns its content as a markdown string."
+    inputs = {
+        "url": {
+            "type": "text",
+            "description": "The url of the webpage to visit.",
+        }
+    }
+    output_type = "text"
+
+    def forward(self, url: str) -> str:
+        try:
+            from markdownify import markdownify
+        except ImportError:
+            raise ImportError(
+                "You must install package `markdownify` to run this tool: for instance run `pip install markdownify`."
+            )
+        try:
+            # Send a GET request to the URL
+            response = requests.get(url)
+            response.raise_for_status()  # Raise an exception for bad status codes
+
+            # Convert the HTML content to Markdown
+            markdown_content = markdownify(response.text).strip()
+
+            # Remove multiple line breaks
+            markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
+
+            return markdown_content
+
+        except RequestException as e:
+            return f"Error fetching the webpage: {str(e)}"
+        except Exception as e:
+            return f"An unexpected error occurred: {str(e)}"
--- a/src/transformers/cache_utils.py
+++ b/src/transformers/cache_utils.py
@ -293,6 +293,46 @@ class QuantizedCacheConfig(CacheConfig):
            )


+@dataclass
+class StaticCacheConfig(CacheConfig):
+    """
+    Configuration class for static cache settings.
+    """
+
+    cache_implementation = "static"
+
+    def __init__(self, batch_size: int, max_cache_len: int, device="cpu"):
+        self.batch_size = batch_size
+        self.max_cache_len = max_cache_len
+        self.device = device
+
+    def validate(self):
+        """Validates if the arguments passed are correct"""
+
+        incorrect_arg_msg = (
+            "Some of the keys in `cache_config` are defined incorrectly. `{key}` should be {correct_value}` "
+            "but found {found_value}"
+        )
+
+        if self.batch_size <= 0:
+            raise ValueError(
+                incorrect_arg_msg.format(
+                    key="batch_size",
+                    correct_value="> 0",
+                    found_value=self.batch_size,
+                ),
+            )
+
+        if self.max_cache_len <= 0:
+            raise ValueError(
+                incorrect_arg_msg.format(
+                    key="max_cache_len",
+                    correct_value="> 0",
+                    found_value=self.max_cache_len,
+                ),
+            )
+
+
 class DynamicCache(Cache):
    """
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@ -565,6 +565,8 @@ class PretrainedConfig(PushToHubMixin):
        original_kwargs = copy.deepcopy(kwargs)
        # Get config dict associated with the base config file
        config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
+        if config_dict is None:
+            return {}, kwargs
        if "_commit_hash" in config_dict:
            original_kwargs["_commit_hash"] = config_dict["_commit_hash"]

@ -635,6 +637,8 @@ class PretrainedConfig(PushToHubMixin):
                    subfolder=subfolder,
                    _commit_hash=commit_hash,
                )
+                if resolved_config_file is None:
+                    return None, kwargs
                commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
            except EnvironmentError:
                # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@ -26,10 +26,13 @@ from packaging import version
 from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
 from tokenizers.models import BPE, Unigram, WordPiece

-from .utils import is_protobuf_available, requires_backends
+from .utils import is_protobuf_available, logging, requires_backends
 from .utils.import_utils import PROTOBUF_IMPORT_ERROR


+logger = logging.get_logger(__name__)
+
+
 def import_protobuf(error_message=""):
    if is_protobuf_available():
        import google.protobuf
@ -1451,12 +1454,15 @@ class TikTokenConverter:
        vocab_file=None,
        pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""",
        add_prefix_space=False,
+        additional_special_tokens=None,
        *args,
+        **kwargs,
    ):
        super().__init__(*args)
        self.vocab_file = vocab_file
        self.pattern = pattern
        self.add_prefix_space = add_prefix_space
+        self.additional_special_tokens = additional_special_tokens

    def extract_vocab_merges_from_model(self, tiktoken_url: str):
        try:
@ -1505,7 +1511,10 @@ class TikTokenConverter:
            ]
        )
        tokenizer.decoder = decoders.ByteLevel()
+        tokenizer.add_special_tokens(self.additional_special_tokens)
+
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=False)
+
        return tokenizer


@ -1566,10 +1575,11 @@ SLOW_TO_FAST_CONVERTERS = {
    "LlamaTokenizer": LlamaConverter,
    "CodeLlamaTokenizer": LlamaConverter,
    "GemmaTokenizer": GemmaConvert,
+    "Phi3Tokenizer": LlamaConverter,
 }


-def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
+def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer:
    """
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

@ -1577,6 +1587,8 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
+       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
+            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
@ -1584,14 +1596,20 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
    """

    tokenizer_class_name = transformer_tokenizer.__class__.__name__
+    if tokenizer_class_name in SLOW_TO_FAST_CONVERTERS and not from_tiktoken:
+        converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
+        return converter_class(transformer_tokenizer).converted()

-    if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS:
-        raise ValueError(
-            f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance."
-            " No converter was found. Currently available slow->fast convertors:"
-            f" {list(SLOW_TO_FAST_CONVERTERS.keys())}"
-        )
-
-    converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]
-
-    return converter_class(transformer_tokenizer).converted()
+    else:
+        try:
+            logger.info("Converting from Tiktoken")
+            return TikTokenConverter(
+                vocab_file=transformer_tokenizer.vocab_file,
+                additional_special_tokens=transformer_tokenizer.additional_special_tokens,
+            ).converted()
+        except Exception:
+            raise ValueError(
+                f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path "
+                f"with a SentencePiece tokenizer.model file."
+                f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
+            )
--- a/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
+++ b/src/transformers/convert_slow_tokenizers_checkpoints_to_fast.py
@ -28,7 +28,11 @@ logging.set_verbosity_info()
 logger = logging.get_logger(__name__)


-TOKENIZER_CLASSES = {name: getattr(transformers, name + "Fast") for name in SLOW_TO_FAST_CONVERTERS}
+TOKENIZER_CLASSES = {
+    # Phi3 uses Llama tokenizer
+    name: getattr(transformers, "LlamaTokenizerFast" if name == "Phi3Tokenizer" else name + "Fast")
+    for name in SLOW_TO_FAST_CONVERTERS
+}


 def convert_slow_checkpoint_to_fast(tokenizer_name, checkpoint_name, dump_path, force_download):
--- a/Show More
+++ b/Show More