Merge remote-tracking branch 'origin/main' into serve-quantization

rm check for now
style
2025-10-21 17:48:57 +08:00 · 2025-10-15 12:19:48 +00:00 · 2025-10-01 16:44:42 +00:00 · 2025-10-01 16:28:58 +00:00 · 2025-10-01 16:28:29 +00:00 · 2025-10-01 15:54:05 +00:00
1145 changed files with 22665 additions and 19023 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -1,10 +1,7 @@
 name: Self-hosted runner (benchmark)

 on:
-  push:
-    branches: [main]
-  pull_request:
-    types: [ opened, labeled, reopened, synchronize ]
+  workflow_dispatch:

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
@ -12,8 +9,6 @@ concurrency:

 env:
  HF_HOME: /mnt/cache
-  DATASET_ID: hf-benchmarks/transformers
-  MODEL_ID: meta-llama/Llama-3.1-8B-Instruct

 jobs:
  benchmark:
@ -36,12 +31,26 @@ jobs:
        with:
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

+      - name: Install libpq-dev & psql
+        run: |
+          apt update
+          apt install -y libpq-dev postgresql-client
+
      - name: Install benchmark script dependencies
-        run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
+        run: python3 -m pip install -r benchmark/requirements.txt

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
+
+      - name: Run database init script
+        run: |
+          psql -f benchmark/utils/init_db.sql
+        env:
+          PGDATABASE: metrics
+          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
+          PGUSER: transformers_benchmarks
+          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}

      - name: Run benchmark
        run: |
@ -52,11 +61,13 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
-          python3 benchmark_v2/run_benchmarks.py -b 32 -s 128 -n 256 --branch-name "$BRANCH_NAME" --commit-id "$commit_id" --commit-message "$commit_msg" --model-id "$MODEL_ID" --log-level INFO --push-result-to-dataset "$DATASET_ID"
+          python3 benchmark/benchmarks_entrypoint.py "huggingface/transformers" "$BRANCH_NAME" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
-          PUSH_TO_HUB_TOKEN: ${{ secrets.PUSH_TO_HUB_TOKEN }}
          # Enable this to see debug logs
          # HF_HUB_VERBOSITY: debug
          # TRANSFORMERS_VERBOSITY: debug
+          PGHOST: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGHOST }}
+          PGUSER: transformers_benchmarks
+          PGPASSWORD: ${{ secrets.TRANSFORMERS_BENCHMARKS_PGPASSWORD }}
          BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@ -3,7 +3,7 @@ name: Build docker images (scheduled)
 on:
  push:
    branches:
-      - fix_docker_file
+      - build_ci_docker_image*
  repository_dispatch:
  workflow_dispatch:
  workflow_call:
@ -42,6 +42,315 @@ jobs:
        with:
          context: ./docker/transformers-all-latest-gpu
          build-args: |
-            REF=fix_docker_file
+            REF=main
          push: true
-          tags: huggingface/transformers-all-latest-gpu-test
+          tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-all-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-all-latest-gpu-push-ci
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-torch-deepspeed-docker:
+    name: "Latest PyTorch + DeepSpeed"
+    runs-on:
+      group: aws-g4dn-2xlarge-cache
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }}
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}}
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`)
+  latest-torch-deepspeed-docker-for-push-ci-daily-build:
+    name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)"
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  doc-builder:
+    name: "Doc builder"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-doc-builder
+          push: true
+          tags: huggingface/transformers-doc-builder
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the huggingface/transformers-doc-builder docker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-pytorch:
+    name: "Latest PyTorch [dev]"
+    # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-gpu
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-pytorch-amd:
+    name: "Latest PyTorch (AMD) [dev]"
+    runs-on:
+      group: aws-highcpu-32-priv
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-amd-gpu-push-ci
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-pytorch-deepspeed-amd:
+    name: "PyTorch + DeepSpeed (AMD) [dev]"
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+      # Push CI images still need to be re-built daily
+      -
+        name: Build and push (for Push CI) in a daily basis
+        # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+        # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+        if: inputs.image_postfix != '-push-ci'
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  latest-quantization-torch-docker:
+    name: "Latest Pytorch + Quantization [dev]"
+     # Push CI doesn't need this image
+    if: inputs.image_postfix != '-push-ci'
+    runs-on:
+      group: aws-general-8-plus
+    steps:
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Check out code
+        uses: actions/checkout@v4
+      -
+        name: Login to DockerHub
+        uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v5
+        with:
+          context: ./docker/transformers-quantization-latest-gpu
+          build-args: |
+            REF=main
+          push: true
+          tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }}
+
+      - name: Post to Slack
+        if: always()
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
+        with:
+          slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }}
+          title: 🤗 Results of the transformers-quantization-latest-gpu build
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/check_failed_tests.yml
+++ b/.github/workflows/check_failed_tests.yml
@ -41,14 +41,9 @@ env:

 jobs:
  check_new_failures:
-    name: "Find commits for new failing tests"
-    strategy:
-      matrix:
-        run_idx: [1]
+    name: " "
    runs-on:
      group: aws-g5-4xlarge-cache
-    outputs:
-      process: ${{ steps.check_file.outputs.process }}
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -59,17 +54,14 @@ jobs:
          path: /transformers/ci_results_${{ inputs.job }}

      - name: Check file
-        id: check_file
        working-directory: /transformers
        run: |
          if [ -f ci_results_${{ inputs.job }}/new_failures.json ]; then
            echo "`ci_results_${{ inputs.job }}/new_failures.json` exists, continue ..."
            echo "process=true" >> $GITHUB_ENV
-            echo "process=true" >> $GITHUB_OUTPUT
          else
            echo "`ci_results_${{ inputs.job }}/new_failures.json` doesn't exist, abort."
            echo "process=false" >> $GITHUB_ENV
-            echo "process=false" >> $GITHUB_OUTPUT
          fi

      - uses: actions/download-artifact@v4
@ -126,10 +118,6 @@ jobs:
        run: |
          python3 utils/print_env.py

-      - name: Install pytest-flakefinder
-        if: ${{ env.process == 'true' }}
-        run: python3 -m pip install pytest-flakefinder
-
      - name: Show installed libraries and their versions
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
@ -138,63 +126,25 @@ jobs:
      - name: Check failed tests
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
-        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_${{ inputs.job }}/new_failures.json --output_file new_failures_with_bad_commit.json

      - name: Show results
        working-directory: /transformers
        if: ${{ env.process == 'true' }}
        run: |
-          ls -l new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
-          cat new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
+          ls -l new_failures_with_bad_commit.json
+          cat new_failures_with_bad_commit.json

-      - name: Upload artifacts
-        uses: actions/upload-artifact@v4
-        with:
-          name: new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}
-          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}_${{ matrix.run_idx }}.json
-
-  process_new_failures_with_commit_info:
-    name: "process bad commit reports"
-    needs: check_new_failures
-    if: needs.check_new_failures.outputs.process == 'true'
-    runs-on:
-      group: aws-g5-4xlarge-cache
-    container:
-      image: ${{ inputs.docker }}
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - uses: actions/download-artifact@v4
-        with:
-          name: ci_results_${{ inputs.job }}
-          path: /transformers/ci_results_${{ inputs.job }}
-
-      - uses: actions/download-artifact@v4
-        with:
-          pattern: new_failures_with_bad_commit_${{ inputs.job }}*
-          path: /transformers/new_failures_with_bad_commit_${{ inputs.job }}
-          merge-multiple: true
-
-      - name: Check files
+      - name: Checkout back
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: |
-          ls -la /transformers
-          ls -la /transformers/new_failures_with_bad_commit_${{ inputs.job }}
-
-      # Currently, we only run with a single runner by using `run_idx: [1]`. We might try to run with multiple runners
-      # to further reduce the false positive caused by flaky tests, which requires further processing to merge reports.
-      - name: Merge files
-        shell: bash
-        working-directory: /transformers
-        run: |
-          cp /transformers/new_failures_with_bad_commit_${{ inputs.job }}/new_failures_with_bad_commit_${{ inputs.job }}_1.json new_failures_with_bad_commit.json
-
-      - name: Update clone
-        working-directory: /transformers
-        run: git fetch && git checkout ${{ inputs.commit_sha || github.sha }}
+          git checkout ${{ inputs.start_sha }}

      - name: Process report
        shell: bash
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -206,6 +156,7 @@ jobs:
      - name: Process report
        shell: bash
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        env:
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
@ -220,12 +171,13 @@ jobs:

      - name: Prepare Slack report title
        working-directory: /transformers
+        if: ${{ env.process == 'true' }}
        run: |
          pip install slack_sdk
          echo "title=$(python3 -c 'import sys; sys.path.append("utils"); from utils.notification_service import job_to_test_map; ci_event = "${{ inputs.ci_event }}"; job = "${{ inputs.job }}"; test_name = job_to_test_map[job]; title = f"New failed tests of {ci_event}" + ":" + f" {test_name}"; print(title)')" >> $GITHUB_ENV

      - name: Send processed report
-        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
+        if: ${{ env.process == 'true' && !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
--- a/.github/workflows/pr_build_doc_with_comment.yml
+++ b/.github/workflows/pr_build_doc_with_comment.yml
@ -98,7 +98,7 @@ jobs:
      commit_sha: ${{ needs.get-pr-info.outputs.PR_HEAD_SHA }}
      pr_number: ${{ needs.get-pr-number.outputs.PR_NUMBER }}
      package: transformers
-      languages: ar de en es fr hi it ja ko pt zh
+      languages: ar de en es fr hi it ko pt tr zh ja te

  update_run_status:
    name: Update Check Run Status
--- a/.gitignore
+++ b/.gitignore
@ -98,7 +98,6 @@ celerybeat-schedule
 # Environments
 .env
 .venv
-.venv*
 env/
 venv/
 ENV/
@ -172,6 +171,3 @@ tags

 # modular conversion
 *.modular_backup
-
-# Cursor IDE files
-.cursor/
--- a/benchmark/benches/llama.py
+++ b/benchmark/benches/llama.py
@ -16,6 +16,7 @@ import sys
 from logging import Logger
 from threading import Event, Thread
 from time import perf_counter, sleep
+from typing import Optional


 # Add the parent directory to Python path to import benchmarks_entrypoint
@ -41,7 +42,7 @@ except ImportError:
    GenerationConfig = None
    StaticCache = None

-os.environ["HF_XET_HIGH_PERFORMANCE"] = "1"
+os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 os.environ["TOKENIZERS_PARALLELISM"] = "1"

 # Only set torch precision if torch is available
@ -144,7 +145,7 @@ def run_benchmark(
            q = torch.empty_like(probs_sort).exponential_(1)
            return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int)

-        def logits_to_probs(logits, temperature: float = 1.0, top_k: int | None = None):
+        def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None):
            logits = logits / max(temperature, 1e-5)

            if top_k is not None:
@ -154,7 +155,7 @@ def run_benchmark(
            probs = torch.nn.functional.softmax(logits, dim=-1)
            return probs

-        def sample(logits, temperature: float = 1.0, top_k: int | None = None):
+        def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None):
            probs = logits_to_probs(logits[0, -1], temperature, top_k)
            idx_next = multinomial_sample_one_no_sync(probs)
            return idx_next, probs
--- a/benchmark/requirements.txt
+++ b/benchmark/requirements.txt
@ -2,5 +2,5 @@ gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
 torch>=2.4.0
-hf_xet
+hf_transfer
 pandas>=1.5.0
--- a/benchmark_v2/framework/benchmark_config.py
+++ b/benchmark_v2/framework/benchmark_config.py
@ -1,7 +1,7 @@
 import hashlib
 import json
 import logging
-from typing import Any
+from typing import Any, Optional


 KERNELIZATION_AVAILABLE = False
@ -22,16 +22,16 @@ class BenchmarkConfig:
        self,
        warmup_iterations: int = 5,
        measurement_iterations: int = 20,
-        gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
+        gpu_monitoring: bool = False,  # False by default because it slows down the benchmark by a lot
        batch_size: int = 1,
        sequence_length: int = 128,
        num_tokens_to_generate: int = 128,
        attn_implementation: str = "eager",
-        sdpa_backend: str | None = None,
-        compile_mode: str | None = None,
-        compile_options: dict[str, Any] | None = None,
+        sdpa_backend: Optional[str] = None,
+        compile_mode: Optional[str] = None,
+        compile_options: Optional[dict[str, Any]] = None,
        kernelize: bool = False,
-        name: str | None = None,
+        name: Optional[str] = None,
        skip_validity_check: bool = False,
    ) -> None:
        # Benchmark parameters
@ -104,7 +104,7 @@ class BenchmarkConfig:
            "attn_implementation": self.attn_implementation,
            "sdpa_backend": self.sdpa_backend,
            "compile_mode": self.compile_mode,
-            "compile_options": self.compile_options | {},  # to avoid inplace modification of the original dict
+            "compile_options": self.compile_options,
            "kernelize": self.kernelize,
        }

@ -128,15 +128,15 @@ class BenchmarkConfig:


 def cross_generate_configs(
-    attn_impl_and_sdpa_backend: list[tuple[str, str | None]],
-    compiled_mode: list[str | None],
+    attn_impl_and_sdpa_backend: list[tuple[str, Optional[str]]],
+    compiled_mode: list[Optional[str]],
    kernelized: list[bool],
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
-    gpu_monitoring: bool = True,
+    gpu_monitoring: bool = False,  # this slows down the benchmark by a lot so we disable it by default
 ) -> list[BenchmarkConfig]:
    # Create kwargs common to all configs
    kwargs = {
@ -169,7 +169,7 @@ def generate_all_configs(
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
-    gpu_monitoring: bool = True,
+    gpu_monitoring: bool = False,
 ) -> list[BenchmarkConfig]:
    all_attn_implementations = [
        ("flash_attention_2", None),
@ -191,24 +191,28 @@ def generate_all_configs(
    )


-def generate_main_configs(
+def generate_default_configs(
    warmup_iterations: int = 5,
    measurement_iterations: int = 20,
    batch_size: int = 1,
    sequence_length: int = 128,
    num_tokens_to_generate: int = 128,
+    gpu_monitoring: bool = False,
 ) -> list[BenchmarkConfig]:
-    # Create kwargs common to all configs
-    kwargs = {
-        "warmup_iterations": warmup_iterations,
-        "measurement_iterations": measurement_iterations,
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_tokens_to_generate": num_tokens_to_generate,
-    }
-    return [  # TODO: test max-autotune instead of default
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=False, **kwargs),
-        BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="eager", compile_mode="default", gpu_monitoring=True, **kwargs),
-        BenchmarkConfig(attn_implementation="flash_attention_2", gpu_monitoring=True, **kwargs),
+    all_attn_implementations = [
+        ("flash_attention_2", None),
+        ("eager", None),
+        ("sdpa", "math"),
+        ("sdpa", "flash_attention"),  # note: this one can fail with compile because of attn mask
    ]
+    return cross_generate_configs(
+        attn_impl_and_sdpa_backend=all_attn_implementations,
+        compiled_mode=[None, "max-autotune"],
+        kernelized=[False, KERNELIZATION_AVAILABLE],
+        warmup_iterations=warmup_iterations,
+        measurement_iterations=measurement_iterations,
+        batch_size=batch_size,
+        sequence_length=sequence_length,
+        num_tokens_to_generate=num_tokens_to_generate,
+        gpu_monitoring=gpu_monitoring,
+    )
--- a/benchmark_v2/framework/benchmark_runner.py
+++ b/benchmark_v2/framework/benchmark_runner.py
@ -4,16 +4,13 @@ import logging
 import os
 import pathlib
 import re
-import tempfile
 import time
 from contextlib import nullcontext
 from datetime import datetime
 from queue import Queue
-from typing import Any
+from typing import Any, Optional

 import torch
-from datasets import Dataset
-from huggingface_hub import HfApi
 from tqdm import trange

 from transformers import (
@ -53,8 +50,6 @@ DEFAULT_PROMPT = "\n".join([
    "Its instability ended in the coup of 18 Brumaire and the establishment of the Consulate, with Napoleon Bonaparte as First Consul.",
 ])  # fmt: skip

-PUSH_TO_HUB_TOKEN = os.getenv("PUSH_TO_HUB_TOKEN", None)
-

 def compact_json_numeric_arrays(data: dict):
    # Match arrays that contain only numbers (ints/floats), whitespace, commas, and newlines
@ -79,7 +74,7 @@ def get_git_revision() -> str:
        return git_hash.readline().strip()


-def get_sdpa_backend(backend_name: str | None) -> torch.nn.attention.SDPBackend | None:
+def get_sdpa_backend(backend_name: Optional[str]) -> Optional[torch.nn.attention.SDPBackend]:
    """Get the SDPA backend enum from string name."""
    if backend_name is None:
        return None
@ -125,19 +120,15 @@ def flush_memory():

 class BenchmarkStreamer(BaseStreamer):
    def __init__(self, **kwargs) -> None:
-        self.timeout = kwargs.pop("timeout", 10)
        self.timestamps = []
        self.text_queue = Queue()
-        self.stop_signal = None

    def put(self, value):
        """Receives tokens and logs the timestamp of the generation."""
        self.timestamps.append(time.perf_counter())
-        self.text_queue.put(value)

    def end(self):
        self.timestamps.append(time.perf_counter())
-        self.text_queue.put(self.stop_signal)

    def __iter__(self):
        return self
@ -154,34 +145,25 @@ class BenchmarkRunner:
    """Main benchmark runner that coordinates benchmark execution."""

    def __init__(
-        self,
-        logger: logging.Logger,
-        output_dir: str | None = None,
-        branch_name: str | None = None,
-        commit_id: str | None = None,
-        commit_message: str | None = None,
+        self, logger: logging.Logger, output_dir: str = "benchmark_results", commit_id: Optional[str] = None
    ) -> None:
        # Those stay constant for the whole run
        self.logger = logger
-        if output_dir is None:
-            output_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "benchmark_results")
        self.output_dir = output_dir
-        self.branch_name = branch_name
        self.commit_id = get_git_revision() if commit_id is None else commit_id
-        self.commit_message = commit_message
        os.makedirs(self.output_dir, exist_ok=True)
        self.profile_dir = None
        # Attributes that are reset for each model
        self._setup_for = ""
        # Attributes that are reset for each run
-        self.model: GenerationMixin | None = None
+        self.model: Optional[GenerationMixin] = None

    def cleanup(self) -> None:
        del self.model
        self.model = None
        flush_memory()

-    def setup_benchmark(self, model_id: str, config: BenchmarkConfig) -> None:
+    def setup_one_run(self, model_id: str, config: BenchmarkConfig) -> None:
        # Some attributes only need to be set once per model
        if self._setup_for != model_id:
            self.tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -218,13 +200,10 @@ class BenchmarkRunner:
        self.model = self.model.eval().to(config.device)

        # Kernelize the model if needed
-        if config.kernelize and kernelize is not None and Mode is not None:
+        if config.kernelize:
            self.model = kernelize(self.model, mode=Mode.INFERENCE)

-    def run_benchmark(
-        self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0
-    ) -> dict[str, Any] | None:
-        """Run a single benchmark with the given model ID and config."""
+    def run_one_benchmark(self, model_id: str, config: BenchmarkConfig, num_tokens_to_profile: int = 0) -> None:
        sdpa_ctx = nullcontext()
        if config.attn_implementation == "sdpa":
            sdpa_backend = get_sdpa_backend(config.sdpa_backend)
@ -235,7 +214,7 @@ class BenchmarkRunner:

            # Quick validation: try one measurement first to see if this scenario works
            flush_memory()
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
                max_new_tokens=1, gpu_monitor=None
            )
            if e2e_latency < 0:
@ -252,11 +231,11 @@ class BenchmarkRunner:
            result = BenchmarkResult()
            self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
            for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+                e2e_latency, token_generation_times, decoded_output, gpu_metrics = self.time_generate(
                    max_new_tokens=config.num_tokens_to_generate,
                    gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                )
-                result.accumulate(e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics)
+                result.accumulate(e2e_latency, token_generation_times, decoded_output, gpu_metrics)
            self.logger.info("Benchmarking done. Cleaning up.")

            # Profile if needed
@ -264,12 +243,7 @@ class BenchmarkRunner:
                self.profile_generate(num_tokens_to_profile, config.name)

            return {
-                "metadata": BenchmarkMetadata(
-                    model_id=model_id,
-                    branch_name=self.branch_name,
-                    commit_id=self.commit_id,
-                    commit_message=self.commit_message,
-                ),
+                "metadata": BenchmarkMetadata(model_id=model_id, commit_id=self.commit_id),
                "measurements": result,
                "config": config,
            }
@ -277,8 +251,8 @@ class BenchmarkRunner:
    def time_generate(
        self,
        max_new_tokens: int,
-        gpu_monitor: GPUMonitor | None = None,
-    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
+        gpu_monitor: Optional[GPUMonitor] = None,
+    ) -> tuple[float, list[float], str, Optional[GPURawMetrics]]:
        """Time the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
        # Prepare gpu monitoring if needed
        if gpu_monitor is not None:
@ -303,11 +277,10 @@ class BenchmarkRunner:
            raise RuntimeError(f"Generated {new_tokens} tokens, expected {max_new_tokens}")
        # Decode outputs
        decoded_output = self.tokenizer.decode(outputs[0, input_tokens:], skip_special_tokens=True)
-        shape_and_decoded_output = f"{tuple(outputs.shape)} | {decoded_output}"
        # Compute intermediate quantities
        e2e_latency = wall_time_1 - wall_time_0
        token_generation_times = [t - wall_time_0 for t in streamer.timestamps[1:]]
-        return e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics
+        return e2e_latency, token_generation_times, decoded_output, gpu_metrics

    def profile_generate(self, num_tokens_to_profile: int, config_name: str) -> None:
        """Profile the latency of a call to model.generate() with the given (inputs) and (max_new_tokens)."""
@ -331,8 +304,7 @@ class BenchmarkRunner:
        benchmark_configs: list[BenchmarkConfig],
        num_tokens_to_profile: int = 0,
        pretty_print_summary: bool = True,
-    ) -> tuple[str, dict[str, Any]]:
-        """Run multiple benchmarks for the given model ID and list of benchmark configs."""
+    ) -> dict[str, Any]:
        all_results = {}
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        start_time = time.perf_counter()
@ -351,14 +323,14 @@ class BenchmarkRunner:
                continue

            # Otherwise, run the benchmark
-            self.setup_benchmark(model_id, config)
+            self.setup_one_run(model_id, config)
            self.logger.info(
                f"Running benchmark of model {model_id} with scenario: {config.name} ({i + 1}/{n_configs})"
            )

            # Launch benchmark in a try/except block to avoid stopping the whole run if one benchmark fails
            try:
-                results = self.run_benchmark(model_id, config, num_tokens_to_profile)
+                results = self.run_one_benchmark(model_id, config, num_tokens_to_profile)
                if results is not None:
                    all_results[config.hash] = results

@ -379,13 +351,13 @@ class BenchmarkRunner:
                first_metadata = all_results[first_key]["metadata"].to_dict()
                hardware_info = first_metadata.pop("hardware_info")
                pretty_print_dict(first_metadata | hardware_info, tabs=1)
-            for result in all_results.values():
+            for value in all_results.values():
                print("=" * 100)
-                print(f"Config: {result['config'].infer_name(compact=False)}\n")
-                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
+                print(f"Config: {value['config'].infer_name(compact=False)}\n")
+                value["measurements"].pprint(tabs=1)
            print("=" * 100)

-        return (timestamp, all_results)
+        return all_results

    def save_results(self, model_name: str, results: dict, timestamp: str = "") -> str:
        """Save benchmark results to JSON file."""
@ -414,43 +386,3 @@ class BenchmarkRunner:

        self.logger.info(f"Results saved to {filepath}")
        return filepath
-
-    def push_results_to_hub(self, dataset_id: str, results: dict[Any, Any], timestamp: str) -> None:
-        if PUSH_TO_HUB_TOKEN is None:
-            raise ValueError(
-                "PUSH_TO_HUB_TOKEN is not set, cannot push results to the Hub. When setting dataset_id, please also set the PUSH_TO_HUB_TOKEN environment variable."
-            )
-
-        n_results = len(results)
-        self.logger.info(f"Pushing {n_results} results to: {dataset_id}")
-        rows = []
-        for cfg_hash, entry in results.items():
-            row = {
-                "benchmark_config_hash": cfg_hash,
-                "config": entry["config"].to_dict(),
-                "measurements": entry["measurements"].to_dict(),
-                "metadata": entry["metadata"].to_dict(),
-            }
-            rows.append(row)
-
-        ds = Dataset.from_list(rows)
-        with tempfile.TemporaryDirectory() as tmp:
-            jsonl_path = os.path.join(tmp, "data.jsonl")
-            with open(jsonl_path, "w") as f:
-                json_lines = []
-                for ex in ds:
-                    json_lines.append(json.dumps(ex, ensure_ascii=False))
-                f.write("\n".join(json_lines))
-
-            api = HfApi()
-            # NOTE: we expect the repository to already exist
-            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") if not timestamp else timestamp
-            file_name = f"benchmark_run_{timestamp}.jsonl"
-            api.upload_file(
-                path_or_fileobj=jsonl_path,
-                path_in_repo=file_name,
-                repo_id=dataset_id,
-                repo_type="dataset",
-                token=PUSH_TO_HUB_TOKEN,
-            )
-        self.logger.info(f"Succesfully uploaded results to: {dataset_id}")
--- a/benchmark_v2/framework/data_classes.py
+++ b/benchmark_v2/framework/data_classes.py
@ -1,6 +1,6 @@
 from dataclasses import dataclass
-from datetime import datetime, timezone
-from typing import Any
+from datetime import datetime
+from typing import Any, Optional, Union

 import numpy as np

@ -59,26 +59,19 @@ class BenchmarkMetadata:

    model_id: str
    timestamp: str
-    branch_name: str
    commit_id: str
-    commit_message: str
    hardware_info: HardwareInfo

-    def __init__(self, model_id: str, commit_id: str, branch_name: str = "main", commit_message: str = "") -> None:
+    def __init__(self, model_id: str, commit_id: str):
        self.model_id = model_id
-        self.timestamp = datetime.now(timezone.utc).isoformat()
-        self.branch_name = branch_name
+        self.timestamp = datetime.utcnow().isoformat()
        self.commit_id = commit_id
-        self.commit_message = commit_message
        self.hardware_info = HardwareInfo()

    def to_dict(self) -> dict[str, Any]:
        return {
-            "model_id": self.model_id,
            "timestamp": self.timestamp,
-            "branch_name": self.branch_name,
            "commit_id": self.commit_id,
-            "commit_message": self.commit_message,
            "hardware_info": self.hardware_info.to_dict(),
        }

@ -89,22 +82,22 @@ class BenchmarkResult:
    def __init__(self) -> None:
        self.e2e_latency = []
        self.token_generation_times = []  # time at which each token was generated (relative to start of the generation)
-        self.shape_and_decoded_outputs = []
+        self.decoded_outputs = []
        self.gpu_metrics = []

    def accumulate(
        self,
        e2e_latency: float,
        token_generation_times: list[float],
-        shape_and_decoded_output: str,
-        gpu_metrics: GPURawMetrics | None,
+        decoded_output: str,
+        gpu_metrics: Optional[GPURawMetrics],
    ) -> None:
        self.e2e_latency.append(e2e_latency)
        self.token_generation_times.append(token_generation_times)
-        self.shape_and_decoded_outputs.append(shape_and_decoded_output)
+        self.decoded_outputs.append(decoded_output)
        self.gpu_metrics.append(gpu_metrics)

-    def to_dict(self) -> dict[str, None | int | float]:
+    def to_dict(self) -> dict[str, Union[None, int, float]]:
        # Save GPU metrics as None if it contains only None values
        if all(gm is None for gm in self.gpu_metrics):
            gpu_metrics = None
@ -113,12 +106,12 @@ class BenchmarkResult:
        return {
            "e2e_latency": self.e2e_latency,
            "token_generation_times": self.token_generation_times,
-            "shape_and_decoded_outputs": self.shape_and_decoded_outputs,
+            "decoded_outputs": self.decoded_outputs,
            "gpu_metrics": gpu_metrics,
        }

    @classmethod
-    def from_dict(cls, data: dict[str, None | int | float]) -> "BenchmarkResult":
+    def from_dict(cls, data: dict[str, Union[None, int, float]]) -> "BenchmarkResult":
        # Handle GPU metrics, which is saved as None if it contains only None values
        if data["gpu_metrics"] is None:
            gpu_metrics = [None for _ in range(len(data["e2e_latency"]))]
@ -130,7 +123,7 @@ class BenchmarkResult:
            new_instance.accumulate(
                e2e_latency=data["e2e_latency"][i],
                token_generation_times=data["token_generation_times"][i],
-                shape_and_decoded_output=data["shape_and_decoded_outputs"][i],
+                decoded_output=data["decoded_output"][i],
                gpu_metrics=gpu_metrics[i],
            )
        return new_instance
@ -141,27 +134,19 @@ class BenchmarkResult:
    def get_measured_itl(self) -> list[float]:
        return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]

-    def get_throughput(self, batch_size: int) -> float:
-        return [
-            batch_size * len(dt) / e2e_latency
-            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
-        ]
-
-    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
-        stats_to_collate = [
-            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-        ]
-        if batch_size > 0:
-            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
-            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
-        collated_stats = equalize_lengths_and_collate(stats_to_collate)
-        dict_to_pprint = {
-            "E2E Latency": collated_stats[0],
-            "Time to First Token": collated_stats[1],
-            "Inter-Token Latency": collated_stats[2],
-        }
-        if batch_size > 0:
-            dict_to_pprint["Throughput"] = collated_stats[3]
-        pretty_print_dict(dict_to_pprint, tabs=tabs)
+    def pprint(self, tabs: int = 0) -> None:
+        collated_stats = equalize_lengths_and_collate(
+            [
+                add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+                add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
+                add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
+            ]
+        )
+        pretty_print_dict(
+            {
+                "E2E Latency": collated_stats[0],
+                "Time to First Token": collated_stats[1],
+                "Inter-Token Latency": collated_stats[2],
+            },
+            tabs=tabs,
+        )
--- a/benchmark_v2/framework/hardware_metrics.py
+++ b/benchmark_v2/framework/hardware_metrics.py
@ -7,6 +7,7 @@ import time
 from dataclasses import dataclass
 from enum import Enum
 from logging import Logger
+from typing import Optional, Union

 import gpustat
 import psutil
@ -41,7 +42,7 @@ class HardwareInfo:
        self.cpu_count = psutil.cpu_count()
        self.memory_total_mb = int(psutil.virtual_memory().total / (1024 * 1024))

-    def to_dict(self) -> dict[str, None | int | float | str]:
+    def to_dict(self) -> dict[str, Union[None, int, float, str]]:
        return {
            "gpu_name": self.gpu_name,
            "gpu_memory_total_gb": self.gpu_memory_total_gb,
@ -108,7 +109,7 @@ class GPURawMetrics:
    timestamp_0: float  # in seconds
    monitoring_status: GPUMonitoringStatus

-    def to_dict(self) -> dict[str, None | int | float | str]:
+    def to_dict(self) -> dict[str, Union[None, int, float, str]]:
        return {
            "utilization": self.utilization,
            "memory_used": self.memory_used,
@ -122,7 +123,7 @@ class GPURawMetrics:
 class GPUMonitor:
    """Monitor GPU utilization during benchmark execution."""

-    def __init__(self, sample_interval_sec: float = 0.1, logger: Logger | None = None):
+    def __init__(self, sample_interval_sec: float = 0.1, logger: Optional[Logger] = None):
        self.sample_interval_sec = sample_interval_sec
        self.logger = logger if logger is not None else logging.getLogger(__name__)

--- a/benchmark_v2/requirements.txt
+++ b/benchmark_v2/requirements.txt
@ -4,4 +4,4 @@ gpustat>=1.0.0
 torch>=2.0.0
 transformers>=4.30.0
 datasets>=2.10.0
-huggingface_hub>=0.16.0
+huggingface_hub>=0.16.0 
--- a/benchmark_v2/run_benchmarks.py
+++ b/benchmark_v2/run_benchmarks.py
@ -20,43 +20,31 @@ in the ./benches directory, organizing outputs into model-specific subfolders.

 import argparse
 import logging
+import random
 import sys
 import uuid

-from framework.benchmark_config import BenchmarkConfig, generate_all_configs, generate_main_configs
+from framework.benchmark_config import BenchmarkConfig, generate_all_configs
 from framework.benchmark_runner import BenchmarkRunner


 if __name__ == "__main__":
    # Parse arguments
    parser = argparse.ArgumentParser()
-    parser.add_argument("--output-dir", type=str, default=None, help="Output dir for benchmark results")
+    parser.add_argument("--output-dir", type=str, default="benchmark_results", help="Output dir for benchmark results")
    parser.add_argument("--log-level", type=str, choices=["DEBUG", "INFO", "WARNING", "ERROR"], default="INFO")
    parser.add_argument("--model-id", type=str, help="Specific model ID to benchmark (if supported by benchmarks)")
-    parser.add_argument("--warmup", "-w", type=int, default=3, help="Number of warmup iterations")
-    parser.add_argument("--iterations", "-i", type=int, default=10, help="Number of measurement iterations")
+
+    parser.add_argument("--warmup", type=int, default=5, help="Number of warmup iterations")
+    parser.add_argument("--iterations", type=int, default=20, help="Number of measurement iterations")

    parser.add_argument("--batch-size", "-b", type=int, nargs="+", help="Batch size")
    parser.add_argument("--sequence-length", "-s", type=int, nargs="+", help="Sequence length")
    parser.add_argument("--num-tokens-to-generate", "-n", type=int, nargs="+", help="Number of tokens to generate")

-    parser.add_argument("--cross-generate", action="store_true", help="Cross-generate all combinations of configs")
    parser.add_argument("--num-tokens-to-profile", "-p", type=int, default=0, help="Number of tokens to profile")

-    parser.add_argument("--branch-name", type=str, help="Git branch name")
    parser.add_argument("--commit-id", type=str, help="Git commit ID (if not provided, will auto-detect from git)")
-    parser.add_argument("--commit-message", type=str, help="Git commit message")
-
-    parser.add_argument(
-        "--no-gpu-monitoring", action="store_true", help="Disables GPU monitoring during benchmark runs"
-    )
-
-    parser.add_argument(
-        "--push-result-to-dataset",
-        type=str,
-        default=None,
-        help="Name of the dataset to push results to. If not provided, results are not pushed to the Hub.",
-    )
    args = parser.parse_args()

    # Setup logging
@ -81,62 +69,43 @@ if __name__ == "__main__":

    # If there is only one (batch_size, sequence_length, num_tokens_to_generate), we benchmark across configs
    elif len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 1:
-        if args.cross_generate:
-            benchmark_configs = generate_all_configs(
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-                gpu_monitoring=not args.no_gpu_monitoring,
-            )
-        else:
-            benchmark_configs = generate_main_configs(
-                warmup_iterations=args.warmup,
-                measurement_iterations=args.iterations,
-                batch_size=args.batch_size[0],
-                sequence_length=args.sequence_length[0],
-                num_tokens_to_generate=args.num_tokens_to_generate[0],
-            )
-
-    # Otherwise, we benchmark across all combinations of dimensions
-    else:
-        main_config = generate_main_configs(
+        benchmark_configs = generate_all_configs(
            warmup_iterations=args.warmup,
            measurement_iterations=args.iterations,
            batch_size=args.batch_size[0],
            sequence_length=args.sequence_length[0],
            num_tokens_to_generate=args.num_tokens_to_generate[0],
-        )[0]
+        )
+        random.shuffle(benchmark_configs)
+
+    # Otherwise, we benchmark across all combinations of dimensions
+    else:
+        kwargs = {
+            "warmup_iterations": args.warmup,
+            "measurement_iterations": args.iterations,
+            "gpu_monitoring": False,
+            "batch_size": args.batch_size[0],
+            "sequence_length": args.sequence_length[0],
+            "num_tokens_to_generate": args.num_tokens_to_generate[0],
+            "attn_implementation": "flex_attention",
+            "sdpa_backend": None,
+            "compile_mode": "default",
+            "kernelize": False,
+        }
        benchmark_configs = []
        for num_tokens_to_generate in args.num_tokens_to_generate:
            for sequence_length in args.sequence_length:
                for batch_size in args.batch_size:
-                    cfg_dict = main_config.to_dict()
-                    cfg_dict["batch_size"] = batch_size
-                    cfg_dict["sequence_length"] = sequence_length
-                    cfg_dict["num_tokens_to_generate"] = num_tokens_to_generate
-                    cfg_dict.pop("name")
-                    benchmark_configs.append(BenchmarkConfig.from_dict(cfg_dict))
+                    kwargs["batch_size"] = batch_size
+                    kwargs["sequence_length"] = sequence_length
+                    kwargs["num_tokens_to_generate"] = num_tokens_to_generate
+                    benchmark_configs.append(BenchmarkConfig(**kwargs))

-    runner = BenchmarkRunner(
-        logger,
-        args.output_dir,
-        args.branch_name,
-        args.commit_id,
-        args.commit_message,
-    )
-    timestamp, results = runner.run_benchmarks(
+    runner = BenchmarkRunner(logger, args.output_dir, args.commit_id)
+    results = runner.run_benchmarks(
        args.model_id,
-        benchmark_configs,
+        benchmark_configs[:3],
        args.num_tokens_to_profile,
        pretty_print_summary=True,
    )
-
-    dataset_id = args.push_result_to_dataset
-    if dataset_id is not None and len(results) > 0:
-        runner.push_results_to_hub(
-            dataset_id,
-            results,
-            timestamp,
-        )
+    # runner.save_results(args.model_id, results)
--- a/conftest.py
+++ b/conftest.py
@ -58,6 +58,7 @@ NOT_DEVICE_TESTS = {
    "test_model_get_set_embeddings",
    "test_model_main_input_name",
    "test_correct_missing_keys",
+    "test_tie_model_weights",
    "test_can_use_safetensors",
    "test_load_save_without_tied_weights",
    "test_tied_weights_keys",
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -5,7 +5,7 @@ ARG REF=main
 RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip install uv && uv pip install --no-cache-dir -U pip setuptools GitPython
-RUN uv pip install --no-cache-dir --upgrade 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir pypi-kenlm
 RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[quality,testing,torch-speech,vision]"
 RUN git lfs install
--- a/docker/custom-tokenizers.dockerfile
+++ b/docker/custom-tokenizers.dockerfile
@ -17,7 +17,7 @@ RUN make install -j 10

 WORKDIR /

-RUN uv pip install --no-cache --upgrade 'torch<2.9' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install  --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,spacy,ftfy,rjieba]" unidic unidic-lite
 # spacy is not used so not tested. Causes to failures. TODO fix later
--- a/docker/examples-torch.dockerfile
+++ b/docker/examples-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer

--- a/docker/exotic-models.dockerfile
+++ b/docker/exotic-models.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1 g++ tesseract-ocr git-lfs curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir  --no-deps timm accelerate
 RUN uv pip install -U --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
 # RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels
--- a/docker/pipeline-torch.dockerfile
+++ b/docker/pipeline-torch.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"

--- a/docker/torch-light.dockerfile
+++ b/docker/torch-light.dockerfile
@ -5,7 +5,7 @@ USER root
 RUN apt-get update &&  apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git-lfs ffmpeg curl
 ENV UV_PYTHON=/usr/local/bin/python
 RUN pip --no-cache-dir install uv && uv pip install --no-cache-dir -U pip setuptools
-RUN uv pip install --no-cache-dir 'torch<2.9' 'torchaudio' 'torchvision' 'torchcodec<0.8' --index-url https://download.pytorch.org/whl/cpu
+RUN uv pip install --no-cache-dir 'torch' 'torchaudio' 'torchvision' 'torchcodec' --index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
 RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"

--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -1,8 +1,6 @@
 FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
 LABEL maintainer="Hugging Face"

-ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics,video,display,compat32
-
 ARG DEBIAN_FRONTEND=noninteractive

 # Use login shell to read variables from `~/.profile` (to pass dynamic created variables between RUN commands)
@ -26,8 +24,7 @@ RUN git clone https://github.com/huggingface/transformers && cd transformers &&
 # 1. Put several commands in a single `RUN` to avoid image/layer exporting issue. Could be revised in the future.
 # 2. Regarding `torch` part, We might need to specify proper versions for `torchvision` and `torchaudio`.
 #    Currently, let's not bother to specify their versions explicitly (so installed with their latest release versions).
-# 3. For `torchcodec<0.8`: this is quickly added as torch 2.9.0 + torchcodec 0.8.0 fails on our CI env. Need to remove later once they work.
-RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio "torchcodec<0.8" --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
+RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA

 RUN python3 -m pip install --no-cache-dir -U timm

--- a/docs/source/ar/_toctree.yml
+++ b/docs/source/ar/_toctree.yml
@ -123,6 +123,8 @@
    title: تشغيل التدريب على Amazon SageMaker
  - local: serialization
    title: التصدير إلى ONNX
+  - local: torchscript
+    title: التصدير إلى TorchScript
  - local: notebooks
    title: دفاتر الملاحظات مع الأمثلة
  - local: community
--- a/docs/source/ar/torchscript.md
+++ b/docs/source/ar/torchscript.md
@ -0,0 +1,154 @@
+# التصدير إلى TorchScript
+
+<Tip>
+
+هذه هي بداية تجاربنا مع TorchScript ولا زلنا نستكشف قدراته مع نماذج المدخلات المتغيرة الحجم. إنه مجال اهتمامنا وسنعمق تحليلنا في الإصدارات القادمة، مع المزيد من الأمثلة البرمجية، وتنفيذ أكثر مرونة، ومقاييس مقارنة بين  الأكواد القائمة على Python مع أكواد TorchScript المُجمّعة.
+
+</Tip>
+
+وفقًا لـ [وثائق TorchScript](https://pytorch.org/docs/stable/jit.html):
+
+> TorchScript هي طريقة لإنشاء نماذج قابلة للتسلسل والتحسين من تعليمات PyTorch البرمجية.
+
+هناك وحدتان من PyTorch، [JIT and TRACE](https://pytorch.org/docs/stable/jit.html)، تتيحان للمطورين تصدير نماذجهم لإعادة استخدامها في برامج أخرى مثل برامج C++ المُحسّنة للأداء.
+
+نقدم واجهة تتيح لك تصدير نماذج 🤗 Transformers إلى TorchScript بحيث يمكن إعادة استخدامها في بيئة مختلفة عن برامج Python القائمة إلى PyTorch. هنا نشرح كيفية تصدير نماذجنا واستخدامها باستخدام TorchScript.
+
+يتطلب تصدير نموذج أمرين:
+
+- تهيئة مثيل للنموذج باستخدام علامة `torchscript`
+- تمرير مُدخلات وهمية (dummy inputs) خلال النموذج
+
+تنطوي هذه الضرورات على عدة أمور يجب على المطورين توخي الحذر بشأنها كما هو مفصل أدناه.
+
+## علامة TorchScript والأوزان المرتبطة
+
+علامة `torchscript` ضرورية لأن معظم نماذج اللغة 🤗 Transformers لها أوزان مرتبطة بين طبقة `Embedding` وطبقة `Decoding`. لا يسمح لك TorchScript بتصدير النماذج ذات الأوزان المرتبطة، لذلك من الضروري فصل الأوزان ونسخها مسبقًا.
+
+النماذج المُهيأة باستخدام علامة `torchscript` لها طبقة `Embedding` وطبقة`Decoding` منفصلتين، مما يعني أنه لا ينبغي تدريبها لاحقًا. سيؤدي التدريب إلى عدم تزامن الطبقتين، مما يؤدي إلى نتائج غير متوقعة.
+
+هذا لا ينطبق على النماذج التي لا تحتوي على رأس نموذج اللغة، حيث لا تملك أوزانًا مرتبطة. يمكن تصدير هذه النماذج بأمان دون علامة `torchscript`.
+
+## المدخلات الوهمية والأطوال القياسية
+
+تُستخدم المُدخلات الوهمية لتمرير أمامي خلال النموذج. أثناء انتشار قيم المُدخلات عبر الطبقات، يتتبع PyTorch العمليات المختلفة التي يتم تنفيذها على كل مصفوفة(tensor). ثم يتم استخدام هذه العمليات المُسجلة بعد ذلك لإنشاء *أثر* النموذج.
+
+يتم إنشاء التتبع بالنسبة لأبعاد المُدخلات. وبالتالي، فهو مُقيّد بأبعاد المُدخلات الوهمية، ولن يعمل لأي طول تسلسل أو حجم دفعة مختلف. عند المحاولة بحجم مختلف، يتم رفع الخطأ التالي:
+
+```
+`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2`
+```
+
+نوصي بتتبع النموذج باستخدام حجم مُدخلات وهمية لا يقل عن أكبر مُدخل سيتم تقديمه للنموذج أثناء الاستدلال. يمكن أن تساعد الحشوة(padding) في ملء القيم المفقودة. ومع ذلك، نظرًا لتتبع النموذج بحجم مُدخل أكبر، ستكون أبعاد المصفوفة ستكون كبيرة أيضًا، مما يؤدي عنه المزيد من الحسابات.
+
+انتبه إلى إجمالي عدد العمليات المُنفذة على كل مُدخل وتابع الأداء عن كثب عند تصدير نماذج متغيرة طول التسلسل.
+
+## استخدام TorchScript في Python
+
+يوضح هذا القسم كيفية حفظ النماذج وتحميلها، بالإضافة إلى كيفية استخدام التتبع للاستدلال.
+
+### حفظ نموذج
+
+لتصدير `BertModel` باستخدام TorchScript، قم بتهيئة ـ `BertModel` من فئة `BertConfig` ثم احفظه على القرص تحت اسم الملف `traced_bert.pt`:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+
+enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+# Tokenizing input text
+text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+tokenized_text = enc.tokenize(text)
+
+# Masking one of the input tokens
+masked_index = 8
+tokenized_text[masked_index] = "[MASK]"
+indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+# Creating a dummy input
+tokens_tensor = torch.tensor([indexed_tokens])
+segments_tensors = torch.tensor([segments_ids])
+dummy_input = [tokens_tensor, segments_tensors]
+
+# Initializing the model with the torchscript flag
+# Flag set to True even though it is not necessary as this model does not have an LM Head.
+config = BertConfig(
+    vocab_size_or_config_json_file=32000,
+    hidden_size=768,
+    num_hidden_layers=12,
+    num_attention_heads=12,
+    intermediate_size=3072,
+    torchscript=True,
+)
+
+# Instantiating the model
+model = BertModel(config)
+
+# The model needs to be in evaluation mode
+model.eval()
+
+# If you are instantiating the model with *from_pretrained* you can also easily set the TorchScript flag
+model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True)
+
+# Creating the trace
+traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+torch.jit.save(traced_model, "traced_bert.pt")
+```
+
+### تحميل نموذج
+
+يمكنك الآن تحميل `BertModel` المُحفظ سابقًا، `traced_bert.pt`، من القرص واستخدامه على `dummy_input` المُهيأ سابقًا:
+
+```python
+loaded_model = torch.jit.load("traced_bert.pt")
+loaded_model.eval()
+
+all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+```
+
+### استخدام نموذج مُتتبع للاستدلال
+
+استخدم النموذج المُتتبع للاستدلال باستخدام أسلوب `__call__` الخاص به:
+
+```python
+traced_model(tokens_tensor, segments_tensors)
+```
+
+## نشر نماذج Hugging Face TorchScript على AWS باستخدام Neuron SDK
+
+قدمت AWS عائلة [Amazon EC2 Inf1](https://aws.amazon.com/ec2/instance-types/inf1/) من اﻷجهزة لخفض التكلفة وأداء التعلم الآلي عالي الأداء في البيئة السحابية. تعمل أجهزة Inf1 بواسطة شريحة Inferentia من AWS، وهي مُسرّع أجهزة مُخصص، متخصص في أعباء عمل الاستدلال للتعلم العميق. [AWS Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/#) هي SDK لـ Inferentia التي تدعم تتبع نماذج المحولات وتحسينها للنشر على Inf1. توفر Neuron SDK ما يلي:
+
+1. واجهة برمجة تطبيقات سهلة الاستخدام مع تغيير سطر واحد من التعليمات البرمجية لتتبع نموذج TorchScript وتحسينه للاستدلال في البيئة السحابية.
+2. تحسينات الأداء الجاهزة للاستخدام [تحسين التكلفة والأداء](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/benchmark/>).
+3. دعم نماذج Hugging Face المحولات المبنية باستخدام إما [PyTorch](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/bert_tutorial/tutorial_pretrained_bert.html) أو [TensorFlow](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/tensorflow/huggingface_bert/huggingface_bert.html).
+
+### الآثار المترتبة
+
+تعمل نماذج المحولات المستندة إلى بنية [BERT (تمثيلات الترميز ثنائية الاتجاه من المحولات)](https://huggingface.co/docs/transformers/main/model_doc/bert) أو متغيراتها مثل [distilBERT](https://huggingface.co/docs/transformers/main/model_doc/distilbert) و [roBERTa](https://huggingface.co/docs/transformers/main/model_doc/roberta) بشكل أفضل على Inf1 للمهام غير التوليدية مثل الإجابة على الأسئلة الاستخراجية، وتصنيف التسلسلات، وتصنيف الرموز (tokens). ومع ذلك، يمكن تكييف مهام توليد النصوص للعمل على Inf1 وفقًا لهذا [برنامج تعليمي AWS Neuron MarianMT](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/src/examples/pytorch/transformers-marianmt.html). يمكن العثور على مزيد من المعلومات حول النماذج التي يمكن تحويلها جاهزة على Inferentia في قسم [ملاءمة بنية النموذج](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/models/models-inferentia.html#models-inferentia) من وثائق Neuron.
+
+### التبعيات (Dependencies)
+
+يتطلب استخدام AWS Neuron لتحويل النماذج [بيئة SDK Neuron](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/neuron-guide/neuron-frameworks/pytorch-neuron/index.html#installation-guide) والتي تأتي مسبقًا على [AMI للتعلم العميق من AWS](https://docs.aws.amazon.com/dlami/latest/devguide/tutorial-inferentia-launching.html).
+
+### تحويل نموذج لـ AWS Neuron
+
+قم بتحويل نموذج لـ AWS NEURON باستخدام نفس التعليمات البرمجية من [استخدام TorchScript في Python](torchscript#using-torchscript-in-python) لتتبع `BertModel`. قم باستيراد امتداد إطار عمل `torch.neuron` للوصول إلى مكونات Neuron SDK من خلال واجهة برمجة تطبيقات Python:
+
+```python
+from transformers import BertModel, BertTokenizer, BertConfig
+import torch
+import torch.neuron
+```
+
+كل ما عليك فعله هو تعديل السطر التالي:
+
+```diff
+- torch.jit.trace(model, [tokens_tensor, segments_tensors])
+ torch.neuron.trace(model, [token_tensor, segments_tensors])
+```
+
+يتيح ذلك لـ Neuron SDK تتبع النموذج وتحسينه لمثيلات Inf1.
+
+لمعرفة المزيد حول ميزات AWS Neuron SDK والأدوات ودروس البرامج التعليمية والتحديثات الأخيرة، يرجى الاطلاع على [وثائق AWS NeuronSDK](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/index.html).
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -227,6 +227,8 @@
    title: ONNX
  - local: executorch
    title: ExecuTorch
+  - local: torchscript
+    title: TorchScript
  title: Export to production
 - isExpanded: false
  sections:
@ -282,8 +284,6 @@
        title: Knowledge Distillation for Computer Vision
      - local: tasks/keypoint_matching
        title: Keypoint matching
-      - local: tasks/training_vision_backbone
-        title: Training vision models using Backbone API
      title: Computer vision
    - sections:
      - local: tasks/image_captioning
@ -544,6 +544,8 @@
        title: Helium
      - local: model_doc/herbert
        title: HerBERT
+      - local: model_doc/hgnet_v2
+        title: HGNet-V2
      - local: model_doc/hunyuan_v1_dense
        title: HunYuanDenseV1
      - local: model_doc/hunyuan_v1_moe
@ -1253,8 +1255,6 @@
      title: Importing Utilities
    - local: internal/time_series_utils
      title: Utilities for Time Series
-    - local: internal/rope_utils
-      title: Rotary Embeddings Utilities
    title: Internal helpers
  - sections:
    - local: reference/environment_variables
--- a/docs/source/en/accelerator_selection.md
+++ b/docs/source/en/accelerator_selection.md
@ -55,7 +55,6 @@ deepspeed --num_gpus 2 trainer-program.py ...
 </hfoptions>

 ## Order of accelerators
-
 To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.

 For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
--- a/docs/source/en/community.md
+++ b/docs/source/en/community.md
@ -6,13 +6,13 @@ rendered properly in your Markdown viewer.

 This page regroups resources around 🤗 Transformers developed by the community.

-## Community resources
+## Community resources:

 | Resource     |      Description      |      Author      |
 |:----------|:-------------|------:|
 | [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learned/revised using [Anki](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) |

-## Community notebooks
+## Community notebooks:

 | Notebook     |      Description      |      Author      |      |
 |:----------|:-------------|:-------------|------:|
--- a/docs/source/en/executorch.md
+++ b/docs/source/en/executorch.md
@ -16,18 +16,44 @@ rendered properly in your Markdown viewer.

 # ExecuTorch

-[ExecuTorch](https://pytorch.org/executorch/stable/index.html) runs PyTorch models on mobile and edge devices. Export your Transformers models to the ExecuTorch format with [Optimum ExecuTorch](https://github.com/huggingface/optimum-executorch) with the command below.
+[ExecuTorch](https://pytorch.org/executorch/stable/index.html) is a platform that enables PyTorch training and inference programs to be run on mobile and edge devices. It is powered by [torch.compile](https://pytorch.org/docs/stable/torch.compiler.html) and [torch.export](https://pytorch.org/docs/main/export.html) for performance and deployment.

-```
-optimum-cli export executorch \
-    --model "HuggingFaceTB/SmolLM2-135M-Instruct" \
-    --task "text-generation" \
-    --recipe "xnnpack" \
-    --use_custom_sdpa \
-    --use_custom_kv_cache \
-    --qlinear 8da4w \
-    --qembedding 8w \
-    --output_dir="hf_smollm2"
+You can use ExecuTorch with Transformers with [torch.export](https://pytorch.org/docs/main/export.html). The [`~transformers.convert_and_export_with_cache`] method converts a [`PreTrainedModel`] into an exportable module. Under the hood, it uses [torch.export](https://pytorch.org/docs/main/export.html) to export the model, ensuring compatibility with ExecuTorch.
+
+```py
+import torch
+from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig
+from transformers.integrations.executorch import(
+    TorchExportableModuleWithStaticCache,
+    convert_and_export_with_cache
+)
+
+generation_config = GenerationConfig(
+    use_cache=True,
+    cache_implementation="static",
+    cache_config={
+        "batch_size": 1,
+        "max_cache_len": 20,
+    }
+)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="</s>", padding_side="right")
+model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
+
+exported_program = convert_and_export_with_cache(model)
 ```

-Run `optimum-cli export executorch --help` to see all export options. For detailed export instructions, check the [README](optimum/exporters/executorch/README.md).
+The exported PyTorch model is now ready to be used with ExecuTorch. Wrap the model with [`~transformers.TorchExportableModuleWithStaticCache`] to generate text.
+
+```py
+prompts = ["Simply put, the theory of relativity states that "]
+prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
+prompt_token_ids = prompt_tokens["input_ids"]
+
+generated_ids = TorchExportableModuleWithStaticCache.generate(
+    exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=20,
+)
+generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+print(generated_text)
+['Simply put, the theory of relativity states that 1) the speed of light is the']
+```
--- a/docs/source/en/hpo_train.md
+++ b/docs/source/en/hpo_train.md
@ -37,6 +37,7 @@ def model_init(trial):
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
+        token=True if model_args.use_auth_token else None,
    )
 ```

--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@ -36,6 +36,8 @@ Explore the [Hub](https://huggingface.com/) today to find a model and use Transf

 Explore the [Models Timeline](./models_timeline) to discover the latest text, vision, audio and multimodal model architectures in Transformers.

+
+
 ## Features

 Transformers provides everything you need for inference or training with state-of-the-art pretrained models. Some of the main features include:
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -320,7 +320,7 @@ df.sort_values(by=['skipped_proportion'], ascending=False)
 You can focus on a specific test method using `--test_method_name`:

 ```bash
-python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
+$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
 ```

 - `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
@ -364,7 +364,6 @@ This utility analyzes code similarities between model implementations to identif
 When adding a new model to transformers, many components (attention layers, MLPs, outputs, etc.) may already exist in similar form in other models. Instead of implementing everything from scratch, model adders can identify which existing classes are similar and potentially reusable through modularization.

 The tool computes two similarity scores:
-
 - **Embedding score**: Uses semantic code embeddings (via `Qwen/Qwen3-Embedding-4B`) to detect functionally similar code even with different naming
 - **Jaccard score**: Measures token set overlap to identify structurally similar code patterns

--- a/docs/source/en/internal/rope_utils.md
+++ b/docs/source/en/internal/rope_utils.md
@ -1,89 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Utilities for Rotary Embedding
-
-This page explains how the Rotary Embedding is computed and applied in Transformers and what types of RoPE are supported.
-
-
-## Overview
-
-Rotary Position Embeddings are a technique used to inject positional information into attention mechanisms without relying on explicit position encodings.  
-Instead of adding position vectors to token embeddings, RoPE rotates query and key vectors in the complex plane according to their positions enabling relative positional awareness and better extrapolation to unseen sequence lengths.
-
-The Transformers library provides a flexible and extensible implementation of various RoPE types defined in `[`~modeling_rope_utils.ROPE_VALIDATION_FUNCTIONS`]`, including both the default and scaled variants:
-
-| Rope Type | Description |
-|------------|-------------|
-| `"default"` | Standard rotary embedding as in LLaMA. |
-| `"linear"` | Linear-scaled RoPE which allows longer context windows. |
-| `"dynamic"` | NTK-aware scaling computed by rescaling frequency base (`θ`) for longer context. |
-| `"yarn"` | YaRN scaling variant providing smoother extrapolation and stability. |
-| `"longrope"` | [LongRoPE](https://github.com/microsoft/LongRoPE) scaling as in Phi-2 model series. |
-| `"llama3"` | RoPE scaling as in Llama3.1. |
-
-
-# Configuration in Model Configs
-
-To enable and customize rotary embeddings, add a `rope_parameters` field to your model’s configuration file (`config.json`). This field controls the RoPE behavior across model layers. Note that each RoPE variant defines its own set of expected keys and missing keys will raise an error. See the example below which creates a llama config with default RoPE parameters: 
-
-
-```python
-from transformers import LlamaConfig
-
-config = LlamaConfig()
-config.rope_parameters = {
-    "rope_type": "default", # type of RoPE to use
-    "rope_theta": 10000.0 # base frequency parameter
-}
-
-# If we want to apply a scaled RoPE type, we need to pass extra parameters
-config.rope_parameters = {
-    "rope_type": "linear",
-    "rope_theta": 10000.0,
-    "factor": 8.0  # scale factor for context extension
-}
-```
-
-## Per-Layer-Type RoPE Configuration
-
-Some models such as Gemma-3 use different layer types with different attention mechanisms, i.e. "full attention" in some blocks and "sliding-window attention" in others. Transformers supports specifying distinct RoPE parameters per layer type for these models. In this case, `rope_parameters` should be a nested dictionary, where top-level keys correspond to `config.layer_types` and values are per-type RoPE parameters. During model initialization, each decoder layer will automatically look up the matching RoPE configuration based on its declared layer type.
-
-
-```python
-from transformers import Gemma3Config
-
-config = Gemma3Config()
-config.rope_parameters = {
-    "full_attention": {
-        "rope_type": "dynamic",
-        "rope_theta": 1000000.0,
-        "factor": 8.0,
-        "original_max_position_embeddings": 8096,
-    },
-    "sliding_attention": {
-        "rope_type": "default",
-        "rope_theta": 10000.0,
-    }
-}
-```
-
-# Utilities
-
-[[autodoc]] RopeParameters
-    - __call__
-
-
--- a/docs/source/en/kernel_doc/overview.md
+++ b/docs/source/en/kernel_doc/overview.md
@ -1,3 +1,3 @@
 # Overview

-Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
+Kernels in transformers are used to optimize the performance of models with custom layers from the hub and very low effort.
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -208,7 +208,7 @@ Some models have a unique way of storing past kv pairs or states that is not com

 Mamba models, such as [Mamba](./model_doc/mamba), require a specific cache because the model doesn't have an attention mechanism or kv states. Thus, they are not compatible with the above [`Cache`] classes.

-## Iterative generation
+# Iterative generation

 A cache can also work in iterative generation settings where there is back-and-forth interaction with a model (chatbots). Like regular generation, iterative generation with a cache allows a model to efficiently handle ongoing conversations without recomputing the entire context at each step.

--- a/docs/source/en/main_classes/pipelines.md
+++ b/docs/source/en/main_classes/pipelines.md
@ -267,7 +267,6 @@ about how many forward passes you inputs are actually going to trigger, you can
 independently of the inputs. The caveats from the previous section still apply.

 ## Pipeline FP16 inference
-
 Models can be run in FP16 which can be significantly faster on GPU while saving memory. Most models will not suffer noticeable performance loss from this. The larger the model, the less likely that it will.

 To enable FP16 inference, you can simply pass `dtype=torch.float16` or `dtype='float16'` to the pipeline constructor. Note that this only works for models with a PyTorch backend. Your inputs will be converted to FP16 internally.
@ -335,7 +334,6 @@ Pipelines available for audio tasks include the following.
 Pipelines available for computer vision tasks include the following.

 ### DepthEstimationPipeline
-
 [[autodoc]] DepthEstimationPipeline
    - __call__
    - all
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -43,7 +43,6 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
 [[autodoc]] AwqConfig

 ## EetqConfig
-
 [[autodoc]] EetqConfig

 ## GPTQConfig
--- a/docs/source/en/main_classes/video_processor.md
+++ b/docs/source/en/main_classes/video_processor.md
@ -23,7 +23,6 @@ The video processor extends the functionality of image processors by allowing Vi
 When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.

 ### Usage Example
-
 Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:

 ```python
--- a/docs/source/en/model_doc/altclip.md
+++ b/docs/source/en/model_doc/altclip.md
@ -100,29 +100,22 @@ for label, prob in zip(labels, probs[0]):
 - [`AltCLIPProcessor`] combines [`CLIPImageProcessor`] and [`XLMRobertaTokenizer`] into a single instance to encode text and prepare images.

 ## AltCLIPConfig
-
 [[autodoc]] AltCLIPConfig

 ## AltCLIPTextConfig
-
 [[autodoc]] AltCLIPTextConfig

 ## AltCLIPVisionConfig
-
 [[autodoc]] AltCLIPVisionConfig

 ## AltCLIPModel
-
 [[autodoc]] AltCLIPModel

 ## AltCLIPTextModel
-
 [[autodoc]] AltCLIPTextModel

 ## AltCLIPVisionModel
-
 [[autodoc]] AltCLIPVisionModel

 ## AltCLIPProcessor
-
 [[autodoc]] AltCLIPProcessor
--- a/docs/source/en/model_doc/bart.md
+++ b/docs/source/en/model_doc/bart.md
@ -23,7 +23,6 @@ rendered properly in your Markdown viewer.
 </div>

 # BART
-
 [BART](https://huggingface.co/papers/1910.13461) is a sequence-to-sequence model that combines the pretraining objectives from BERT and GPT. It's pretrained by corrupting text in different ways like deleting words, shuffling sentences, or masking tokens and learning how to fix it. The encoder encodes the corrupted document and the corrupted text is fixed by the decoder. As it learns to recover the original text, BART gets really good at both understanding and generating language.

 You can find all the original BART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=bart) organization.
--- a/docs/source/en/model_doc/blt.md
+++ b/docs/source/en/model_doc/blt.md
@ -38,7 +38,7 @@ The abstract from the paper is the following:
 efficiency and robustness. BLT encodes bytes into dynamically sized patches, which serve as the primary units of computation. Patches are segmented based on the entropy of the next byte, allocating
 more compute and model capacity where increased data complexity demands it. We present the first flop controlled scaling study of byte-level models up to 8B parameters and 4T training bytes. Our results demonstrate the feasibility of scaling models trained on raw bytes without a fixed vocabulary. Both training and inference efficiency improve due to dynamically selecting long patches when data is predictable, along with qualitative improvements on reasoning and long tail generalization. Overall, for fixed inference costs, BLT shows significantly better scaling than tokenization-based models, by simultaneously growing both patch and model size.*

-## Usage Tips
+## Usage Tips:

 - **Dual Model Architecture**: BLT consists of two separate trained models:
  - **Patcher (Entropy Model)**: A smaller transformer model that predicts byte-level entropy to determine patch boundaries and segment input.
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@ -25,7 +25,8 @@ rendered properly in your Markdown viewer.

 ## Overview

-The Chameleon model was proposed in [Chameleon: Mixed-Modal Early-Fusion Foundation Models](https://huggingface.co/papers/2405.09818) by META AI Chameleon Team. Chameleon is a Vision-Language Model that use vector quantization to tokenize images which enables the model to generate multimodal output. The model takes images and texts as input, including an interleaved format, and generates textual response. Image generation module is not released yet.
+The Chameleon model was proposed in [Chameleon: Mixed-Modal Early-Fusion Foundation Models
+](https://huggingface.co/papers/2405.09818) by META AI Chameleon Team. Chameleon is a Vision-Language Model that use vector quantization to tokenize images which enables the model to generate multimodal output. The model takes images and texts as input, including an interleaved format, and generates textual response. Image generation module is not released yet.

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/clvp.md
+++ b/docs/source/en/model_doc/clvp.md
@ -39,7 +39,7 @@ The original code can be found [here](https://github.com/neonbjb/tortoise-tts).
 3. The use of the [`ClvpModelForConditionalGeneration.generate()`] method is strongly recommended for tortoise usage.
 4. Note that the CLVP model expects the audio to be sampled at 22.05 kHz contrary to other audio models which expects 16 kHz.

-## Brief Explanation
+## Brief Explanation:

 - The [`ClvpTokenizer`] tokenizes the text input, and the [`ClvpFeatureExtractor`] extracts the log mel-spectrogram from the desired audio.
 - [`ClvpConditioningEncoder`] takes those text tokens and audio representations and converts them into embeddings conditioned on the text and audio.
--- a/docs/source/en/model_doc/cwm.md
+++ b/docs/source/en/model_doc/cwm.md
@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

+
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.

 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2025-10-09.*
+

 # Code World Model (CWM)

@ -52,8 +53,7 @@ CWM requires a dedicated system prompt to function optimally during inference. W
 configuration, CWM's output quality may be significantly degraded. The following serves as the default
 system prompt for reasoning tasks. For agentic workflows, append the relevant tool specifications
 after this base prompt. Checkout the original code repository for more details.
-
-```text
+```
 You are a helpful AI assistant. You always reason before responding, using the following format:

 <think>
@ -110,7 +110,6 @@ generated_ids = model.generate(
 output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
 print(tokenizer.decode(output_ids))
 ```
-
 <details>
 <summary>Produces the following output:</summary>

--- a/docs/source/en/model_doc/deepseek_v2.md
+++ b/docs/source/en/model_doc/deepseek_v2.md
@ -28,7 +28,6 @@ This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber
 The original code can be found [here](https://huggingface.co/deepseek-ai/DeepSeek-V2).

 ### Usage tips
-
 The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages.

 ## DeepseekV2Config
--- a/docs/source/en/model_doc/deepseek_v3.md
+++ b/docs/source/en/model_doc/deepseek_v3.md
@ -34,7 +34,6 @@ We are super happy to make this code community-powered, and would love to see ho
 - static cache is not supported (this should be just a generation config issue / config shape issues)

 ### Usage tips
-
 The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages.

 You can run the model in `FP8` automatically, using 2 nodes of 8 H100 should be more than enough!
--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@ -105,7 +105,7 @@ DETR can be naturally extended to perform panoptic segmentation (which unifies s
 - The decoder of DETR updates the query embeddings in parallel. This is different from language models like GPT-2, which use autoregressive decoding instead of parallel. Hence, no causal attention mask is used.
 - DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting to queries and keys. For the position embeddings of the image, one can choose between fixed sinusoidal or learned absolute position embeddings. By default, the parameter `position_embedding_type` of [`~transformers.DetrConfig`] is set to `"sine"`.
 - During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help the model output the correct number of objects of each class. If you set the parameter `auxiliary_loss` of [`~transformers.DetrConfig`] to `True`, then prediction feedforward neural networks and Hungarian losses are added after each decoder layer (with the FFNs sharing parameters).
- If you want to train the model in a distributed environment across multiple nodes, then one should update the *num_boxes* variable in the *DetrLoss* class of *modeling_detr.py*. When training on multiple nodes, this should be set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
+- If you want to train the model in a distributed environment across multiple nodes, then one should update the _num_boxes_ variable in the _DetrLoss_ class of _modeling_detr.py_. When training on multiple nodes, this should be set to the average number of target boxes across all nodes, as can be seen in the original implementation [here](https://github.com/facebookresearch/detr/blob/a54b77800eb8e64e3ad0d8237789fcbf2f8350c5/models/detr.py#L227-L232).
 - [`~transformers.DetrForObjectDetection`] and [`~transformers.DetrForSegmentation`] can be initialized with any convolutional backbone available in the [timm library](https://github.com/rwightman/pytorch-image-models). Initializing with a MobileNet backbone for example can be done by setting the `backbone` attribute of [`~transformers.DetrConfig`] to `"tf_mobilenetv3_small_075"`, and then initializing the model with that config.
 - DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is at most 1333 pixels. At training time, scale augmentation is used such that the shortest side is randomly set to at least 480 and at most 800 pixels. At inference time, the shortest side is set to 800. One can use [`~transformers.DetrImageProcessor`] to prepare images (and optional annotations in COCO format) for the model. Due to this resizing, images in a batch can have different sizes. DETR solves this by padding images up to the largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. Alternatively, one can also define a custom `collate_fn` in order to batch images together, using [`~transformers.DetrImageProcessor.pad_and_create_pixel_mask`].
 - The size of the images will determine the amount of memory being used, and will thus determine the `batch_size`. It is advised to use a batch size of 2 per GPU. See [this Github thread](https://github.com/facebookresearch/detr/issues/150) for more info.
@ -142,7 +142,7 @@ As a summary, consider the following table:
 |------|------------------|-----------------------|-----------------------|
 | **Description** | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. instances) in an image | Predicting masks around both objects (i.e. instances) as well as "stuff" (i.e. background things like trees and roads) in an image |
 | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
-| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic                           |
+| **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic  |                                                                        |
 | **Format of annotations to provide to**  [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `list[Dict]`} each Dict being a COCO object annotation  | {'image_id': `int`, 'annotations': `list[Dict]`}  (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `list[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
 | **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |
--- a/docs/source/en/model_doc/diffllama.md
+++ b/docs/source/en/model_doc/diffllama.md
@ -33,7 +33,6 @@ The abstract from the paper is the following:
 *Transformer tends to overallocate attention to irrelevant context. In this work, we introduce Diff Transformer, which amplifies attention to the relevant context while canceling noise. Specifically, the differential attention mechanism calculates attention scores as the difference between two separate softmax attention maps. The subtraction cancels noise, promoting the emergence of sparse attention patterns. Experimental results on language modeling show that Diff Transformer outperforms Transformer in various settings of scaling up model size and training tokens. More intriguingly, it offers notable advantages in practical applications, such as long-context modeling, key information retrieval, hallucination mitigation, in-context learning, and reduction of activation outliers. By being less distracted by irrelevant context, Diff Transformer can mitigate hallucination in question answering and text summarization. For in-context learning, Diff Transformer not only enhances accuracy but is also more robust to order permutation, which was considered as a chronic robustness issue. The results position Diff Transformer as a highly effective and promising architecture to advance large language models.*

 ### Usage tips
-
 The hyperparameters of this model is the same as Llama model.

 ## DiffLlamaConfig
--- a/docs/source/en/model_doc/dinat.md
+++ b/docs/source/en/model_doc/dinat.md
@ -47,7 +47,7 @@ Our large model is faster and ahead of its Swin counterpart by 1.5% box AP in CO
 Paired with new frameworks, our large variant is the new state of the art panoptic segmentation model on COCO (58.2 PQ)
 and ADE20K (48.5 PQ), and instance segmentation model on Cityscapes (44.5 AP) and ADE20K (35.4 AP) (no extra data).
 It also matches the state of the art specialized semantic segmentation models on ADE20K (58.2 mIoU),
-and ranks second on Cityscapes (84.5 mIoU) (no extra data).*
+and ranks second on Cityscapes (84.5 mIoU) (no extra data). *

 <img
 src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/dilated-neighborhood-attention-pattern.jpg"
--- a/docs/source/en/model_doc/dinov3.md
+++ b/docs/source/en/model_doc/dinov3.md
@ -182,4 +182,4 @@ print("Pooled output shape:", pooled_output.shape)
 ## DINOv3ConvNextBackbone

 [[autodoc]] DINOv3ConvNextBackbone
-    - forward
+    - forward
--- a/docs/source/en/model_doc/donut.md
+++ b/docs/source/en/model_doc/donut.md
@ -120,7 +120,7 @@ print(answer)
    ```py
    >>> import re
    >>> from transformers import DonutProcessor, VisionEncoderDecoderModel
-    >>> from accelerate import Accelerator
+from accelerate import Accelerator
    >>> from datasets import load_dataset
    >>> import torch

@ -162,9 +162,9 @@ print(answer)

    ```py
    >>> import re
-    >>> from accelerate import Accelerator
-    >>> from datasets import load_dataset
    >>> from transformers import DonutProcessor, VisionEncoderDecoderModel
+from accelerate import Accelerator
+    >>> from datasets import load_dataset
    >>> import torch

    >>> processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2")
--- a/docs/source/en/model_doc/edgetam.md
+++ b/docs/source/en/model_doc/edgetam.md
@ -305,6 +305,7 @@ EdgeTAM can use masks from previous predictions as input to refine segmentation:
 ...     )
 ```

+
 ## EdgeTamConfig

 [[autodoc]] EdgeTamConfig
--- a/docs/source/en/model_doc/edgetam_video.md
+++ b/docs/source/en/model_doc/edgetam_video.md
@ -12,11 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

+
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.

 -->
 *This model was released on 2025-01-13 and added to Hugging Face Transformers on 2025-09-29.*

+
 <div style="float: right;">
    <div class="flex flex-wrap space-x-1">
        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
--- a/docs/source/en/model_doc/evolla.md
+++ b/docs/source/en/model_doc/evolla.md
@ -61,7 +61,7 @@ message_list = [
    ]
 ]
 input_dict = processor(
-    protein_inputs, messages_list, return_tensors="pt", text_max_length=512, protein_max_length=1024
+    protein_informations, messages_list, return_tensors="pt", text_max_length=512, protein_max_length=1024
 )
 with torch.no_grad():
    generated_ids = hf_model.generate(**input_dict)
--- a/docs/source/en/model_doc/fastspeech2_conformer.md
+++ b/docs/source/en/model_doc/fastspeech2_conformer.md
@ -28,19 +28,15 @@ The abstract from the original FastSpeech2 paper is the following:
 This model was contributed by [Connor Henderson](https://huggingface.co/connor-henderson). The original code can be found [here](https://github.com/espnet/espnet/blob/master/espnet2/tts/fastspeech2/fastspeech2.py).

 ## 🤗 Model Architecture
-
 FastSpeech2's general structure with a Mel-spectrogram decoder was implemented, and the traditional transformer blocks were replaced with conformer blocks as done in the ESPnet library.

 #### FastSpeech2 Model Architecture
-
 ![FastSpeech2 Model Architecture](https://www.microsoft.com/en-us/research/uploads/prod/2021/04/fastspeech2-1.png)

 #### Conformer Blocks
-
 ![Conformer Blocks](https://www.researchgate.net/profile/Hirofumi-Inaguma-2/publication/344911155/figure/fig2/AS:951455406108673@1603856054097/An-overview-of-Conformer-block.png)

 #### Convolution Module
-
 ![Convolution Module](https://d3i71xaburhd42.cloudfront.net/8809d0732f6147d4ad9218c8f9b20227c837a746/2-Figure1-1.png)

 ## 🤗 Transformers Usage
--- a/docs/source/en/model_doc/florence2.md
+++ b/docs/source/en/model_doc/florence2.md
@ -70,8 +70,8 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration
 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
 image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

-model = Florence2ForConditionalGeneration.from_pretrained("florence-community/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
-processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
+model = Florence2ForConditionalGeneration.from_pretrained("microsoft/Florence-2-base", dtype=torch.bfloat16, device_map="auto")
+processor = AutoProcessor.from_pretrained("microsoft/Florence-2-base")

 task_prompt = "<OD>"
 inputs = processor(text=task_prompt, images=image, return_tensors="pt").to(model.device)
@ -105,12 +105,12 @@ from transformers import AutoProcessor, Florence2ForConditionalGeneration, BitsA
 quantization_config = BitsAndBytesConfig(load_in_4bit=True)

 model = Florence2ForConditionalGeneration.from_pretrained(
-    "florence-community/Florence-2-base",
+    "microsoft/Florence-2-large",
    dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config
 )
-processor = AutoProcessor.from_pretrained("florence-community/Florence-2-base")
+processor = AutoProcessor.from_pretrained("microsoft/Florence-2-large")

 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
 image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
--- a/docs/source/en/model_doc/glm4_moe.md
+++ b/docs/source/en/model_doc/glm4_moe.md
@ -37,6 +37,7 @@ We evaluated GLM-4.6 across eight public benchmarks covering agents, reasoning,

 For more eval results, show cases, and technical details, please visit our [technical blog](https://z.ai/blog/glm-4.6).

+
 ### GLM-4.5

 The [**GLM-4.5**](https://huggingface.co/papers/2508.06471) series models are foundation models designed for intelligent agents, MoE variants are documented here as Glm4Moe.
--- a/docs/source/en/model_doc/gpt_neox.md
+++ b/docs/source/en/model_doc/gpt_neox.md
@ -101,7 +101,6 @@ Below is an expected speedup diagram that compares pure inference time between t
 </div>

 ## Using Scaled Dot Product Attention (SDPA)
-
 PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
 encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
 [official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
@ -124,7 +123,6 @@ On a local benchmark (rtx3080ti-16GB, PyTorch 2.2.1, OS Ubuntu 22.04) using `flo
 following speedups during training and inference.

 ### Training
-
 | Batch size |    Seq len | Time per batch (Eager - s) |    Time per batch (SDPA - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) |    Mem saving (%) |
 |-----------:|-----------:|---------------------------:|-----------------------------:|------------:|--------------------:|-------------------:|------------------:|
 |          1 |        128 |                      0.024 |                        0.019 |      28.945 |             1789.95 |            1789.95 |                 0 |
@ -144,7 +142,6 @@ following speedups during training and inference.
 |          4 |       2048 |                        OOM |                        0.731 |           / |                 OOM |            12705.1 | SDPA does not OOM |

 ### Inference
-
 |    Batch size |      Seq len |    Per token latency Eager (ms) |    Per token latency SDPA (ms) |    Speedup (%) |    Mem Eager (MB) |   Mem SDPA (MB) |    Mem saved (%) |
 |--------------:|-------------:|--------------------------------:|-------------------------------:|---------------:|------------------:|----------------:|-----------------:|
 |             1 |          128 |                           6.569 |                          5.858 |          12.14 |           974.831 |         974.826 |                0 |
--- a/docs/source/en/model_doc/gpt_neox_japanese.md
+++ b/docs/source/en/model_doc/gpt_neox_japanese.md
@ -41,7 +41,7 @@ The example below demonstrates how to generate text with [`Pipeline`] or the [`A
 <hfoptions id="usage">
 <hfoption id="Pipeline">

-```python
+```py
 import torch
 from transformers import pipeline
 pipeline = pipeline(task="text-generation", 
@ -52,7 +52,7 @@ pipeline("人とAIが協調するためには、")
 </hfoption>
 <hfoption id="AutoModel">

-```python
+```py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer

@ -112,7 +112,6 @@ visualizer("<img>What is shown in this image?")
 </div>

 ## Resources
-
 Refer to the [Training a better GPT model: Learnings from PaLM](https://medium.com/ml-abeja/training-a-better-gpt-2-93b157662ae4) blog post for more details about how ABEJA trained GPT-NeoX-Japanese.

 ## GPTNeoXJapaneseConfig
--- a/docs/source/en/model_doc/gpt_oss.md
+++ b/docs/source/en/model_doc/gpt_oss.md
@ -35,9 +35,9 @@ The abstract from the paper is the following:
 *<INSERT PAPER ABSTRACT HERE>*

 Tips:
-
 - **Attention Sinks with Flex Attention**: When using flex attention, attention sinks require special handling. Unlike with standard attention implementations where sinks can be added directly to attention scores, flex attention `score_mod` function operates on individual score elements rather than the full attention matrix. Therefore, attention sinks renormalization have to be applied after the flex attention computations by renormalizing the outputs using the log-sum-exp (LSE) values returned by flex attention.

+
 <INSERT TIPS ABOUT MODEL HERE>

 This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/<INSERT YOUR HF USERNAME HERE>).
--- a/docs/source/en/model_doc/gptsan-japanese.md
+++ b/docs/source/en/model_doc/gptsan-japanese.md
@ -79,8 +79,6 @@ When token_type_ids=None or all zero, it is equivalent to regular causal mask
 for example:

 >>> x_token = tokenizer("ｱｲｳｴ")
-
-```text
 input_ids:      | SOT | SEG | ｱ | ｲ | ｳ | ｴ |
 token_type_ids: | 1   | 0   | 0 | 0 | 0 | 0 |
 prefix_lm_mask:
@ -90,11 +88,8 @@ SEG | 1 1 0 0 0 0 |
 ｲ   | 1 1 1 1 0 0 |
 ｳ   | 1 1 1 1 1 0 |
 ｴ   | 1 1 1 1 1 1 |
-```

 >>> x_token = tokenizer("", prefix_text="ｱｲｳｴ")
-
-```text
 input_ids:      | SOT | ｱ | ｲ | ｳ | ｴ | SEG |
 token_type_ids: | 1   | 1 | 1 | 1 | 1 | 0  |
 prefix_lm_mask:
@ -104,11 +99,8 @@ SOT | 1 1 1 1 1 0 |
 ｳ   | 1 1 1 1 1 0 |
 ｴ   | 1 1 1 1 1 0 |
 SEG | 1 1 1 1 1 1 |
-```

 >>> x_token = tokenizer("ｳｴ", prefix_text="ｱｲ")
-
-```text
 input_ids:      | SOT | ｱ | ｲ | SEG | ｳ | ｴ |
 token_type_ids: | 1   | 1 | 1 | 0   | 0 | 0 |
 prefix_lm_mask:
@ -118,7 +110,6 @@ SOT | 1 1 1 0 0 0 |
 SEG | 1 1 1 1 0 0 |
 ｳ   | 1 1 1 1 1 0 |
 ｴ   | 1 1 1 1 1 1 |
-```

 ### Spout Vector

--- a/docs/source/en/model_doc/granite_speech.md
+++ b/docs/source/en/model_doc/granite_speech.md
@ -22,7 +22,6 @@ rendered properly in your Markdown viewer.
 </div>

 ## Overview
-
 The [Granite Speech](https://huggingface.co/papers/2505.08699) model ([blog post](https://www.ibm.com/new/announcements/ibm-granite-3-3-speech-recognition-refined-reasoning-rag-loras)) is a multimodal language model, consisting of a speech encoder, speech projector, large language model, and LoRA adapter(s). More details regarding each component for the current (Granite 3.2 Speech) model architecture may be found below.

 1. Speech Encoder: A [Conformer](https://huggingface.co/papers/2005.08100) encoder trained with Connectionist Temporal Classification (CTC) on character-level targets on ASR corpora. The encoder uses block-attention and self-conditioned CTC from the middle layer.
--- a/docs/source/en/model_doc/helium.md
+++ b/docs/source/en/model_doc/helium.md
@ -39,14 +39,14 @@ It supports the following languages: English, French, German, Italian, Portugues

 <!-- This section describes the evaluation protocols and provides the results. -->

-### Testing Data
+#### Testing Data

 <!-- This should link to a Dataset Card if possible. -->

 The model was evaluated on MMLU, TriviaQA, NaturalQuestions, ARC Easy & Challenge, Open Book QA, Common Sense QA,
 Physical Interaction QA, Social Interaction QA, HellaSwag, WinoGrande, Multilingual Knowledge QA, FLORES 200.

-### Metrics
+#### Metrics

 <!-- These are the evaluation metrics being used, ideally with a description of why. -->

--- a/docs/source/en/model_doc/idefics.md
+++ b/docs/source/en/model_doc/idefics.md
@ -24,7 +24,9 @@ rendered properly in your Markdown viewer.

 ## Overview

-The IDEFICS model was proposed in [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh
+The IDEFICS model was proposed in [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents
+](https://huggingface.co/papers/2306.16527
+) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh

 The abstract from the paper is the following:

--- a/docs/source/en/model_doc/idefics2.md
+++ b/docs/source/en/model_doc/idefics2.md
@ -215,16 +215,13 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
    - forward

 ## Idefics2ImageProcessor
-
 [[autodoc]] Idefics2ImageProcessor
    - preprocess

 ## Idefics2ImageProcessorFast
-
 [[autodoc]] Idefics2ImageProcessorFast
    - preprocess

 ## Idefics2Processor
-
 [[autodoc]] Idefics2Processor
    - __call__
--- a/docs/source/en/model_doc/idefics3.md
+++ b/docs/source/en/model_doc/idefics3.md
@ -77,16 +77,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
    - forward

 ## Idefics3ImageProcessor
-
 [[autodoc]] Idefics3ImageProcessor
    - preprocess

 ## Idefics3ImageProcessorFast
-
 [[autodoc]] Idefics3ImageProcessorFast
    - preprocess

 ## Idefics3Processor
-
 [[autodoc]] Idefics3Processor
    - __call__
--- a/docs/source/en/model_doc/instructblipvideo.md
+++ b/docs/source/en/model_doc/instructblipvideo.md
@ -79,7 +79,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
    - forward

 ## InstructBlipVideoModel
-
 [[autodoc]] InstructBlipVideoModel
    - forward

--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@ -105,7 +105,6 @@ This example demonstrates how to perform inference on a single image with the In
 ```

 ### Text-only generation
-
 This example shows how to generate text using the InternVL model without providing any image input.

 ```python
@ -135,7 +134,6 @@ This example shows how to generate text using the InternVL model without providi
 ```

 ### Batched image and text inputs
-
 InternVL models also support batched image and text inputs.

 ```python
@ -179,7 +177,6 @@ InternVL models also support batched image and text inputs.
 ```

 ### Batched multi-image input
-
 This implementation of the InternVL models supports batched text-images inputs with different number of images for each text.

 ```python
@ -223,7 +220,6 @@ This implementation of the InternVL models supports batched text-images inputs w
 ```

 ### Video input
-
 InternVL models can also handle video inputs. Here is an example of how to perform inference on a video input using chat templates.

 ```python
@ -263,7 +259,6 @@ InternVL models can also handle video inputs. Here is an example of how to perfo
 ```

 ### Interleaved image and video inputs
-
 This example showcases how to handle a batch of chat conversations with interleaved image and video inputs using chat template.

 ```python
--- a/docs/source/en/model_doc/jukebox.md
+++ b/docs/source/en/model_doc/jukebox.md
@ -14,7 +14,6 @@ rendered properly in your Markdown viewer.

 -->
 *This model was released on 2020-04-30 and added to Hugging Face Transformers on 2023-06-20.*
-
 # Jukebox

 <div class="flex flex-wrap space-x-1">
--- a/docs/source/en/model_doc/kyutai_speech_to_text.md
+++ b/docs/source/en/model_doc/kyutai_speech_to_text.md
@ -16,7 +16,6 @@ rendered properly in your Markdown viewer.
 *This model was released on 2025-06-17 and added to Hugging Face Transformers on 2025-06-25.*

 # Kyutai Speech-To-Text
-
 ## Overview

 [Kyutai STT](https://kyutai.org/next/stt) is a speech-to-text model architecture based on the [Mimi codec](https://huggingface.co/docs/transformers/en/model_doc/mimi), which encodes audio into discrete tokens in a streaming fashion, and a [Moshi-like](https://huggingface.co/docs/transformers/en/model_doc/moshi) autoregressive decoder. Kyutai's lab has released two model checkpoints:
--- a/docs/source/en/model_doc/levit.md
+++ b/docs/source/en/model_doc/levit.md
@ -36,7 +36,7 @@ in vision transformers. As a result, we propose LeVIT: a hybrid neural network f
 We consider different measures of efficiency on different hardware platforms, so as to best reflect a wide range of
 application scenarios. Our extensive experiments empirically validate our technical choices and show they are suitable
 to most architectures. Overall, LeViT significantly outperforms existing convnets and vision transformers with respect
-to the speed/accuracy tradeoff. For example, at 80% ImageNet top-1 accuracy, LeViT is 5 times faster than EfficientNet on CPU.*
+to the speed/accuracy tradeoff. For example, at 80% ImageNet top-1 accuracy, LeViT is 5 times faster than EfficientNet on CPU. *

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/levit_architecture.png"
 alt="drawing" width="600"/>
--- a/docs/source/en/model_doc/lfm2_moe.md
+++ b/docs/source/en/model_doc/lfm2_moe.md
@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

+
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.

 -->
-*This model was released on {release_date} and added to Hugging Face Transformers on 2025-10-07.*
+

 # Lfm2Moe

@ -23,7 +24,7 @@ limitations under the License.

 LFM2-MoE is a Mixture-of-Experts (MoE) variant of [LFM2](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38). The LFM2 family is optimized for on-device inference by combining short‑range, input‑aware gated convolutions with grouped‑query attention (GQA) in a layout tuned to maximize quality under strict speed and memory constraints.

-LFM2‑MoE keeps this fast backbone and introduces sparse MoE feed‑forward networks to add representational capacity without significantly increasing the active compute path. The first LFM2-MoE release is LFM2-8B-A1B, with 8.3B total parameters and 1.5B active parameters. The model excels in quality (comparable to 3-4B dense models) and speed (faster than other 1.5B class models).
+LFM2‑MoE keeps this fast backbone and introduces sparse MoE feed‑forward networks to add representational capacity without significantly increasing the active compute path. The first LFM2-MoE release is LFM2-8B-A1B, with 8.3B total parameters and 1.5B active parameters. The model excels in quality (comparable to 3-4B dense models) and speed (faster than other 1.5B class models). 

 ## Example

--- a/docs/source/en/model_doc/llama4.md
+++ b/docs/source/en/model_doc/llama4.md
@ -436,6 +436,11 @@ model = Llama4ForConditionalGeneration.from_pretrained(
 [[autodoc]] Llama4TextModel
    - forward

+## Llama4ForCausalLM
+
+[[autodoc]] Llama4ForCausalLM
+    - forward
+
 ## Llama4VisionModel

 [[autodoc]] Llama4VisionModel
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
@ -25,7 +25,8 @@ rendered properly in your Markdown viewer.

 ## Overview

-The LLaVa-NeXT-Video model was proposed in [LLaVA-NeXT: A Strong Zero-shot Video Understanding Model](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/) by Yuanhan Zhang, Bo Li, Haotian Liu, Yong Jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, Chunyuan Li. LLaVa-NeXT-Video improves upon [LLaVa-NeXT](llava_next) by fine-tuning on a mix if video and image dataset thus increasing the model's performance on videos.
+The LLaVa-NeXT-Video model was proposed in [LLaVA-NeXT: A Strong Zero-shot Video Understanding Model
+](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/) by Yuanhan Zhang, Bo Li, Haotian Liu, Yong Jae Lee, Liangke Gui, Di Fu, Jiashi Feng, Ziwei Liu, Chunyuan Li. LLaVa-NeXT-Video improves upon [LLaVa-NeXT](llava_next) by fine-tuning on a mix if video and image dataset thus increasing the model's performance on videos.

 [LLaVA-NeXT](llava_next) surprisingly has strong performance in understanding video content in zero-shot fashion with the AnyRes technique that it uses. The AnyRes technique naturally represents a high-resolution image into multiple images. This technique is naturally generalizable to represent videos because videos can be considered as a set of frames (similar to a set of images in LLaVa-NeXT). The current version of LLaVA-NeXT makes use of AnyRes and trains with supervised fine-tuning (SFT) on top of LLaVA-Next on video data to achieves better video understanding capabilities.The model is a current SOTA among open-source models on [VideoMME bench](https://huggingface.co/papers/2405.21075).

--- a/docs/source/en/model_doc/m2m_100.md
+++ b/docs/source/en/model_doc/m2m_100.md
@ -171,7 +171,6 @@ Below is an expected speedup diagram that compares pure inference time between t
 </div>

 ## Using Scaled Dot Product Attention (SDPA)
-
 PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
 encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
 [official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
--- a/docs/source/en/model_doc/mega.md
+++ b/docs/source/en/model_doc/mega.md
@ -39,7 +39,7 @@ attractive option for long-document NLP tasks.

 The abstract from the paper is the following:

- *The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models.*
+ *The design choices in the Transformer attention mechanism, including weak inductive bias and quadratic computational complexity, have limited its application for modeling long sequences. In this paper, we introduce Mega, a simple, theoretically grounded, single-head gated attention mechanism equipped with (exponential) moving average to incorporate inductive bias of position-aware local dependencies into the position-agnostic attention mechanism. We further propose a variant of Mega that offers linear time and space complexity yet yields only minimal quality loss, by efficiently splitting the whole sequence into multiple chunks with fixed length. Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models. *

 This model was contributed by [mnaylor](https://huggingface.co/mnaylor).
 The original code can be found [here](https://github.com/facebookresearch/mega).
--- a/docs/source/en/model_doc/minimax.md
+++ b/docs/source/en/model_doc/minimax.md
@ -186,6 +186,5 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
    - forward

 ## MiniMaxForQuestionAnswering
-
 [[autodoc]] MiniMaxForQuestionAnswering
    - forward
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -223,6 +223,5 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
    - forward

 ## MixtralForQuestionAnswering
-
 [[autodoc]] MixtralForQuestionAnswering
    - forward
--- a/docs/source/en/model_doc/mllama.md
+++ b/docs/source/en/model_doc/mllama.md
@ -136,6 +136,11 @@ print(processor.decode(output[0], skip_special_tokens=True))

 [[autodoc]] MllamaModel

+## MllamaForCausalLM
+
+[[autodoc]] MllamaForCausalLM
+    - forward
+
 ## MllamaVisionModel

 [[autodoc]] MllamaVisionModel
--- a/docs/source/en/model_doc/mms.md
+++ b/docs/source/en/model_doc/mms.md
@ -316,7 +316,6 @@ with torch.no_grad():
 Different LID models are available based on the number of languages they can recognize - [126](https://huggingface.co/facebook/mms-lid-126), [256](https://huggingface.co/facebook/mms-lid-256), [512](https://huggingface.co/facebook/mms-lid-512), [1024](https://huggingface.co/facebook/mms-lid-1024), [2048](https://huggingface.co/facebook/mms-lid-2048), [4017](https://huggingface.co/facebook/mms-lid-4017).

 #### Inference
-
 First, we install transformers and some other libraries

 ```bash
--- a/docs/source/en/model_doc/mobilevit.md
+++ b/docs/source/en/model_doc/mobilevit.md
@ -99,6 +99,7 @@ print(f"The predicted class label is:{predicted_class_label}")

 [[autodoc]] MobileViTConfig

+
 ## MobileViTImageProcessor

 [[autodoc]] MobileViTImageProcessor
--- a/docs/source/en/model_doc/moshi.md
+++ b/docs/source/en/model_doc/moshi.md
@ -64,11 +64,11 @@ Note that each timestamp - i.e each codebook - gets its own set of Linear Layers

 It's the audio encoder from Kyutai, that has recently been integrated to transformers, which is used to "tokenize" audio. It has the same use that [`~EncodecModel`] has in [`~MusicgenModel`].

-## Tips
+## Tips:

 The original checkpoints can be converted using the conversion script `src/transformers/models/moshi/convert_moshi_transformers.py`

-### How to use the model
+### How to use the model:

 This implementation has two main aims:

@ -152,7 +152,7 @@ Once it's done, you can simply forward `text_labels` and `audio_labels` to [`Mos

 A training guide will come soon, but user contributions are welcomed!

-### How does the model forward the inputs / generate
+### How does the model forward the inputs / generate:

 1. The input streams are embedded and combined into `inputs_embeds`.

--- a/docs/source/en/model_doc/musicgen_melody.md
+++ b/docs/source/en/model_doc/musicgen_melody.md
@ -50,7 +50,7 @@ MusicGen Melody is compatible with two generation modes: greedy and sampling. In

 Transformers supports both mono (1-channel) and stereo (2-channel) variants of MusicGen Melody. The mono channel versions generate a single set of codebooks. The stereo versions generate 2 sets of codebooks, 1 for each channel (left/right), and each set of codebooks is decoded independently through the audio compression model. The audio streams for each channel are combined to give the final stereo output.

-### Audio Conditional Generation
+#### Audio Conditional Generation

 The model can generate an audio sample conditioned on a text and an audio prompt through use of the [`MusicgenMelodyProcessor`] to pre-process the inputs.

--- a/docs/source/en/model_doc/myt5.md
+++ b/docs/source/en/model_doc/myt5.md
@ -40,3 +40,7 @@ The original code can be found [here](https://github.com/tomlimi/MYTE).
    - get_special_tokens_mask
    - create_token_type_ids_from_sequences
    - save_vocabulary
+
+## MyT5Tokenizer
+
+[[autodoc]] MyT5Tokenizer
--- a/docs/source/en/model_doc/nat.md
+++ b/docs/source/en/model_doc/nat.md
@ -47,7 +47,7 @@ with efficient C++ and CUDA kernels, which allows NA to run up to 40% faster tha
 memory. We further present Neighborhood Attention Transformer (NAT), a new hierarchical transformer design based on NA
 that boosts image classification and downstream vision performance. Experimental results on NAT are competitive;
 NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9%
-ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size.*
+ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size. *

 <img
 src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/neighborhood-attention-pattern.jpg"
--- a/docs/source/en/model_doc/nemotron.md
+++ b/docs/source/en/model_doc/nemotron.md
@ -21,22 +21,21 @@ specific language governing permissions and limitations under the License.
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>

-## License
+### License

-Minitron is released under the [NVIDIA Open Model License Agreement](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf).
 The use of this model is governed by the [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license).

-## Description
+### Description

 Nemotron-4 is a family of enterprise ready generative text models compatible with [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/).

 NVIDIA NeMo is an end-to-end, cloud-native platform to build, customize, and deploy generative AI models anywhere. It includes training and inferencing frameworks, guardrailing toolkits, data curation tools, and pretrained models, offering enterprises an easy, cost-effective, and fast way to adopt generative AI. To get access to NeMo Framework, please sign up at [this link](https://developer.nvidia.com/nemo-framework/join).

-## References
+### References

 [Announcement Blog](https://developer.nvidia.com/blog/nvidia-ai-foundation-models-build-custom-enterprise-chatbots-and-co-pilots-with-production-ready-llms/)

-## Model Architecture
+### Model Architecture

 **Architecture Type:** Transformer

@ -81,6 +80,10 @@ output_text = tokenizer.decode(outputs[0])
 print(output_text)
 ```

+### License
+
+Minitron is released under the [NVIDIA Open Model License Agreement](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf).
+
 ### Evaluation Results

 *5-shot performance.* Language Understanding evaluated using [Massive Multitask Language Understanding](https://huggingface.co/papers/2009.03300):
@ -93,7 +96,7 @@ print(output_text)

 | HellaSwag | Winogrande | GSM8K| ARC-C | XLSum |
 | :------------- | :------------- | :------------- | :------------- | :------------- |
-| 75.0 | 74.0 | 24.1  | 50.9 | 29.5 |
+| 75.0 | 74.0 | 24.1  | 50.9 | 29.5

 *Code generation performance*. Evaluated using [HumanEval](https://github.com/openai/human-eval):

--- a/docs/source/en/model_doc/nllb.md
+++ b/docs/source/en/model_doc/nllb.md
@ -55,7 +55,7 @@ pipeline("UN Chief says there is no military solution in Syria")
 from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

 tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
-model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", dtype="auto", attn_implementation="sdpa")
+model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", dtype="auto", attn_implementaiton="sdpa")

 article = "UN Chief says there is no military solution in Syria"
 inputs = tokenizer(article, return_tensors="pt")
--- a/docs/source/en/model_doc/olmo.md
+++ b/docs/source/en/model_doc/olmo.md
@ -25,7 +25,6 @@ rendered properly in your Markdown viewer.
 </div>

 # OLMo
-
 [OLMo](https://huggingface.co/papers/2402.00838) is a 7B-parameter dense language model. It uses SwiGLU activations, non-parametric layer normalization, rotary positional embeddings, and a BPE tokenizer that masks personally identifiable information. It is pretrained on [Dolma](https://huggingface.co/datasets/allenai/dolma), a 3T-token dataset. OLMo was released to provide complete transparency of not just the model weights but the training data, training code, and evaluation code to enable more research on language models.

 You can find all the original OLMo checkpoints under the [OLMo](https://huggingface.co/collections/allenai/olmo-suite-65aeaae8fe5b6b2122b46778) collection.
--- a/docs/source/en/model_doc/olmo2.md
+++ b/docs/source/en/model_doc/olmo2.md
@ -24,7 +24,6 @@ rendered properly in your Markdown viewer.
 </div>

 # OLMo2
-
 [OLMo2](https://huggingface.co/papers/2501.00656) improves on [OLMo](./olmo) by changing the architecture and training recipes of the original models. This includes excluding all biases to improve training stability, non-parametric layer norm, SwiGLU activation function, rotary positional embeddings, and a modified BPE-based tokenizer that masks personal identifiable information. It is pretrained on [Dolma](https://huggingface.co/datasets/allenai/dolma), a dataset of 3T tokens.

 You can find all the original OLMo2 checkpoints under the [OLMo2](https://huggingface.co/collections/allenai/olmo-2-674117b93ab84e98afc72edc) collection.
--- a/docs/source/en/model_doc/olmo3.md
+++ b/docs/source/en/model_doc/olmo3.md
@ -26,7 +26,6 @@ limitations under the License.
 </div>

 # OLMo3
-
 Olmo3 is an improvement on [OLMo2](./olmo2). More details will be released on *soon*.

 > [!TIP]
--- a/docs/source/en/model_doc/pvt_v2.md
+++ b/docs/source/en/model_doc/pvt_v2.md
@ -32,7 +32,7 @@ Another powerful feature of the PVTv2 is the complexity reduction in the self-at

 SRA was introduced in PVT, and is the default attention complexity reduction method used in PVTv2. However, PVTv2 also introduced the option of using a self-attention mechanism with linear complexity related to image size, which they called "Linear SRA". This method uses average pooling to reduce the hidden states to a fixed size that is invariant to their original resolution (although this is inherently more lossy than regular SRA). This option can be enabled by setting `linear_attention` to `True` in the PVTv2Config.

-### Abstract from the paper
+### Abstract from the paper:

 *Transformer recently has presented encouraging progress in computer vision. In this work, we present new baselines by improving the original Pyramid Vision Transformer (PVT v1) by adding three designs, including (1) linear complexity attention layer, (2) overlapping patch embedding, and (3) convolutional feed-forward network. With these modifications, PVT v2 reduces the computational complexity of PVT v1 to linear and achieves significant improvements on fundamental vision tasks such as classification, detection, and segmentation. Notably, the proposed PVT v2 achieves comparable or better performances than recent works such as Swin Transformer. We hope this work will facilitate state-of-the-art Transformer researches in computer vision. Code is available at https://github.com/whai362/PVT.*

--- a/docs/source/en/model_doc/qwen2_5_omni.md
+++ b/docs/source/en/model_doc/qwen2_5_omni.md
@ -271,7 +271,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B", min_pixels=min
 ```

 #### Prompt for audio output
-
 If users need audio output, the system prompt must be set as "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", otherwise the audio output may not work as expected.

 ```python
@ -308,7 +307,6 @@ text_ids = model.generate(**inputs, return_audio=False)
 ```

 #### Change voice type of output audio
-
 Qwen2.5-Omni supports the ability to change the voice of the output audio. Users can use the `spk` parameter of `generate` function to specify the voice type. The `"Qwen/Qwen2.5-Omni-7B"` checkpoint support two voice types: `Chelsie` and `Ethan`, while `Chelsie` is a female voice and `Ethan` is a male voice. By default, if `spk` is not specified, the default voice type is `Chelsie`.

 ```python
--- a/docs/source/en/model_doc/qwen2_audio.md
+++ b/docs/source/en/model_doc/qwen2_audio.md
@ -34,7 +34,7 @@ It was proposed in [Qwen2-Audio Technical Report](https://huggingface.co/papers/

 The abstract from the paper is the following:

-*We introduce the latest progress of Qwen-Audio, a large-scale audio-language model called Qwen2-Audio, which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. In contrast to complex hierarchical tags, we have simplified the pre-training process by utilizing natural language prompts for different data and tasks, and have further expanded the data volume. We have boosted the instruction-following capability of Qwen2-Audio and implemented two distinct audio interaction modes for voice chat and audio analysis. In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input. In the audio analysis mode, users could provide audio and text instructions for analysis during the interaction. Note that we do not use any system prompts to switch between voice chat and audio analysis modes. Qwen2-Audio is capable of intelligently comprehending the content within audio and following voice commands to respond appropriately. For instance, in an audio segment that simultaneously contains sounds, multi-speaker conversations, and a voice command, Qwen2-Audio can directly understand the command and provide an interpretation and response to the audio. Additionally, DPO has optimized the model's performance in terms of factuality and adherence to desired behavior. According to the evaluation results from AIR-Bench, Qwen2-Audio outperformed previous SOTAs, such as Gemini-1.5-pro, in tests focused on audio-centric instruction-following capabilities. Qwen2-Audio is open-sourced with the aim of fostering the advancement of the multi-modal language community.*
+*We introduce the latest progress of Qwen-Audio, a large-scale audio-language model called Qwen2-Audio, which is capable of accepting various audio signal inputs and performing audio analysis or direct textual responses with regard to speech instructions. In contrast to complex hierarchical tags, we have simplified the pre-training process by utilizing natural language prompts for different data and tasks, and have further expanded the data volume. We have boosted the instruction-following capability of Qwen2-Audio and implemented two distinct audio interaction modes for voice chat and audio analysis. In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input. In the audio analysis mode, users could provide audio and text instructions for analysis during the interaction. Note that we do not use any system prompts to switch between voice chat and audio analysis modes. Qwen2-Audio is capable of intelligently comprehending the content within audio and following voice commands to respond appropriately. For instance, in an audio segment that simultaneously contains sounds, multi-speaker conversations, and a voice command, Qwen2-Audio can directly understand the command and provide an interpretation and response to the audio. Additionally, DPO has optimized the model's performance in terms of factuality and adherence to desired behavior. According to the evaluation results from AIR-Bench, Qwen2-Audio outperformed previous SOTAs, such as Gemini-1.5-pro, in tests focused on audio-centric instruction-following capabilities. Qwen2-Audio is open-sourced with the aim of fostering the advancement of the multi-modal language community. *

 ## Usage tips

@ -74,7 +74,6 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_
 In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for the inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.

 ### Voice Chat Inference
-
 In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input:

 ```python
@ -116,7 +115,6 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_
 ```

 ### Audio Analysis Inference
-
 In the audio analysis, users could provide both audio and text instructions for analysis:

 ```python
@ -166,7 +164,6 @@ response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_
 ```

 ### Batch Inference
-
 We also support batch inference:

 ```python
--- a/docs/source/en/model_doc/qwen3_next.md
+++ b/docs/source/en/model_doc/qwen3_next.md
@ -31,7 +31,6 @@ Despite its ultra-efficiency, it outperforms Qwen3-32B on downstream tasks — w
 Moreover, it delivers over **10x higher inference throughput** than Qwen3-32B when handling contexts longer than 32K tokens.

 For more details, please visit our blog [Qwen3-Next](qwen3_next) ([blog post](https://qwenlm.github.io/blog/qwen3_next/)).
-
 ## Usage examples

 ```python
--- a/docs/source/en/model_doc/qwen3_omni_moe.md
+++ b/docs/source/en/model_doc/qwen3_omni_moe.md
@ -271,7 +271,6 @@ processor = AutoProcessor.from_pretrained("Qwen/Qwen3-Omni-30B-A3B-Instruct", mi
 ```

 #### Prompt for audio output
-
 If users need audio output, the system prompt must be set as "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, capable of perceiving auditory and visual inputs, as well as generating text and speech.", otherwise the audio output may not work as expected.

 ```json
@ -308,7 +307,6 @@ text_ids = model.generate(**inputs, return_audio=False)
 ```

 #### Change voice type of output audio
-
 Qwen3-Omni-MOE supports the ability to change the voice of the output audio. Users can use the `spk` parameter of `generate` function to specify the voice type. The `"Qwen/Qwen3-Omni-30B-A3B-Instruct"` checkpoint support two voice types: `Chelsie` and `Ethan`, while `Chelsie` is a female voice and `Ethan` is a male voice. By default, if `spk` is not specified, the default voice type is `Chelsie`.

 ```python
--- a/docs/source/en/model_doc/roberta-prelayernorm.md
+++ b/docs/source/en/model_doc/roberta-prelayernorm.md
@ -35,7 +35,7 @@ The original code can be found [here](https://github.com/princeton-nlp/DinkyTrai

 ## Usage tips

- The implementation is the same as [Roberta](roberta) except instead of using *Add and Norm* it does *Norm and Add*. *Add* and *Norm* refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://huggingface.co/papers/1706.03762).
+- The implementation is the same as [Roberta](roberta) except instead of using _Add and Norm_ it does _Norm and Add_. _Add_ and _Norm_ refers to the Addition and LayerNormalization as described in [Attention Is All You Need](https://huggingface.co/papers/1706.03762).
 - This is identical to using the `--encoder-normalize-before` flag in [fairseq](https://fairseq.readthedocs.io/).

 ## Resources
--- a/docs/source/en/model_doc/rt_detr.md
+++ b/docs/source/en/model_doc/rt_detr.md
@ -40,8 +40,7 @@ The model version was contributed by [rafaelpadilla](https://huggingface.co/rafa

 ## Usage tips

-Initially, an image is processed using a pre-trained convolutional neural network, specifically a Resnet-D variant as referenced in the original code. This network extracts features from the final three layers of the architecture. Following this, a hybrid encoder is employed to convert the multi-scale features into a sequential array of image features. Then, a decoder, equipped with auxiliary prediction heads is used to refine the object queries. This process facilitates the direct generation of bounding boxes, eliminating the need for any additional post-processing to acquire the logits and coordinates for the bounding boxes. The model is meant to be used on images resized to a size 640x640 with the corresponding ImageProcessor. Reshaping to other sizes will generally degrade performance.
-
+Initially, an image is processed using a pre-trained convolutional neural network, specifically a Resnet-D variant as referenced in the original code. This network extracts features from the final three layers of the architecture. Following this, a hybrid encoder is employed to convert the multi-scale features into a sequential array of image features. Then, a decoder, equipped with auxiliary prediction heads is used to refine the object queries. This process facilitates the direct generation of bounding boxes, eliminating the need for any additional post-processing to acquire the logits and coordinates for the bounding boxes. The model is meant to be used on images resized to a size 640x640 with the corresponding ImageProcessor. Reshaping to other sizes will generally degrade performance. 
 ```py
 >>> import torch
 >>> import requests
--- a/docs/source/en/model_doc/rt_detr_v2.md
+++ b/docs/source/en/model_doc/rt_detr_v2.md
@ -43,7 +43,6 @@ This second version of RT-DETR improves how the decoder finds objects in an imag
 - **optimized processing** – improves how attention weights mix information

 The model is meant to be used on images resized to a size 640x640 with the corresponding ImageProcessor. Reshaping to other sizes will generally degrade performance.
-
 ```py
 >>> import torch
 >>> import requests
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -24,7 +24,6 @@ rendered properly in your Markdown viewer.
 </div>

 ## Overview
-
 [SmolVLM2](https://huggingface.co/papers/2504.05299) ([blog post](https://huggingface.co/blog/smolvlm2)) is an adaptation of the Idefics3 model with two main differences:

 - It uses SmolLM2 for the text model.
@ -194,21 +193,17 @@ print(generated_texts[0])
    - forward

 ## SmolVLMImageProcessor
-
 [[autodoc]] SmolVLMImageProcessor
    - preprocess

 ## SmolVLMImageProcessorFast
-
 [[autodoc]] SmolVLMImageProcessorFast
    - preprocess

 ## SmolVLMVideoProcessor
-
 [[autodoc]] SmolVLMVideoProcessor
    - preprocess

 ## SmolVLMProcessor
-
 [[autodoc]] SmolVLMProcessor
    - __call__
--- a/docs/source/en/model_doc/speech-encoder-decoder.md
+++ b/docs/source/en/model_doc/speech-encoder-decoder.md
@ -33,7 +33,7 @@ Alexis Conneau.

 An example of how to use a [`SpeechEncoderDecoderModel`] for inference can be seen in [Speech2Text2](speech_to_text_2).

-## Randomly initializing `SpeechEncoderDecoderModel` from model configurations
+## Randomly initializing `SpeechEncoderDecoderModel` from model configurations.

 [`SpeechEncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`Wav2Vec2Model`] configuration for the encoder
 and the default [`BertForCausalLM`] configuration for the decoder.
@ -48,7 +48,7 @@ and the default [`BertForCausalLM`] configuration for the decoder.
 >>> model = SpeechEncoderDecoderModel(config=config)
 ```

-## Initialising `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder
+## Initialising `SpeechEncoderDecoderModel` from a pretrained encoder and a pretrained decoder.

 [`SpeechEncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained Transformer-based speech model, *e.g.* [Wav2Vec2](wav2vec2), [Hubert](hubert) can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
 Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
@ -63,7 +63,7 @@ To do so, the `SpeechEncoderDecoderModel` class provides a [`SpeechEncoderDecode
 ... )
 ```

-## Loading an existing `SpeechEncoderDecoderModel` checkpoint and perform inference
+## Loading an existing `SpeechEncoderDecoderModel` checkpoint and perform inference.

 To load fine-tuned checkpoints of the `SpeechEncoderDecoderModel` class, [`SpeechEncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.

--- a/docs/source/en/model_doc/starcoder2.md
+++ b/docs/source/en/model_doc/starcoder2.md
@ -31,7 +31,6 @@ StarCoder2 is a family of open LLMs for code and comes in 3 different sizes with
 The abstract of the paper is the following:

 > The BigCode project, an open-scientific collaboration focused on the responsible development of Large Language Models for Code (Code LLMs), introduces StarCoder2. In partnership with Software Heritage (SWH), we build The Stack v2 on top of the digital commons of their source code archive. Alongside the SWH repositories spanning 619 programming languages, we carefully select other high-quality data sources, such as GitHub pull requests, Kaggle notebooks, and code documentation. This results in a training set that is 4x larger than the first StarCoder dataset. We train StarCoder2 models with 3B, 7B, and 15B parameters on 3.3 to 4.3 trillion tokens and thoroughly evaluate them on a comprehensive set of Code LLM benchmarks. We find that our small model, StarCoder2-3B, outperforms other Code LLMs of similar size on most benchmarks, and also outperforms StarCoderBase-15B. Our large model, StarCoder2- 15B, significantly outperforms other models of comparable size. In addition, it matches or outperforms CodeLlama-34B, a model more than twice its size. Although DeepSeekCoder- 33B is the best-performing model at code completion for high-resource languages, we find that StarCoder2-15B outperforms it on math and code reasoning benchmarks, as well as several low-resource languages. We make the model weights available under an OpenRAIL license and ensure full transparency regarding the training data by releasing the SoftWare Heritage persistent IDentifiers (SWHIDs) of the source code data.
->
 ## License

 The models are licensed under the [BigCode OpenRAIL-M v1 license agreement](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement).
--- a/docs/source/en/model_doc/superglue.md
+++ b/docs/source/en/model_doc/superglue.md
@ -88,16 +88,16 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    import torch
    from PIL import Image
    import requests
-
+    
    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor")
    model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
-
+    
    # SuperGlue requires pairs of images
    images = [image1, image2]
    inputs = processor(images, return_tensors="pt")
    with torch.inference_mode():
        outputs = model(**inputs)
-
+    
    # Extract matching information
    keypoints0 = outputs.keypoints0  # Keypoints in first image
    keypoints1 = outputs.keypoints1  # Keypoints in second image
@ -112,7 +112,7 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    # Process outputs for visualization
    image_sizes = [[(image.height, image.width) for image in images]]
    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-
+    
    for i, output in enumerate(processed_outputs):
        print(f"For the image pair {i}")
        for keypoint0, keypoint1, matching_score in zip(
@ -147,13 +147,6 @@ processed_outputs = processor.post_process_keypoint_matching(outputs, image_size
    - post_process_keypoint_matching
    - visualize_keypoint_matching

-## SuperGlueImageProcessorFast
-
-[[autodoc]] SuperGlueImageProcessorFast
-    - preprocess
-    - post_process_keypoint_matching
-    - visualize_keypoint_matching
-
 ## SuperGlueForKeypointMatching

 [[autodoc]] SuperGlueForKeypointMatching
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Marc Sun	72d8e7bb3c	Merge remote-tracking branch 'origin/main' into serve-quantization	2025-10-15 12:19:48 +00:00
Marc Sun	747fcfa227	rm check for now	2025-10-01 16:44:42 +00:00
Marc Sun	a6506fa478	style	2025-10-01 16:28:58 +00:00
Marc Sun	72ffb3d1d2	fix	2025-10-01 16:28:29 +00:00
Marc Sun	f525309408	minor doc fix	2025-10-01 15:54:05 +00:00
Marc Sun	ffa68ba7b8	fix args	2025-10-01 15:48:21 +00:00
Marc Sun	eab734d23c	fix	2025-10-01 15:10:55 +00:00
Marc Sun	b604f62b6b	Merge remote-tracking branch 'origin/main' into serve-quantization	2025-10-01 13:32:42 +00:00
Marc Sun	35fff29efd	fix	2025-10-01 13:31:05 +00:00
Marc Sun	1cdd0bf0fb	fix	2025-10-01 13:30:56 +00:00
Marc Sun	907f206a1b	fix api	2025-10-01 13:25:59 +00:00
Marc Sun	86ba65350b	doc	2025-10-01 12:47:38 +00:00