try

2025-11-04 20:14:36 +08:00 · 2025-04-30 15:10:49 +02:00
763 changed files with 55237 additions and 47663 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,18 +7,6 @@ parameters:
    nightly:
        type: boolean
        default: false
-    GHA_Actor:
-        type: string
-        default: ""
-    GHA_Action:
-        type: string
-        default: ""
-    GHA_Event:
-        type: string
-        default: ""
-    GHA_Meta:
-        type: string
-        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
@ -43,12 +31,8 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: git branch
-            - run: git log -n 1
-            - run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
-            - run: echo $(cat pr_number.txt)
-            - run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
-            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
+            - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
+            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
            - run: cat github.txt
            - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
            - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -28,8 +28,6 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
-    # will be adjust in `CircleCIJob.to_dict`.
-    "RUN_FLAKY": True,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
 COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -110,7 +108,6 @@ class CircleCIJob:
            print(f"Using {self.docker_image} docker image")
        if self.install_steps is None:
            self.install_steps = ["uv venv && uv pip install ."]
-        self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
        if self.pytest_options is None:
            self.pytest_options = {}
        if isinstance(self.tests_to_run, str):
@ -129,8 +126,6 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
-        # Do not run tests decorated by @is_flaky on pull requests
-        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -398,12 +393,7 @@ def create_circleci_config(folder=None):
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            # Only used to accept the parameters from GitHub Actions trigger
-            "GHA_Actor": {"type": "string", "default": ""},
-            "GHA_Action": {"type": "string", "default": ""},
-            "GHA_Event": {"type": "string", "default": ""},
-            "GHA_Meta": {"type": "string", "default": ""},
-            "tests_to_run": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ''},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -29,7 +29,7 @@ jobs:
  run_models_gpu:
    name: " "
    runs-on:
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -107,7 +107,7 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -59,7 +59,7 @@ jobs:
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
+                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
                  }
                }
              ]
--- a/.github/workflows/pr-style-bot.yml
+++ b/.github/workflows/pr-style-bot.yml
@ -1,19 +0,0 @@
-# To run this bot, comment "@bot /style" on a PR
-name: Style Bot
-
-on:
-  issue_comment:
-    types: [created]
-
-permissions:
-  contents: write
-  pull-requests: write
-
-jobs:
-  style:
-    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
-    with:
-      python_quality_dependencies: "[quality]"
-      style_command_type: "default"
-    secrets:
-      bot_token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -145,7 +145,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
          gh api \
            --method POST \
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,7 +239,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,7 +338,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -49,7 +49,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -107,7 +107,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -125,7 +125,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -143,7 +143,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -177,7 +177,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -211,7 +211,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -246,7 +246,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -280,7 +280,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -314,7 +314,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -349,7 +349,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -411,7 +411,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -448,7 +448,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -491,7 +491,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -35,7 +35,7 @@ jobs:
        shell: bash
        run: |
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
--- a/.github/workflows/trigger_circleci.yml
+++ b/.github/workflows/trigger_circleci.yml
@ -1,20 +0,0 @@
-name: Trigger CircleCI
-
-
-on:
-  pull_request_target:
-    types: [ready_for_review]
-
-
-jobs:
-  trigger-circleci:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: trigger CircleCI pipeline via GitHub Actions
-        uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.2.0
-        with:
-          GHA_Meta: "Trigger via GitHub Actions"
-          target-slug: "github/huggingface/transformers"
-          target-branch: "pull/${{ github.event.number }}/head"
-        env:
-          CCI_TOKEN: ${{ secrets.CIRCLECI_PAT }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers env
+transformers-cli env
 ```

 You can also run the same command from the root of the repository:
--- a/2
+++ b/2
@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py --fix_and_overwrite
+	python utils/check_modular_conversion.py  --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite
--- a/README.md
+++ b/README.md
@ -78,6 +78,7 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
 # venv
 python -m venv .my-env
 source .my-env/bin/activate
+
 # uv
 uv venv .my-env
 source .my-env/bin/activate
@ -87,10 +88,10 @@ Install Transformers in your virtual environment.

 ```py
 # pip
-pip install "transformers[torch]"
+pip install transformers

 # uv
-uv pip install "transformers[torch]"
+uv pip install transformers
 ```

 Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -98,7 +99,7 @@ Install Transformers from source if you want the latest changes in the library o
 ```shell
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .[torch]
+pip install .
 ```

 ## Quickstart
@ -120,7 +121,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
 > [!TIP]
 > You can also chat with a model directly from the command line.
 > ```shell
-> transformers chat Qwen/Qwen2.5-0.5B-Instruct
+> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
 > ```

 ```py
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -1,54 +0,0 @@
-version: '3'
-services:
-  memcached:
-    image: memcached:1.6.29
-    container_name: memcached
-    ports:
-      - "11211:11211"
-    environment:
-      - MEMCACHED_MAX_MEMORY=64m  # Set the maximum memory usage
-      - MEMCACHED_THREADS=4       # Number of threads to use
-
-  prometheus:
-    image: prom/prometheus:latest
-    command:
-      - "--config.file=/etc/prometheus/prometheus.yml"
-      - --web.enable-otlp-receiver  # Enable OTLP receiver
-      - --web.enable-remote-write-receiver
-      - --enable-feature=exemplar-storage
-      - --enable-feature=native-histograms
-    volumes:
-      - ./prometheus.yml:/etc/prometheus/prometheus.yml
-    ports:
-      - "9090:9090"
-
-  tempo:
-    image: grafana/tempo:latest
-    command: [ "-config.file=/etc/tempo.yaml" ]
-    volumes:
-      - ./tempo.yaml:/etc/tempo.yaml
-    ports:
-      - "14268:14268"  # jaeger ingest
-      - "3200:3200"   # tempo
-      - "9095:9095" # tempo grpc
-      - "4317:4317"  # otlp grpc
-      - "4318:4318"  # otlp http
-      - "9411:9411"   # zipkin
-    depends_on:
-      - memcached
-
-  grafana:
-    image: grafana/grafana:latest
-    volumes:
-      - ./grafana-datasources.yaml:/etc/grafana/provisioning/datasources/datasources.yaml
-    environment:
-      - GF_AUTH_ANONYMOUS_ENABLED=true
-      - GF_AUTH_ANONYMOUS_ORG_ROLE=Admin
-      - GF_AUTH_DISABLE_LOGIN_FORM=true
-      - GF_FEATURE_TOGGLES_ENABLE=traceqlEditor metricsSummary
-      - GF_INSTALL_PLUGINS=https://storage.googleapis.com/integration-artifacts/grafana-exploretraces-app/grafana-exploretraces-app-latest.zip;grafana-traces-app
-    ports:
-      - "3000:3000"
-    depends_on:
-      - prometheus
-      - tempo
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -71,9 +71,6 @@ RUN python3 -m pip install --no-cache-dir g2p-en
 # For Some bitsandbytes tests
 RUN python3 -m pip install --no-cache-dir einops

-# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
-RUN python3 -m pip uninstall -y kernels
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@ -45,9 +45,6 @@ RUN python3 -m pip uninstall -y deepspeed
 # TODO: Find out why test fail.
 RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

-# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
-RUN python3 -m pip uninstall -y kernels
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@ -57,9 +57,6 @@ RUN python3 -m pip uninstall -y deepspeed
 #RUN git clone https://github.com/pytorch/TensorRT.git
 #RUN cd TensorRT/py && python3 setup.py install --fx-only

-# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
-RUN python3 -m pip uninstall -y kernels
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -28,9 +28,6 @@ RUN python3 -m pip uninstall -y tensorflow flax
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

-# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
-RUN python3 -m pip uninstall -y kernels
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -90,9 +90,6 @@ RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
 # Add transformers in editable mode
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

-# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
-RUN python3 -m pip uninstall -y kernels
-
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
 RUN cd transformers && python3 setup.py develop
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
 1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
   Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem
   `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
-   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
+   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). 
 2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
   beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
   Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
 ein bestehendes Modell:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
 Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:

 ```bash
-transformers env
+transformers-cli env
 ```

 Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -21,8 +21,6 @@
      title: Adding a new model to Transformers
    - local: modular_transformers
      title: Modular Transformers
-    - local: auto_docstring
-      title: Document your models
    - local: task_summary
      title: What 🤗 Transformers can do
    - local: tasks_explained
@ -39,8 +37,6 @@
      title: Tokenizers
    - local: image_processors
      title: Image processors
-    - local: video_processors
-      title: Video processors
    - local: backbones
      title: Backbones
    - local: feature_extractors
@ -153,8 +149,6 @@
      title: TPU
    - local: perf_train_special
      title: Apple Silicon
-    - local: perf_train_gaudi
-      title: Intel Gaudi
    - local: perf_hardware
      title: Build your own machine
    title: Hardware
@ -364,9 +358,7 @@
      title: Feature Extractor
    - local: main_classes/image_processor
      title: Image Processor
-    - local: main_classes/video_processor
-      title: Video Processor
-    title: Main Classes
+    title: Main classes
  - sections:
    - sections:
      - local: model_doc/albert
@ -501,8 +493,6 @@
        title: Granite
      - local: model_doc/granitemoe
        title: GraniteMoe
-      - local: model_doc/granitemoehybrid
-        title: GraniteMoeHybrid
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
      - local: model_doc/helium
@ -831,8 +821,6 @@
        title: Bark
      - local: model_doc/clap
        title: CLAP
-      - local: model_doc/csm
-        title: CSM
      - local: model_doc/dac
        title: dac
      - local: model_doc/encodec
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
 Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 ## Create a pull request
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting

 ## Adapt the model code

-The `transformers add-new-model-like` command should have generated a model and configuration file.
+The `transformers-cli add-new-model-like` command should have generated a model and configuration file.

 - `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
 - `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement

 If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.

-Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
+Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.

 ```bash
-transformers add-fast-image-processor --model-name your_model_name
+transformers-cli add-fast-image-processor --model-name your_model_name
 ```

 This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -108,7 +108,7 @@ If in doubt about what args/kwargs a given model sends to the attention function
 ## Accessing current available implementations

 Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
-and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+and/or perform a few checks, the prefered way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
 would expect from a usual Python dictionary:

 ```python
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -1,279 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Utilizing the @auto_docstring Decorator
-
-The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
-
---
-
-## 📜 How it Works
-
-The `@auto_docstring` decorator constructs docstrings by:
-
-1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
-2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
-3.  **Overriding or Adding Arguments Descriptions:**
-    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
-    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-4.  **Adding Classes and Functions Introduction:**
-    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
-    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
-5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
-6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
-7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extracts field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
-8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
-
-
---
-
-## 🚀 How to Use @auto_docstring
-
-### 1. Importing the Decorator
-Import the decorator into your modeling file:
-
-```python
-from ...utils import auto_docstring
-```
-
-### 2. Applying to Classes
-Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
-
-```python
-from transformers.modeling_utils import PreTrainedModel
-from ...utils import auto_docstring
-
-@auto_docstring
-class MyAwesomeModel(PreTrainedModel):
-    def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
-        r"""
-        custom_parameter (`int`, *optional*, defaults to 10):
-            Description of the custom_parameter for MyAwesomeModel.
-        another_custom_arg (`str`, *optional*, defaults to "default"):
-            Documentation for another unique argument.
-        """
-        super().__init__(config)
-        self.custom_parameter = custom_parameter
-        self.another_custom_arg = another_custom_arg
-        # ... rest of your init
-
-    # ... other methods
-```
-
-#### Advanced Class Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control:
-
-```python
-@auto_docstring(
-    custom_intro="""This model performs specific synergistic operations.
-    It builds upon the standard Transformer architecture with unique modifications.""",
-    custom_args="""
-    custom_parameter (`type`, *optional*, defaults to `default_value`):
-        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
-    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
-    """
-)
-class MySpecialModel(PreTrainedModel):
-    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
-        # ...
-```
-
-Or:
-
-```python
-@auto_docstring(
-    custom_intro="""This model performs specific synergistic operations.
-    It builds upon the standard Transformer architecture with unique modifications.""",
-)
-class MySpecialModel(PreTrainedModel):
-    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
-        r"""
-        custom_parameter (`type`, *optional*, defaults to `default_value`):
-            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
-        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
-        """
-        # ...
-```
-
-### 3. Applying to Functions (e.g., `forward` method)
-Apply the decorator above method definitions, such as the `forward` method.
-
-```python
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        new_custom_argument: Optional[torch.Tensor] = None,
-        arg_documented_in_args_doc: Optional[torch.Tensor] = None,
-        # ... other arguments
-    ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
-        r"""
-        new_custom_argument (`torch.Tensor`, *optional*):
-            Description of this new custom argument and its expected shape or type.
-        """
-        # ...
-```
-
-#### Advanced Function Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
-
-```python
-MODEL_COMMON_CUSTOM_ARGS = r"""
-    common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
-        Description of common_arg_1
-    common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
-        Description of common_arg_2
-    ...
-"""
-
-class MyModel(PreTrainedModel):
-    # ...
-    @auto_docstring(
-        custom_intro="""
-        This is a custom introduction for the function.
-        """
-        custom_args=MODEL_COMMON_CUSTOM_ARGS
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        common_arg_1: Optional[torch.Tensor] = None,
-        common_arg_2: Optional[torch.Tensor] = None,
-        #...
-        function_specific_argument: Optional[torch.Tensor] = None,
-        # ... other arguments
-    ) -> torch.Tensor:
-        r"""
-        function_specific_argument (`torch.Tensor`, *optional*):
-            Description of an argument specific to this function
-
-        Returns:
-            `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
-
-        Example:
-
-        (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
-
-        ```python
-        ...
-        ```
-        """
-        # ...
-```
-
---
-
-### ✍️ Documenting Arguments: Approach & Priority
-
-1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
-    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
-
-2.  **New or Custom Arguments:**
-    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
-    * **Format:**
-        ```
-        argument_name (`type`, *optional*, defaults to `X`):
-            Description of the argument.
-            Explain its purpose, expected shape/type if complex, and default behavior.
-            This can span multiple lines.
-        ```
-    * Include `type` in backticks.
-    * Add "*optional*" if the argument is not required (has a default value).
-    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
-
-3.  **Overriding Standard Arguments:**
-    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
-    * The `labels` argument is often customized per model and typically requires a specific docstring.
-
-4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
-    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-
---
-
-### Usage with [modular files](./modular_transformers)
-
-When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
-
- **For standalone models in modular files:**
-  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
-
- **For models inheriting from other library models:**
-  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
-  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
-
-  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
-
-
-**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
-
---
-
-## ✅ Checking Your Docstrings with `check_auto_docstrings`
-
-The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
-
-#### What it Checks:
-
-* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
-* **Argument Completeness & Consistency:**
-    * Flags arguments in the signature that are not known standard arguments and lack a local description.
-    * Ensures documented arguments exist in the signature. (TODO)
-    * Verifies that types and default values in the docstring match the signature. (TODO)
-* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
-* **Formatting:** Adherence to the expected docstring style.
-
-#### Running the Check Locally:
-
-Run this check locally before committing. The common command is:
-
-```bash
-make fix-copies
-```
-
-Alternatively, to only perform docstrings and auto-docstring checks, you can use:
-
-```bash
-python utils/check_docstrings.py # to only check files included in the diff without fixing them
-# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
-# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
-```
-
-#### Workflow with the Checker:
-
-1.  Add `@auto_docstring(...)` to the class or method.
-2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
-3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
-    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
-4.  Manually edit these placeholders with accurate types and descriptions.
-5.  Re-run the check to ensure all issues are resolved.
-
---
-
-## 🔑 Key Takeaways & Best Practices
-
-* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary for methods (e.g., `forward`, `get_text_features` etc.).
-* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
-* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
-* Document new or custom arguments clearly.
-* Run `check_docstrings` locally and iteratively.
-
-By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,28 +25,22 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers CLI
+## transformers-cli

-After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
+Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.

 ```bash
-transformers chat Qwen/Qwen2.5-0.5B-Instruct
+transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
 ```

 <div class="flex justify-center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
 </div>

-You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`
-
-```bash
-transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
-```
-
 For a full list of options, run the command below.

 ```bash
-transformers chat -h
+transformers-cli chat -h
 ```

 The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
@ -82,16 +76,16 @@ print(response[0]["generated_text"][-1]["content"])
 (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
 alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

-So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
-things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
-Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
-something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
 some wild stuff, like that Warhol guy's soup cans and all that jazz.

-And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
 those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

-Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
 even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

 And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -113,9 +107,9 @@ print(response[0]["generated_text"][-1]["content"])
 ```

 ```txt
-(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
-It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
-like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
 (sarcastically) Oh, yeah, real original, Andy.

 But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
--- a/docs/source/en/image_processors.md
+++ b/docs/source/en/image_processors.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # Image processors

-Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
+Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.

 - [`~BaseImageProcessor.center_crop`] to resize an image
 - [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
--- a/docs/source/en/internal/import_utils.md
+++ b/docs/source/en/internal/import_utils.md
@ -84,19 +84,6 @@ class Trainer:

 Backends that can be added here are all the backends that are available in the `import_utils.py` module.

-Additionally, specific versions can be specified in each backend. For example, this is how you would specify
-a requirement on torch>=2.6 on the `Trainer` class:
-
-```python
-from .utils.import_utils import requires
-
-@requires(backends=("torch>=2.6", "accelerate"))
-class Trainer:
-    ...
-```
-
-You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
-
 ## Methods

 [[autodoc]] utils.import_utils.define_import_structure
--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@ -20,13 +20,9 @@ rendered properly in your Markdown viewer.

 Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token.

-In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
+In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.

-> [!TIP]
-> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
-> ```shell
-> transformers chat Qwen/Qwen2.5-0.5B-Instruct
-> ```
+This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.

 ## Default generate

@ -138,20 +134,6 @@ outputs = model.generate(**inputs, generation_config=generation_config)
 print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ```

-## Common Options
-
-[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for a new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, ...
-
-| Option name | Type | Simplified description |
-|---|---|---|
-| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
-| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
-| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
-| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
-| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
-| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
-
-
 ## Pitfalls

 The section below covers some common issues you may encounter during text generation and how to solve them.
@ -304,4 +286,4 @@ Take a look below for some more specific and specialized text generation librari
 - [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
 - [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
 - [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
+- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
--- a/docs/source/en/main_classes/video_processor.md
+++ b/docs/source/en/main_classes/video_processor.md
@ -1,55 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-
-# Video Processor
-
-A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch. 
-
-The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
-
-When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't upadted your VLM, the processor will try to load video related configurations from a file named `preprocessing_config.json`.
-
-
-### Usage Example
-Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
-
-```python
-from transformers import AutoVideoProcessor
-
-processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
-```
-
-Currently, if using base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
-
-Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For even more speed improvement, we can compile the processor when using 'cuda' as device.
-
-```python
-import torch
-from transformers.video_utils import load_video
-from transformers import AutoVideoProcessor
-
-video = load_video("video.mp4")
-processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
-processor = torch.compile(processor)
-processed_video = processor(video, return_tensors="pt")
-```
-
-
-## BaseVideoProcessor
-
-[[autodoc]] video_processing_utils.BaseVideoProcessor
-
--- a/docs/source/en/model_doc/aria.md
+++ b/docs/source/en/model_doc/aria.md
@ -102,10 +102,6 @@ response = processor.decode(output_ids, skip_special_tokens=True)

 [[autodoc]] AriaTextModel

-## AriaModel
-
-[[autodoc]] AriaModel
-
 ## AriaTextForCausalLM

 [[autodoc]] AriaTextForCausalLM
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -74,10 +74,6 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its

 [[autodoc]] AutoImageProcessor

-## AutoVideoProcessor
-
-[[autodoc]] AutoVideoProcessor
-
 ## AutoProcessor

 [[autodoc]] AutoProcessor
--- a/docs/source/en/model_doc/aya_vision.md
+++ b/docs/source/en/model_doc/aya_vision.md
@ -237,10 +237,6 @@ for i, output in enumerate(batch_outputs):

 [[autodoc]] AyaVisionConfig

-## AyaVisionModel
-
-[[autodoc]] AyaVisionModel
-
 ## AyaVisionForConditionalGeneration

 [[autodoc]] AyaVisionForConditionalGeneration
--- a/docs/source/en/model_doc/beit.md
+++ b/docs/source/en/model_doc/beit.md
@ -151,12 +151,6 @@ If you're interested in submitting a resource to be included here, please feel f
    - preprocess
    - post_process_semantic_segmentation

-## BeitImageProcessorFast
-
-[[autodoc]] BeitImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-
 <frameworkcontent>
 <pt>

--- a/docs/source/en/model_doc/bert.md
+++ b/docs/source/en/model_doc/bert.md
@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
 ```

 </hfoption>
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran

 [[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput

-[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`

 <hfoptions id="usage">
 <hfoption id="Pipeline">
-
+    
 ```py
 import torch
 from transformers import pipeline
@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
 input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

 output = model.generate(
-    **input_ids,
+    **input_ids, 
    max_new_tokens=256,
    cache_implementation="static"
 )
@ -92,10 +92,10 @@ print(filled_text)
 ```

 </hfoption>
-<hfoption id="transformers CLI">
-
+<hfoption id="transformers-cli">
+    
 ```bash
-echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
 ```

 </hfoption>
@ -146,7 +146,7 @@ visualizer("""def func(a, b):
 - Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
    ```py
    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
-
+    
    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
    PROMPT = '''def remove_non_ascii(s: str) -> str:
@ -155,7 +155,7 @@ visualizer("""def func(a, b):
    '''
    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
    generated_ids = model.generate(input_ids, max_new_tokens=128)
-
+    
    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
    print(PROMPT.replace("<FILL_ME>", filling))
    ```
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
+    input_ids, 
+    max_new_tokens=100, 
+    do_sample=True, 
    temperature=0.3,
    cache_implementation="static",
 )
@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
+transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
+    input_ids, 
+    max_new_tokens=100, 
+    do_sample=True, 
    temperature=0.3,
    cache_implementation="static",
 )
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -1,377 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Csm
-
-## Overview
-
-The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.
-
-**Model Architecture:**
-CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
-
-The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
-</div>
-
-## Usage Tips
-
-### Without Conversational Context
-
-CSM can be used to simply generate speech from a text prompt:
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs
-text = "[0]The past is just a story we tell ourselves." # `[0]` for speaker id 0
-inputs = processor(text, add_special_tokens=True).to(device)
-
-# another equivalent way to prepare the inputs
-conversation = [
-    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
-]
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-# infer the model
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, "example_without_context.wav")
-```
-
-### With Conversational Context
-
-CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-conversation = []
-
-# 1. context
-for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
-    conversation.append(
-        {
-            "role": f"{speaker_id}",
-            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
-        }
-    )
-
-# 2. text prompt
-conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-# infer the model
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, "example_with_context.wav")
-```
-
-### Batched Inference
-
-CSM supports batched inference!
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs 
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-# here a batch with two prompts
-conversation = [
-    [
-        {
-            "role": f"{ds[0]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[0]["text"]},
-                {"type": "audio", "path": ds[0]["audio"]["array"]},
-            ],
-        },
-        {
-            "role": f"{ds[1]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[1]["text"]},
-            ],
-        },
-    ],
-    [
-        {
-            "role": f"{ds[0]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[0]["text"]},
-            ],
-        }
-    ],
-]
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
-```
-
-### Making The Model Go Brrr
-
-CSM supports full-graph compilation with CUDA graphs!
-
-```python
-import torch
-import copy
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset
-
-model_id = "eustlb/csm-1b"
-device = "cuda"
-
-# set logs to ensure no recompilation and graph breaks
-torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# use static cache, enabling automatically torch compile with fullgraph and reduce-overhead
-model.generation_config.max_length = 250 # big enough to avoid recompilation
-model.generation_config.max_new_tokens = None # would take precedence over max_length
-model.generation_config.cache_implementation = "static"
-model.depth_decoder.generation_config.cache_implementation = "static"
-
-# generation kwargs
-gen_kwargs = {
-    "do_sample": False,
-    "depth_decoder_do_sample": False,
-    "temperature": 1.0,
-    "depth_decoder_temperature": 1.0,
-}
-
-# Define a timing decorator
-class TimerContext:
-    def __init__(self, name="Execution"):
-        self.name = name
-        self.start_event = None
-        self.end_event = None
-        
-    def __enter__(self):
-        # Use CUDA events for more accurate GPU timing
-        self.start_event = torch.cuda.Event(enable_timing=True)
-        self.end_event = torch.cuda.Event(enable_timing=True)
-        self.start_event.record()
-        return self
-
-    def __exit__(self, *args):
-        self.end_event.record()
-        torch.cuda.synchronize()
-        elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
-        print(f"{self.name} time: {elapsed_time:.4f} seconds")
-
-# prepare the inputs 
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-
-conversation = [
-    {
-        "role": f"{ds[0]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[0]["text"]},
-            {"type": "audio", "path": ds[0]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[1]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[1]["text"]},
-            {"type": "audio", "path": ds[1]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[2]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[2]["text"]},
-        ],
-    },
-]
-
-padded_inputs_1 = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-print("\n" + "="*50)
-print("First generation - compiling and recording CUDA graphs...")
-with TimerContext("First generation"):
-    _ = model.generate(**padded_inputs_1, **gen_kwargs)
-print("="*50)
-
-print("\n" + "="*50)
-print("Second generation - fast !!!")
-with TimerContext("Second generation"):
-    _ = model.generate(**padded_inputs_1, **gen_kwargs)
-print("="*50)
-
-# now with different inputs
-conversation = [
-    {
-        "role": f"{ds[0]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[2]["text"]},
-            {"type": "audio", "path": ds[2]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[1]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[3]["text"]},
-            {"type": "audio", "path": ds[3]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[2]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[4]["text"]},
-        ],
-    },
-]
-padded_inputs_2 = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-print("\n" + "="*50)
-print("Generation with other inputs!")
-with TimerContext("Generation with different inputs"):
-    _ = model.generate(**padded_inputs_2, **gen_kwargs)
-print("="*50)
-```
-
-### Training
-
-CSM Transformers integration supports training!
-
-```python
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-model.train()
-
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-conversation = []
-
-# context
-for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
-    conversation.append(
-        {
-            "role": f"{speaker_id}",
-            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
-        }
-    )
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-    output_labels=True,
-).to(device)
-
-out = model(**inputs)
-out.loss.backward()
-```
-
-This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
-The original code can be found [here](https://github.com/SesameAILabs/csm).
-
-
-## CsmConfig
-
-[[autodoc]] CsmConfig
-
-## CsmDepthDecoderConfig
-
-[[autodoc]] CsmDepthDecoderConfig
-
-## CsmProcessor
-
-[[autodoc]] CsmProcessor
-    - __call__
-
-## CsmForConditionalGeneration
-
-[[autodoc]] CsmForConditionalGeneration
-    - forward
-    - generate
-
-## CsmDepthDecoderForCausalLM
-
-[[autodoc]] CsmDepthDecoderForCausalLM
-
-## CsmDepthDecoderModel
-
-[[autodoc]] CsmDepthDecoderModel
-
-## CsmBackboneModel
-
-[[autodoc]] CsmBackboneModel
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@ -111,68 +111,33 @@ print("Predicted class:", model.config.id2label[predicted_class_idx])

 ## Notes

- The example below shows how to split the output tensor into:
-  - one embedding for the whole image, commonly referred to as a `CLS` token,
-    useful for classification and retrieval
-  - a set of local embeddings, one for each `14x14` patch of the input image,
-    useful for dense tasks, such as semantic segmentation
+- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speedup inference. However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.

-  ```py
-  from transformers import AutoImageProcessor, AutoModel
-  from PIL import Image
-  import requests
-  
-  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-  image = Image.open(requests.get(url, stream=True).raw)
-  print(image.height, image.width)  # [480, 640]
-  
-  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-  model = AutoModel.from_pretrained('facebook/dinov2-base')
-  patch_size = model.config.patch_size
-  
-  inputs = processor(images=image, return_tensors="pt")
-  print(inputs.pixel_values.shape)  # [1, 3, 224, 224]
-  batch_size, rgb, img_height, img_width = inputs.pixel_values.shape
-  num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size
-  num_patches_flat = num_patches_height * num_patches_width
-  
-  outputs = model(**inputs)
-  last_hidden_states = outputs[0]
-  print(last_hidden_states.shape)  # [1, 1 + 256, 768]
-  assert last_hidden_states.shape == (batch_size, 1 + num_patches_flat, model.config.hidden_size)
-  
-  cls_token = last_hidden_states[:, 0, :]
-  patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))
-  ```
+    ```py
+    import torch
+    from transformers import AutoImageProcessor, AutoModel
+    from PIL import Image
+    import requests

- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speedup inference.
-  However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.
+    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    image = Image.open(requests.get(url, stream=True).raw)

-  ```py
-  import torch
-  from transformers import AutoImageProcessor, AutoModel
-  from PIL import Image
-  import requests
-  
-  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-  image = Image.open(requests.get(url, stream=True).raw)
-  
-  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-  model = AutoModel.from_pretrained('facebook/dinov2-base')
-  
-  inputs = processor(images=image, return_tensors="pt")
-  outputs = model(**inputs)
-  last_hidden_states = outputs[0]
-  
-  # We have to force return_dict=False for tracing
-  model.config.return_dict = False
-  
-  with torch.no_grad():
-      traced_model = torch.jit.trace(model, [inputs.pixel_values])
-      traced_outputs = traced_model(inputs.pixel_values)
-  
-  print((last_hidden_states - traced_outputs[0]).abs().max())
-  ```
+    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+    model = AutoModel.from_pretrained('facebook/dinov2-base')
+
+    inputs = processor(images=image, return_tensors="pt")
+    outputs = model(**inputs)
+    last_hidden_states = outputs[0]
+
+    # We have to force return_dict=False for tracing
+    model.config.return_dict = False
+
+    with torch.no_grad():
+        traced_model = torch.jit.trace(model, [inputs.pixel_values])
+        traced_outputs = traced_model(inputs.pixel_values)
+
+    print((last_hidden_states - traced_outputs[0]).abs().max())
+    ```

 ## Dinov2Config

--- a/docs/source/en/model_doc/distilbert.md
+++ b/docs/source/en/model_doc/distilbert.md
@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")

 </hfoption>

-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
+echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
 ```

 </hfoption>
@ -213,3 +213,7 @@ echo -e "I love using Hugging Face Transformers!" | transformers run --task text

 </jax>
 </frameworkcontent>
+
+
+
+
--- a/docs/source/en/model_doc/electra.md
+++ b/docs/source/en/model_doc/electra.md
@ -45,9 +45,9 @@ import torch
 from transformers import pipeline

 classifier = pipeline(
-    task="text-classification",
-    model="bhadresh-savani/electra-base-emotion",
-    torch_dtype=torch.float16,
+    task="text-classification", 
+    model="bhadresh-savani/electra-base-emotion", 
+    torch_dtype=torch.float16, 
    device=0
 )
 classifier("This restaurant has amazing food!")
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/electra-base-emotion",
 )
 model = AutoModelForSequenceClassification.from_pretrained(
-    "bhadresh-savani/electra-base-emotion",
+    "bhadresh-savani/electra-base-emotion", 
    torch_dtype=torch.float16
 )
 inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
+echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
 ```

 </hfoption>
@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers run --task text-class

    ```py
    # Example of properly handling padding with attention masks
-    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
-                    padding=True,
+    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"], 
+                    padding=True, 
                    return_tensors="pt")
    outputs = model(**inputs)  # automatically uses the attention_mask
    ```
-
+    
 - When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).

 ## ElectraConfig
--- a/docs/source/en/model_doc/emu3.md
+++ b/docs/source/en/model_doc/emu3.md
@ -174,10 +174,6 @@ for i, image in enumerate(images['pixel_values']):
 [[autodoc]] Emu3TextModel
    - forward

-## Emu3Model
-
-[[autodoc]] Emu3Model
-
 ## Emu3ForCausalLM

 [[autodoc]] Emu3ForCausalLM
--- a/docs/source/en/model_doc/falcon.md
+++ b/docs/source/en/model_doc/falcon.md
@ -41,7 +41,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    task="text-generation",
+    task="text-generation", 
    model="tiiuae/falcon-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ## FalconForQuestionAnswering

 [[autodoc]] FalconForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -39,7 +39,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    "text-generation",
+    "text-generation", 
    model="tiiuae/falcon-mamba-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -73,10 +73,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
+transformers-cli chat --model_name_or_path tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -103,10 +103,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

 [[autodoc]] FuyuConfig

-## FuyuModel
-
-[[autodoc]] FuyuModel
-
 ## FuyuForCausalLM

 [[autodoc]] FuyuForCausalLM
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -80,10 +80,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
+echo -e "LLMs generate text through a process known as" | transformers-cli run --task text-generation --model google/gemma-2b --device 0
 ```

 </hfoption>
@ -114,8 +114,8 @@ model = AutoModelForCausalLM.from_pretrained(
 input_text = "LLMs generate text through a process known as."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 outputs = model.generate(
-    **input_ids,
-    max_new_tokens=50,
+    **input_ids, 
+    max_new_tokens=50, 
    cache_implementation="static"
 )
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -127,7 +127,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer

 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("LLMs generate text through a process known as")
+visualizer("LLMs generate text through a process known as") 
 ```

 <div class="flex justify-center">
--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -58,7 +58,7 @@ pipe("Explain quantum computing simply. ", max_new_tokens=50)

 </hfoption>
 <hfoption id="AutoModel">
-
+    
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
@ -80,16 +80,16 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```
-echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0
+echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model google/gemma-2-2b --device 0
 ```
 </hfoption>
 </hfoptions>

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
+	
 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.

 ```python
@ -118,7 +118,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 ```python
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer
 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("You are an assistant. Make sure you print me")
+visualizer("You are an assistant. Make sure you print me") 
 ```

 <div class="flex justify-center">
@ -137,7 +137,7 @@ visualizer("You are an assistant. Make sure you print me")

    inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
    max_generated_length = inputs.input_ids.shape[1] + 10
-    past_key_values = HybridCache(config=model.config, max_batch_size=1,
+    past_key_values = HybridCache(config=model.config, max_batch_size=1, 
    max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    ```
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@ -28,7 +28,7 @@ rendered properly in your Markdown viewer.

 The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.

-You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) release.
+You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/meta-llama/llama-2-family-661da1f90a9d678b6f55773b) release.

 > [!TIP]
 > Click on the Gemma 3 models in the right sidebar for more examples of how to apply Gemma to different vision and language tasks.
@ -99,10 +99,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model google/gemma-3-1b-pt --device 0
 ```

 </hfoption>
@ -254,10 +254,6 @@ visualizer("<img>What is shown in this image?")
 [[autodoc]] Gemma3TextModel
    - forward

-## Gemma3Model
-
-[[autodoc]] Gemma3Model
-
 ## Gemma3ForCausalLM

 [[autodoc]] Gemma3ForCausalLM
--- a/docs/source/en/model_doc/got_ocr2.md
+++ b/docs/source/en/model_doc/got_ocr2.md
@ -277,10 +277,6 @@ alt="drawing" width="600"/>

 [[autodoc]] GotOcr2Processor

-## GotOcr2Model
-
-[[autodoc]] GotOcr2Model
-
 ## GotOcr2ForConditionalGeneration

 [[autodoc]] GotOcr2ForConditionalGeneration
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -64,21 +64,15 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Hello, I'm a language model" | transformers run --task text-generation --model openai-community/gpt2 --device 0
+echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model openai-community/gpt2 --device 0
 ```

 </hfoption>
 </hfoptions>

-One can also serve the model using vLLM with the `transformers backend`.
-
-```
-vllm serve openai-community/gpt2 --model-imp transformers
-```
-
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
@ -88,16 +82,16 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

 quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype="float16",
-    bnb_4bit_use_double_quant=True
+    load_in_4bit=True,  
+    bnb_4bit_quant_type="nf4",  
+    bnb_4bit_compute_dtype="float16",  
+    bnb_4bit_use_double_quant=True 
 )

 model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2-xl",
    quantization_config=quantization_config,
-    device_map="auto"
+    device_map="auto"  
 )

 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
--- a/docs/source/en/model_doc/granitemoehybrid.md
+++ b/docs/source/en/model_doc/granitemoehybrid.md
@ -1,64 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# GraniteMoeHybrid
-
-## Overview
-
-
-The `GraniteMoeHybrid` model builds on top of `GraniteMoeSharedModel` and `Bamba`. Its decoding layers consist of state space layers or MoE attention layers with shared experts. By default, the attention layers do not use positional encoding.
-
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_path = "ibm-granite/granite-4.0-tiny-preview"
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-# drop device_map if running on CPU
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
-model.eval()
-
-# change input text as desired
-prompt = "Write a code to find the maximum value in a list of numbers."
-
-# tokenize the text
-input_tokens = tokenizer(prompt, return_tensors="pt")
-# generate output tokens
-output = model.generate(**input_tokens, max_new_tokens=100)
-# decode output tokens into text
-output = tokenizer.batch_decode(output)
-# loop over the batch to print, in this example the batch size is 1
-for i in output:
-    print(i)
-```
-
-This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).
-
-
-## GraniteMoeHybridConfig
-
-[[autodoc]] GraniteMoeHybridConfig
-
-## GraniteMoeHybridModel
-
-[[autodoc]] GraniteMoeHybridModel
-    - forward
-
-## GraniteMoeHybridForCausalLM
-
-[[autodoc]] GraniteMoeHybridForCausalLM
-    - forward
--- a/docs/source/en/model_doc/instructblip.md
+++ b/docs/source/en/model_doc/instructblip.md
@ -69,10 +69,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
 [[autodoc]] InstructBlipQFormerModel
    - forward

-## InstructBlipModel
-
-[[autodoc]] InstructBlipModel
-
 ## InstructBlipForConditionalGeneration

 [[autodoc]] InstructBlipForConditionalGeneration
--- a/docs/source/en/model_doc/instructblipvideo.md
+++ b/docs/source/en/model_doc/instructblipvideo.md
@ -58,12 +58,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok

 [[autodoc]] InstructBlipVideoProcessor

-
-## InstructBlipVideoVideoProcessor
-
-[[autodoc]] InstructBlipVideoVideoProcessor
-    - preprocess
-
 ## InstructBlipVideoImageProcessor

 [[autodoc]] InstructBlipVideoImageProcessor
@ -79,10 +73,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
 [[autodoc]] InstructBlipVideoQFormerModel
    - forward

-## InstructBlipVideoModel
-[[autodoc]] InstructBlipVideoModel
-    - forward
-
 ## InstructBlipVideoForConditionalGeneration

 [[autodoc]] InstructBlipVideoForConditionalGeneration
--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@ -340,11 +340,6 @@ This example showcases how to handle a batch of chat conversations with interlea
 [[autodoc]] InternVLVisionModel
    - forward

-## InternVLModel
-
-[[autodoc]] InternVLModel
-    - forward
-
 ## InternVLForConditionalGeneration

 [[autodoc]] InternVLForConditionalGeneration
@ -353,7 +348,3 @@ This example showcases how to handle a batch of chat conversations with interlea
 ## InternVLProcessor

 [[autodoc]] InternVLProcessor
-
-## InternVLVideoProcessor
-
-[[autodoc]] InternVLVideoProcessor
--- a/docs/source/en/model_doc/jamba.md
+++ b/docs/source/en/model_doc/jamba.md
@ -75,10 +75,10 @@ output = model.generate(**input_ids, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model huggyllama/llama-7b --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model huggyllama/llama-7b --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-transformers chat meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
+transformers-cli chat --model_name_or_path meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -175,3 +175,4 @@ visualizer("Plants create energy through a process known as")

 [[autodoc]] LlamaForSequenceClassification
    - forward
+
--- a/docs/source/en/model_doc/llava.md
+++ b/docs/source/en/model_doc/llava.md
@ -256,10 +256,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 [[autodoc]] LlavaProcessor

-## LlavaModel
-
-[[autodoc]] LlavaModel
-
 ## LlavaForConditionalGeneration

 [[autodoc]] LlavaForConditionalGeneration
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -315,10 +315,6 @@ model = AutoModelForImageTextToText.from_pretrained(

 [[autodoc]] LlavaNextProcessor

-## LlavaNextModel
-
-[[autodoc]] LlavaNextModel
-
 ## LlavaNextForConditionalGeneration

 [[autodoc]] LlavaNextForConditionalGeneration
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
@ -262,14 +262,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(

 [[autodoc]] LlavaNextVideoImageProcessor

-## LlavaNextVideoVideoProcessor
-
-[[autodoc]] LlavaNextVideoVideoProcessor
-
-## LlavaNextVideoModel
-
-[[autodoc]] LlavaNextVideoModel
-
 ## LlavaNextVideoForConditionalGeneration

 [[autodoc]] LlavaNextVideoForConditionalGeneration
--- a/docs/source/en/model_doc/llava_onevision.md
+++ b/docs/source/en/model_doc/llava_onevision.md
@ -303,7 +303,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
 ## LlavaOnevisionImageProcessor

 [[autodoc]] LlavaOnevisionImageProcessor
-    - preprocess

 ## LlavaOnevisionImageProcessorFast

@ -314,14 +313,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(

 [[autodoc]] LlavaOnevisionVideoProcessor

-## LlavaOnevisionVideoProcessor
-
-[[autodoc]] LlavaOnevisionVideoProcessor
-
-## LlavaOnevisionModel
-
-[[autodoc]] LlavaOnevisionModel
-
 ## LlavaOnevisionForConditionalGeneration

 [[autodoc]] LlavaOnevisionForConditionalGeneration
--- a/docs/source/en/model_doc/longformer.md
+++ b/docs/source/en/model_doc/longformer.md
@ -76,10 +76,10 @@ tokenizer.decode(predictions).split()
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers run --task fill-mask --model allenai/longformer-base-4096 --device 0
+echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
 ```

 </hfoption>
@ -147,42 +147,42 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t

 ## LongformerForMaskedLM

-[[autodoc]] LongformerForMaskedLM
+[[autodoc]] LongformerForMaskedLM 
    - forward

 ## LongformerForSequenceClassification

-[[autodoc]] LongformerForSequenceClassification
+[[autodoc]] LongformerForSequenceClassification 
    - forward

 ## LongformerForMultipleChoice

-[[autodoc]] LongformerForMultipleChoice
+[[autodoc]] LongformerForMultipleChoice 
    - forward

 ## LongformerForTokenClassification

-[[autodoc]] LongformerForTokenClassification
+[[autodoc]] LongformerForTokenClassification 
    - forward

 ## LongformerForQuestionAnswering

-[[autodoc]] LongformerForQuestionAnswering
+[[autodoc]] LongformerForQuestionAnswering 
    - forward

 ## TFLongformerModel

-[[autodoc]] TFLongformerModel
+[[autodoc]] TFLongformerModel    
    - call

 ## TFLongformerForMaskedLM

-[[autodoc]] TFLongformerForMaskedLM
+[[autodoc]] TFLongformerForMaskedLM 
    - call

 ## TFLongformerForQuestionAnswering

-[[autodoc]] TFLongformerForQuestionAnswering
+[[autodoc]] TFLongformerForQuestionAnswering 
    - call

 ## TFLongformerForSequenceClassification
@ -192,10 +192,10 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t

 ## TFLongformerForTokenClassification

-[[autodoc]] TFLongformerForTokenClassification
+[[autodoc]] TFLongformerForTokenClassification 
    - call

 ## TFLongformerForMultipleChoice

-[[autodoc]] TFLongformerForMultipleChoice
+[[autodoc]] TFLongformerForMultipleChoice 
    - call
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.

 # Mistral

-[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
+[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing 
 the scaling costs of large models with performance and efficient inference. This model uses sliding window attention (SWA) trained with a 8K context length and a fixed cache size to handle longer sequences more effectively. Grouped-query attention (GQA) speeds up inference and reduces memory requirements. Mistral also features a byte-fallback BPE tokenizer to improve token handling and efficiency by ensuring characters are never mapped to out-of-vocabulary tokens.

 You can find all the original Mistral checkpoints under the [Mistral AI_](https://huggingface.co/mistralai) organization.
@ -78,10 +78,10 @@ The example below demonstrates how to chat with [`Pipeline`] or the [`AutoModel`
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```python
-echo -e "My favorite condiment is" | transformers chat mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
+echo -e "My favorite condiment is" | transformers-cli chat --model_name_or_path mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
 ```

 </hfoption>
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -227,9 +227,6 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati

 [[autodoc]] Mistral3Config

-## Mistral3Model
-
-[[autodoc]] Mistral3Model

 ## Mistral3ForConditionalGeneration

--- a/docs/source/en/model_doc/mllama.md
+++ b/docs/source/en/model_doc/mllama.md
@ -130,10 +130,6 @@ print(processor.decode(output[0], skip_special_tokens=True))
 [[autodoc]] MllamaTextModel
    - forward

-## MllamaModel
-
-[[autodoc]] MllamaModel
-
 ## MllamaForCausalLM

 [[autodoc]] MllamaForCausalLM
--- a/docs/source/en/model_doc/mobilebert.md
+++ b/docs/source/en/model_doc/mobilebert.md
@ -76,10 +76,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "The capital of France is [MASK]." | transformers run --task fill-mask --model google/mobilebert-uncased --device 0
+echo -e "The capital of France is [MASK]." | transformers-cli run --task fill-mask --model google/mobilebert-uncased --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/modernbert.md
+++ b/docs/source/en/model_doc/modernbert.md
@ -79,10 +79,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model answerdotai/ModernBERT-base --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model answerdotai/ModernBERT-base --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/openai-gpt.md
+++ b/docs/source/en/model_doc/openai-gpt.md
@ -70,10 +70,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "The future of AI is" | transformers run --task text-generation --model openai-community/openai-gpt --device 0
+echo -e "The future of AI is" | transformers-cli run --task text-generation --model openai-community/openai-gpt --device 0

 ```
 </hfoption>
--- a/docs/source/en/model_doc/paligemma.md
+++ b/docs/source/en/model_doc/paligemma.md
@ -174,10 +174,6 @@ visualizer("<img> What is in this image?")

 [[autodoc]] PaliGemmaProcessor

-## PaliGemmaModel
-
-[[autodoc]] PaliGemmaModel
-
 ## PaliGemmaForConditionalGeneration

 [[autodoc]] PaliGemmaForConditionalGeneration
--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -65,10 +65,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers run --task text-classification --model microsoft/phi-1.5 --device 0
+echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers-cli run --task text-classification --model microsoft/phi-1.5 --device 0
 ```

 </hfoption>
@ -102,7 +102,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
    ```py
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM
-
+    
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-1",
@ -110,12 +110,12 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="sdpa")
-
+    
    input_ids = tokenizer('''def print_prime(n):
       """
       Print all primes between 1 and n
       """''', return_tensors="pt").to("cuda")
-
+    
    output = model.generate(**input_ids, cache_implementation="static")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
    ```
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -64,7 +64,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16, 
    device_map="auto",
    attn_implementation="sdpa"
 )
@ -86,10 +86,10 @@ generated_ids = model.generate(
    model_inputs.input_ids,
    cache_implementation="static",
    max_new_tokens=512,
-    do_sample=True,
-    temperature=0.7,
-    top_k=50,
-    top_p=0.95
+    do_sample=True, 
+    temperature=0.7, 
+    top_k=50,        
+    top_p=0.95       
 )
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
@ -100,11 +100,11 @@ print(response)
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -121,21 +121,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

 quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16, 
+    bnb_4bit_quant_type="nf4",             
+    bnb_4bit_use_double_quant=True,       
 )

-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") 
 model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2"
+    attn_implementation="flash_attention_2" 
 )

-inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda")
+inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") 
 outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```
--- a/docs/source/en/model_doc/qwen2_5_vl.md
+++ b/docs/source/en/model_doc/qwen2_5_vl.md
@ -118,7 +118,7 @@ The example below uses [torchao](../quantization/torchao) to only quantize the w

 ```python
 import torch
-from transformers import TorchAoConfig, Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from transformers import TorchAoConfig, Gemma3ForConditionalGeneration, AutoProcessor

 quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@ -240,10 +240,6 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

 [[autodoc]] Qwen2_5_VLProcessor

-## Qwen2_5_VLTextModel
-
-[[autodoc]] Qwen2_5_VLTextModel
-    - forward

 ## Qwen2_5_VLModel

--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -287,11 +287,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
 [[autodoc]] Qwen2VLImageProcessor
    - preprocess

-## Qwen2VLVideoProcessor
-
-[[autodoc]] Qwen2VLVideoProcessor
-    - preprocess
-
 ## Qwen2VLImageProcessorFast

 [[autodoc]] Qwen2VLImageProcessorFast
@ -301,11 +296,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(

 [[autodoc]] Qwen2VLProcessor

-## Qwen2VLTextModel
-
-[[autodoc]] Qwen2VLTextModel
-    - forward
-    
 ## Qwen2VLModel

 [[autodoc]] Qwen2VLModel
--- a/docs/source/en/model_doc/roberta.md
+++ b/docs/source/en/model_doc/roberta.md
@ -23,7 +23,6 @@ rendered properly in your Markdown viewer.
 ">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
-
 ## Overview

 The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
--- a/docs/source/en/model_doc/smolvlm.md
+++ b/docs/source/en/model_doc/smolvlm.md
@ -197,9 +197,6 @@ print(generated_texts[0])
 [[autodoc]] SmolVLMImageProcessor
    - preprocess

-## SmolVLMVideoProcessor
-[[autodoc]] SmolVLMVideoProcessor
-    - preprocess

 ## SmolVLMProcessor
 [[autodoc]] SmolVLMProcessor
--- a/docs/source/en/model_doc/swin2sr.md
+++ b/docs/source/en/model_doc/swin2sr.md
@ -50,11 +50,6 @@ A demo Space for image super-resolution with SwinSR can be found [here](https://
 [[autodoc]] Swin2SRImageProcessor
    - preprocess

-## Swin2SRImageProcessorFast
-
-[[autodoc]] Swin2SRImageProcessorFast
-    - preprocess
-
 ## Swin2SRConfig

 [[autodoc]] Swin2SRConfig
--- a/docs/source/en/model_doc/t5.md
+++ b/docs/source/en/model_doc/t5.md
@ -75,10 +75,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "translate English to French: The weather is nice today." | transformers run --task text2text-generation --model google-t5/t5-base --device 0
+echo -e "translate English to French: The weather is nice today." | transformers-cli run --task text2text-generation --model google-t5/t5-base --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@ -211,19 +211,10 @@ model = VideoLlavaForConditionalGeneration.from_pretrained(

 [[autodoc]] VideoLlavaImageProcessor

-
-## VideoLlavaVideoProcessor
-
-[[autodoc]] VideoLlavaVideoProcessor
-
 ## VideoLlavaProcessor

 [[autodoc]] VideoLlavaProcessor

-## VideoLlavaModel
-
-[[autodoc]] VideoLlavaModel
-
 ## VideoLlavaForConditionalGeneration

 [[autodoc]] VideoLlavaForConditionalGeneration
--- a/docs/source/en/model_doc/vilt.md
+++ b/docs/source/en/model_doc/vilt.md
@ -72,11 +72,6 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
 [[autodoc]] ViltImageProcessor
    - preprocess

-## ViltImageProcessorFast
-
-[[autodoc]] ViltImageProcessorFast
-    - preprocess
-
 ## ViltProcessor

 [[autodoc]] ViltProcessor
--- a/docs/source/en/model_doc/vipllava.md
+++ b/docs/source/en/model_doc/vipllava.md
@ -101,10 +101,6 @@ A chat between a curious human and an artificial intelligence assistant. The ass

 [[autodoc]] VipLlavaConfig

-## VipLlavaModel
-
-[[autodoc]] VipLlavaModel
-
 ## VipLlavaForConditionalGeneration

 [[autodoc]] VipLlavaForConditionalGeneration
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -44,7 +44,7 @@ Place all inputs on the same device as the model.
 from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
+tokenizer = AutoTokenizer("meta-llama/Llama-3.1-8B")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", quantization_config=quantization_config)

 prompt = "Hello, my llama is cute"
@ -196,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
 input_text = "Hello, my llama is cute"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

-with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION)::
    outputs = model.generate(**inputs)

 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--- a/docs/source/en/perf_train_gaudi.md
+++ b/docs/source/en/perf_train_gaudi.md
@ -1,34 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Intel Gaudi
-
-The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) overview.
-
-[`TrainingArguments`], [`Trainer`] and [`Pipeline`] detect and set the backend device to `hpu` if an Intel Gaudi device is available. No additional changes are required to enable training and inference on your device.
-
-Some modeling code in Transformers is not optimized for HPU lazy mode. If you encounter any errors, set the environment variable below to use eager mode:
-```
-PT_HPU_LAZY_MODE=0
-```
-
-In some cases, you'll also need to enable int64 support to avoid casting issues with long integers:
-```
-PT_ENABLE_INT64_SUPPORT=1
-```
-Refer to the [Gaudi docs](https://docs.habana.ai/en/latest/index.html) for more details.
-
-> [!TIP]
-> For training and inference with Gaudi-optimized model implementations, we recommend using [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index).
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -40,8 +40,6 @@ torchao supports the [quantization techniques](https://github.com/pytorch/ao/blo
 - A16W4 Int4 Weight Only Quantization
 - Autoquantization

-torchao also supports module level configuration by specifying a dictionary from fully qualified name of module and its corresponding quantization config. This allows skip quantizing certain layers and using different quantization config for different modules.
-

 Check the table below to see if your hardware is compatible.

@ -91,7 +89,7 @@ We'll show examples for recommended quantization methods based on hardwares, e.g
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig

 quant_config = Float8DynamicActivationFloat8WeightConfig()
 # or float8 weight only quantization
@ -151,7 +149,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig

 quant_config = Int8DynamicActivationInt8WeightConfig()
 # or int8 weight only quantization
@ -181,7 +179,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import GemliteUIntXWeightOnlyConfig, Int4WeightOnlyConfig
+from torchao.quantization import GemliteUIntXWeightOnlyConfig

 # For batch size N, we recommend gemlite, which may require autotuning
 # default is 4 bit, 8 bit is also supported by passing `bit_width=8`
@ -218,7 +216,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig

 quant_config = Int8DynamicActivationInt8WeightConfig()
 # quant_config = Int8WeightOnlyConfig()
@ -274,74 +272,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 </hfoption>
 </hfoptions>

-### Per Module Quantization
-#### 1. Skip quantization for certain layers
-With `AOPerModuleConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
-
-model_id = "meta-llama/Llama-3.1-8B-Instruct"
-
-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig
-config = Int4WeightOnlyConfig(group_size=128)
-
-# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = AOPerModuleConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
-quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
-print("quantized model:", quantized_model)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Manual Testing
-prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
-output_text = tokenizer.batch_decode(
-    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
-```
-
-#### 2. Quantizing different layers with different quantization configs
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
-
-model_id = "facebook/opt-125m"
-
-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
-
-weight_dtype = torch.int8
-granularity = PerAxis(0)
-mapping_type = MappingType.ASYMMETRIC
-embedding_config = IntxWeightOnlyConfig(
-    weight_dtype=weight_dtype,
-    granularity=granularity,
-    mapping_type=mapping_type,
-)
-linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = AOPerModuleConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
-# set `include_embedding` to True in order to include embedding in quantization
-# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
-quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-print("quantized model:", quantized_model)
-# make sure embedding is quantized
-print("embed_tokens weight:", quantized_model.model.decoder.embed_tokens.weight)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Manual Testing
-prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-generated_ids = quantized_model.generate(**inputs, max_new_tokens=128, cache_implementation="static")
-output_text = tokenizer.batch_decode(
-    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
-```
-
 ### Autoquant

 If you want to automatically choose a quantization type for quantizable layers (`nn.Linear`) you can use the [autoquant](https://pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant) API.
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -51,7 +51,7 @@ By default, vLLM serves the native implementation and if it doesn't exist, it fa
 ```shell
 vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
-    --model-impl transformers
+    --model-impl transformers \
 ```

 Add the `trust-remote-code` parameter to enable loading a remote code model.
@ -60,5 +60,5 @@ Add the `trust-remote-code` parameter to enable loading a remote code model.
 vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
-    --trust-remote-code
+    --trust-remote-code \
 ```
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@ -78,62 +78,32 @@ Crafting a good prompt alone, also known as zero-shot prompting, may not be enou

 This section covers a few prompting techniques.

-### Few-shot prompting
+### Few-shot

-Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you’re looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
+Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance.

-```python
+The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
+
+```py
 from transformers import pipeline
 import torch

 pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto")
 prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961.
 Date: 04/12/1961
-Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
+Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. 
 Date:"""

 outputs = pipeline(prompt, max_new_tokens=12, do_sample=True, top_k=10)
 for output in outputs:
    print(f"Result: {output['generated_text']}")
-# Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
-# Date: 04/12/1961
-# Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
-# Date: 09/28/1960
+Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
+Date: 04/12/1961
+Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. 
+Date: 09/28/1960
 ```

-The downside of few-shot prompting is that you need to create lengthier prompts which increases computation and latency. There is also a limit to prompt lengths. Finally, a model can learn unintended patterns from your examples, and it may not work well on complex reasoning tasks.
-
-To improve few-shot prompting for modern instruction-tuned LLMs, use a model's specific [chat template](../conversations). These models are trained on datasets with turn-based conversations between a "user" and "assistant". Structuring your prompt to align with this can improve performance.
-
-Structure your prompt as a turn-based conversation and use the [`apply_chat_template`] method to tokenize and format it.
-
-```python
-from transformers import pipeline
-import torch
-
-pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto")
-
-messages = [
-    {"role": "user", "content": "Text: The first human went into space and orbited the Earth on April 12, 1961."},
-    {"role": "assistant", "content": "Date: 04/12/1961"},
-    {"role": "user", "content": "Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon."}
-]
-
-prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-outputs = pipeline(prompt, max_new_tokens=12, do_sample=True, top_k=10)
-
-for output in outputs:
-    print(f"Result: {output['generated_text']}")
-```
-
-
-While the basic few-shot prompting approach embedded examples within a single text string, the chat template format offers the following benefits.
-
- The model may have a potentially improved understanding because it can better recognize the pattern and the expected roles of user input and assistant output.
- The model may more consistently output the desired output format because it is structured like its input during training.
-
-Always consult a specific instruction-tuned model's documentation to learn more about the format of their chat template so that you can structure your few-shot prompts accordingly.
+The downside of few-shot prompting is that you need to create lengthier prompts which increases computation and latency. There is also a limit to prompt lengths. Finally, a model can learn unintended patterns from your examples and it doesn't work well on complex reasoning tasks.

 ### Chain-of-thought

--- a/docs/source/en/torchscript.md
+++ b/docs/source/en/torchscript.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # TorchScript

-[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.
+[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may the most performant choice.

 Transformers can export a model to TorchScript by:

--- a/docs/source/en/video_processors.md
+++ b/docs/source/en/video_processors.md
@ -1,49 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-
-# Video Processor
-
-A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch. 
-
-The video processor extends the functionality of image processors by allowing the models to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
-
-Use [`~BaseVideoProcessor.from_pretrained`] to load a video processors configuration (image size, whether to normalize and rescale, etc.) from a video model on the Hugging Face [Hub](https://hf.co) or local directory. The configuration for each pretrained model should be saved in a [video_preprocessor_config.json] file but older models might have the config saved in [preprocessor_config.json](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf/blob/main/preprocessor_config.json) file. Note that the latter is less preferred and will be removed in the future.
-
-
-### Usage Example
-Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
-
-```python
-from transformers import AutoVideoProcessor
-
-processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
-```
-
-Currently, if using base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
-
-Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For even more speed improvement, we can compile the processor when using 'cuda' as device.
-
-```python
-import torch
-from transformers.video_utils import load_video
-from transformers import AutoVideoProcessor
-
-video = load_video("video.mp4")
-processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
-processor = torch.compile(processor)
-processed_video = processor(video, return_tensors="pt")
-```
--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@ -20,9 +20,9 @@ Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en in

 <Tip>

-Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers**) disponible en cualquier instalación de transformers >= 2.3.0.
+Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0.

-La siguiente documentación refleja el formato para el comando **transformers convert**.
+La siguiente documentación refleja el formato para el comando **transformers-cli convert**.

 </Tip>

@ -41,7 +41,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pr
 ```bash
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-transformers convert --model_type bert \
+transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -60,7 +60,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entr
 ```bash
 export ALBERT_BASE_DIR=/path/to/albert/albert_base

-transformers convert --model_type albert \
+transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -75,7 +75,7 @@ Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado
 ```bash
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-transformers convert --model_type gpt \
+transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
@ -89,7 +89,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entre
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights

-transformers convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
@ -104,7 +104,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-transformers convert --model_type xlnet \
+transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -118,7 +118,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
 ```bash
 export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-transformers convert --model_type xlm \
+transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
 [--config XML_CONFIG] \
@ -132,7 +132,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
 ```bash
 export T5=/path/to/t5/uncased_L-12_H-768_A-12

-transformers convert --model_type t5 \
+transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/it/add_new_model.md
+++ b/docs/source/it/add_new_model.md
@ -15,51 +15,51 @@ rendered properly in your Markdown viewer.

 # Come aggiungere un modello a 🤗 Transformers?

-Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche
-della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere
-modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo
-creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con
+Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche 
+della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere 
+modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo 
+creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con 
 questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare
 un modello in 🤗 Transformers.

 Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md)
-e contattarci.
+e contattarci. 

 Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗
-Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre,
+Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre, 
 sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di:

 - ottenere più comprensione delle best practices in open-source
- capire i principi di design di una della librerie NLP più popolari
+- capire i principi di design di una della librerie NLP più popolari 
 - capire come efficientemente testare complessi modelli NLP
- capire come integrare utilit Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito
+- capire come integrare utilit Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito 

-Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”.
+Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. 
 Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli
 già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed),
-per capire se richiamano il modello che vorreste aggiungere.
+per capire se richiamano il modello che vorreste aggiungere. 

 Per cominciare, vediamo una panoramica general della libreria Transformers.

 ## Panoramica generale su 🤗 Transformers

 Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi
-puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza,
+puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza, 
 tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente
-su larga scala, mantenendo i costi a un livello accettabile.
+su larga scala, mantenendo i costi a un livello accettabile.  

 Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia)
 Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli:

 - La composizione é generalmente favorita sulla sovra-astrazione
 - Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello
- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice
+- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice 
 di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze.


-La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità
-di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi
-un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno,
+La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità 
+di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi 
+un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, 
 cercheranno di capire e modificare il tuo modello.

 Tenendo questi principi in mente, immergiamoci nel design generale della libreria.
@ -67,25 +67,25 @@ Tenendo questi principi in mente, immergiamoci nel design generale della libreri
 ### Panoramica sui modelli

 Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione,
-[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers
+[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers  
 `BrandNewBert`.

 Diamo un'occhiata:

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png"/>

-Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo
-assoluto.  Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita
-da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] -  semplice no?
+Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo 
+assoluto.  Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita 
+da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] -  semplice no? 
 Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. Le funzionalità
 importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`]
-e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti
+e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti 
 funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script
-`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come
+`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come 
 `BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel`
-come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni
-nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre
-mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config`
+come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni 
+nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre 
+mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` 
 per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`:

 ```python
@ -93,35 +93,35 @@ model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
 model.config  # il modello ha accesso al suo config
 ```

-Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da
-[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti -
-il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando
-[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il
+Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da 
+[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - 
+il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando 
+[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il 
 modello che la configurazione siano salvati.


 ### Stile per il codice

-Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò
+Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò 
 ci sono alcuni fatti da considerare su come scrivere un codice :-)

-1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente
+1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente 
   da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
   per un ottimo esempio).
-2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le
-   variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio
+2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le 
+   variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio 
   che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop.
 3. Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico.
-4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché
-   chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points.
-5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile
+4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché 
+   chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. 
+5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile 
   piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice.

 ### Panoramica sui tokenizers

 Questa sezione sarà creata al piu presto :-(

-## Aggiungere un modello a 🤗 Transformers passo dopo passo
+## Aggiungere un modello a 🤗 Transformers passo dopo passo 

 Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello:

@ -141,11 +141,11 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge

 -  1. ☐ (Opzionale) Capire gli aspetti teorici del modello
 -  2. ☐ Preparare l'ambiente dev per transformers
-  3. ☐ Preparare l'ambiente debugging della repository originale
-  4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint
+-  3. ☐ Preparare l'ambiente debugging della repository originale 
+-  4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint 
 -  5. ☐ Aggiungere con successo lo scheletro del modello a Transformers
 -  6. ☐ Convertire i checkpoint original a Transformers checkpoint
-  7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale
+-  7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale 
 -  8. ☐ Finire i tests per il modello in Transformers
 -  9. ☐ Aggiungere con successo Tokenizer in Transformers
 -  10. ☐ Testare e provare gli integration tests da capo a fine
@ -156,22 +156,22 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge

 Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici.

-### 1. (Opzionale) Aspetti teorici di BrandNewBert
+### 1. (Opzionale) Aspetti teorici di BrandNewBert 

 Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia:

- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli
- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq?
- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART?
+- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli 
+- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? 
+- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? 
 - Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*?
- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART?
+- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? 

-Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :)
+Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) 


 ### 2. Prepare il tuo ambiente

-1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub
+1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub 

 2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota:

@ -190,7 +190,7 @@ source .env/bin/activate
 pip install -e ".[dev]"
 ```

-quindi torna alla directory principale:
+quindi torna alla directory principale: 

 ```bash
 cd ..
@ -205,7 +205,7 @@ cd ..
 5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale:

 ```bash
-git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
+git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git 
 cd brand_new_bert
 pip install -e .
 ```
@ -213,16 +213,16 @@ pip install -e .
 Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers.


-### 3.-4. Provare un pretrained checkpoint usando la repo originale
+### 3.-4. Provare un pretrained checkpoint usando la repo originale 

-Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**.
+Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. 

 Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti:

- Dove si trovano i pretrained weights?
- Come caricare i pretrained weights nel modello corrispondente?
- Come girare un tokenizer independentemente dal modello?
- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta
+- Dove si trovano i pretrained weights? 
+- Come caricare i pretrained weights nel modello corrispondente? 
+- Come girare un tokenizer independentemente dal modello? 
+- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta 
 - Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...?
 - Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm?

@ -230,14 +230,14 @@ Riuscire a far girare il modello pretrained originale dalla repository ufficiale

 A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in  🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU.

-In generale ci sono due possibili ambienti di debug per il testare il modello originale:
+In generale ci sono due possibili ambienti di debug per il testare il modello originale: 

 - [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
- Scripts locali in Python
+- Scripts locali in Python 

 Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera.

-Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`.
+Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. 

 Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. Un esempio per uno script simile, in pseudocodice é:

@ -249,42 +249,42 @@ original_output = model.predict(input_ids)

 Per quanto riguarda la strategia di debugging, si può scegliere tra:

- Decomporre il modello originario in piccole componenenti e testare ognuna di esse
- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi,
+- Decomporre il modello originario in piccole componenenti e testare ognuna di esse 
+- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, 
 e usare dei print statement o breakpoints intermedi per verificare

-Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu
+Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu 
 avvantaggiosa di un'altra, ma tutto dipende dall'code-base originario.

-Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base
-originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere.
-Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito:
+Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base 
+originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. 
+Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: 

 - negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare
 automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1
 - avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro
- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore
-comprensione del modello stesso
+- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore 
+comprensione del modello stesso 
 - verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro
 nell'implementazione, così da continuare la modifica del codice senza interruzione

-Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed)
+Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) 
 per il modello ELECTRA

-Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite
-compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti.
-Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria
-é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare
+Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite 
+compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. 
+Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria 
+é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare 
 affidamento ai print statements.

-In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal
-primo layer al layer finale.
+In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal 
+primo layer al layer finale. 
 É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine:

 1. Recuperare gli IDs di input dati al modello
 2. Recuperare i word embeddings
-3. Recuperare l'input del primo Transformer layer
-4. Recuperare l'output del primo Transformer layer
+3. Recuperare l'input del primo Transformer layer 
+4. Recuperare l'output del primo Transformer layer 
 5. Recuperare l'output dei seguenti `n - 1` Transformer layers
 6. Recuperare l'output dell'intero BrandNewBert Model

@ -303,36 +303,36 @@ Gli output dei seguenti layer di solito dovrebbero essere degli array di float m
 [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
 ```

-Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo
-significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione
-di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente
-diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque,
-é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del
+Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo 
+significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione 
+di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente 
+diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, 
+é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del 
 modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente
 possibile:

 - Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch?
-Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il
-modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale
-é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print)
-per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit**
-quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196).
- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro
-ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi.
+Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il 
+modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale 
+é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) 
+per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** 
+quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). 
+- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro 
+ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. 
 Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere
-a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers
+a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers 
 con il vostro modello
- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare
-la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata
-`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici
+- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare 
+la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata 
+`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici 
 volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`.
- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare
-come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il
-debug da questo punto. Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare
-gli input al modello, anziche delle stringhe in input.
- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per
-via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche
-i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione
+- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare 
+come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il 
+debug da questo punto. Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare 
+gli input al modello, anziche delle stringhe in input. 
+- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per 
+via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche 
+i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione 
 sono nello stesso framework.

 La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*.
@ -343,7 +343,7 @@ La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare
 Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers:


-```bash
+```bash 
 cd transformers
 ```

@ -355,52 +355,52 @@ Se questo non é il caso, cominciamo con il generare un nuovo modello. Ti consig
 un modello esistente:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 Ti verrà richiesto con un questionario di compilare le informazioni di base del tuo modello.

 **Aprire una Pull Request in main huggingface/transformers repo**

-Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)",
+Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", 
 *per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell'
 integrare il modello in 🤗 Transformers.

 Questi sarebbero gli step generali da seguire:

-1. Creare un branch dal main branch con un nome descrittivo
+1. Creare un branch dal main branch con un nome descrittivo 

-```bash
-git checkout -b add_brand_new_bert
+```bash 
+git checkout -b add_brand_new_bert 
 ```

-2. Commit del codice automaticamente generato
+2. Commit del codice automaticamente generato 

-```bash
-git add .
-git commit
+```bash 
+git add . 
+git commit 
 ```

 3. Fare fetch e rebase del main esistente

-```bash
-git fetch upstream
-git rebase upstream/main
+```bash 
+git fetch upstream 
+git rebase upstream/main 
 ```

-4. Push dei cambiamenti al proprio account:
+4. Push dei cambiamenti al proprio account: 

 ```bash
 git push -u origin a-descriptive-name-for-my-changes
 ```

-5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request".
-Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team
-Hugging Face verrà notificato anche per i futuri cambiamenti.
+5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". 
+Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team 
+Hugging Face verrà notificato anche per i futuri cambiamenti. 

 6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR

-Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre,
+Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, 
 ricordatevi di tenere aggiornato il vostro lavoro con il main esistente:

 ```bash
@ -408,39 +408,39 @@ git fetch upstream
 git merge upstream/main
 ```

-In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR
-e discusse/risolte nella PR stessa. In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit
-di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento
-nella domanda, cosicche il team potra facilmente capire il problema o la domanda.
+In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR 
+e discusse/risolte nella PR stessa. In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit 
+di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento 
+nella domanda, cosicche il team potra facilmente capire il problema o la domanda. 

-Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea
-dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema
+Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea 
+dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema 
 é stato risolto, cliccate sul bottone "Resolve".

-In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più
-domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi
+In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più 
+domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi 
 di chiedere al team Hugging Face direttamente su slack o email.


 **5. Adattare i codici per brand_new_bert**

-Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in
+Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in  
 `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e
 `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.

-Ora potete finalmente cominciare il codice :). Il codice generato in
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un
-modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo
-agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare
-questi cambi  spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via...
-Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere
-un'idea migliore su come implementare il modello.
+Ora potete finalmente cominciare il codice :). Il codice generato in 
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un 
+modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo 
+agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare 
+questi cambi  spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... 
+Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere 
+un'idea migliore su come implementare il modello. 

-**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un
-codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`
-fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza
-del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente
-instanza:
+**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un 
+codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` 
+fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza 
+del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente 
+instanza: 

 ```python
 from transformers import BrandNewBertModel, BrandNewBertConfig
@ -448,23 +448,23 @@ from transformers import BrandNewBertModel, BrandNewBertConfig
 model = BrandNewBertModel(BrandNewBertConfig())
 ```

-Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce
+Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce 
 che `init()` di tutte le componenti funzioni correttamente.


 **6. Scrivere uno script di conversione**

-Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella
-repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere
+Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella 
+repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere 
 lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne
-uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso.
+uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. 
 Non esistate a chiedre al team di Hugging Face a riguardo.

 - Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
 - Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)

-Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch,
-il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch,
+Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, 
+il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, 
 chiamato `SimpleModel`:

 ```python
@ -497,7 +497,7 @@ SimpleModel(
 )
 ```

-Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno
+Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno 
 specifico layer possono essere visualizzati:


@ -530,7 +530,7 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
          0.2220,  0.2358]]).
 ```

-Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente
+Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente 
 layer nel checkpoint. *Per esempio*

 ```python
@ -544,8 +544,8 @@ model_pointer = getattr(model, "dense")
 model_pointer.weight.data = torch.from_numpy(pretrained_weight)
 ```

-Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint
-siano esattamente gli stessi e uguali in **dimensione/shape e nome**. Per fare questo, é **necessario** aggiungere un `assert`
+Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint 
+siano esattamente gli stessi e uguali in **dimensione/shape e nome**. Per fare questo, é **necessario** aggiungere un `assert` 
 per la dimensione/shape e nome:

 ```python
@ -560,19 +560,19 @@ Inoltre, dovrete fare il print sia dei nomi che dei weights per essere sicuri ch
 logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
 ```

-Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di
+Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di 
 🤗 Transformers.

-Una dimensione sbagliata può essere dovuta ad un errore nei parameteri in `BrandNewBertConfig()`. Tuttavia, può essere anche
-che l'implementazione del layer in PyTorch richieda di fare una transposizione della matrice dei weights.
+Una dimensione sbagliata può essere dovuta ad un errore nei parameteri in `BrandNewBertConfig()`. Tuttavia, può essere anche 
+che l'implementazione del layer in PyTorch richieda di fare una transposizione della matrice dei weights. 

-Infine, controllate **tutti** che tutti i weights inizializzati e fate print di tutti i weights del checkpoint che non sono stati
-usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano
-errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers,
-o un bug in `init()`.
+Infine, controllate **tutti** che tutti i weights inizializzati e fate print di tutti i weights del checkpoint che non sono stati 
+usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano 
+errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers, 
+o un bug in `init()`. 

-Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che
-il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta
+Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che 
+il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta 
 `/path/to/converted/checkpoint/folder` che contenga sia
 `pytorch_model.bin` che `config.json`:

@ -583,9 +583,9 @@ model.save_pretrained("/path/to/converted/checkpoint/folder")

 **7. Implementare il forward pass**

-Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass
+Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass 
 sia correttamente implementato. [Qui](#3-4-provare-un-pretrained-checkpoint-usando-la-repo-originale), avete give creato e provato
-uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo
+uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo 
 usando l'implementazione in 🤗 Transformers anziché l'originale. Piu o meno lo script dovrebbe essere:

 ```python
@ -594,27 +594,27 @@ input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
 output = model(input_ids).last_hidden_states
 ```

-Di solito l'output da 🤗 Transformers non é uguale uguale all'output originario, sopratto la prima volta. Non vi abbattete -
-é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass.
-Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`.
+Di solito l'output da 🤗 Transformers non é uguale uguale all'output originario, sopratto la prima volta. Non vi abbattete - 
+é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass. 
+Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`. 
 Non esistate a chiedere al team Hugging Face!

-Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output
-siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione
-originaria. Poi, controllate che i valori in output siano identici. Questa é sicuramente la parte più difficile, qui una serie
+Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output 
+siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione 
+originaria. Poi, controllate che i valori in output siano identici. Questa é sicuramente la parte più difficile, qui una serie 
 di errori comuni quando gli output non sono uguali:

- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione
- La matrice del word embedding non é stata ripareggiata
- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset
- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che
+- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione 
+- La matrice del word embedding non é stata ripareggiata 
+- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset 
+- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che 
 il dropout non sia stato attivato nel forward pass, * per esempio * passate *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

-La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers
-fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le
-implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra
-i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino
-all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto.
+La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers 
+fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le 
+implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra 
+i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino 
+all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto. 

 Una volta che lo stesso output é stato ragguingi, verificate gli output con `torch.allclose(original_output, output, atol=1e-3)`.
 A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊.
@ -622,9 +622,9 @@ A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una p

 **8. Aggiungere i test necessari per il modello**

-A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia
+A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia 
 del tutto ok con il design richiesto. Per essere sicuri che l'implementazione sia consona e compatibile con 🤗 Transformers é
-necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello,
+necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello, 
 di solito nella folder `tests/test_modeling_brand_new_bert.py`. Provate questo per verificare l'ok nei test piu comuni:

 ```bash
@ -636,8 +636,8 @@ Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia
 - a) La community puo capire in maniera semplice il vostro lavoro controllando tests specifici del modello *brand_new_bert*,
 - b) Implementazioni future del vostro modello non rompano alcune feature importante del modello.

-Per prima cosa agguingete dei test d'integrazione. Questi sono essenziali perche fanno la stessa funzione degli scripts di
-debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`,
+Per prima cosa agguingete dei test d'integrazione. Questi sono essenziali perche fanno la stessa funzione degli scripts di 
+debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`, 
 voi dovrete solo completarlo. Una volta che questi tests sono OK, provate:

 ```bash
@ -650,7 +650,7 @@ Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1`

 </Tip>

-Di seguito, tutte le features che sono utili e necessarire per *brand_new_bert* devono essere testate in test separati,
+Di seguito, tutte le features che sono utili e necessarire per *brand_new_bert* devono essere testate in test separati, 
 contenuti in `BrandNewBertModelTester`/ `BrandNewBertModelTest`. spesso la gente si scorda questi test, ma ricordate che sono utili per:


@ -664,7 +664,7 @@ A questo punto avremo bisogno un tokenizer per *brand_new_bert*. Di solito il to

 É importante che troviate il file con il tokenizer originale e che lo carichiate in 🤗 Transformers.

-Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input
+Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input 
 una stringa e ritorni gli `input_ids`. Piu o meno questo potrebbe essere il codice:

 ```python
@ -673,8 +673,8 @@ model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
 input_ids = model.tokenize(input_str)
 ```

-Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer.
-A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`.
+Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer. 
+A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`. 
 A quel punto uno script analogo é necessario in 🤗 Transformers:

 ```python
@ -687,7 +687,7 @@ tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
 input_ids = tokenizer(input_str).input_ids
 ```

-Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer.
+Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer. 

 Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-coded test d'integrazione.

@ -696,22 +696,22 @@ Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-c

 Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformer.
 Questi test devono mostrare che un significante campione text-to-text funzioni come ci si aspetta nell'implementazione di  🤗 Transformers.
-*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via.
-Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti.
-Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU.
+*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via. 
+Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti. 
+Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU. 
 Puo succedere che ci si scordi un `.to(self.device)` ad esempio. Se non avete accesso a GPU, il team Hugging Face puo provvedere
-a testare questo aspetto per voi.
+a testare questo aspetto per voi. 

 **11. Aggiungere una Docstring**

-Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già
-un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà
-per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto
-utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face
-riguardo alle docstirng.
+Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già 
+un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà 
+per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto 
+utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face 
+riguardo alle docstirng. 

-Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`.
-Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. Abbiamo una guida dettagliata per
+Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`. 
+Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. Abbiamo una guida dettagliata per 
 scrivere la documentazione e docstring.


@ -729,8 +729,8 @@ E che il codice passi i quality check:
 make quality
 ```

-A volte capita che manchino delle informazioninella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra.
-Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi.
+A volte capita che manchino delle informazioninella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra. 
+Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi. 

 Per ultimo, fare del refactoring del codice una volta che é stato creato.

@ -738,10 +738,10 @@ Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎

 **12. Caricare il modello sul model hub**

-In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una
-model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per
-avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok
-per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*.
+In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una 
+model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per 
+avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok 
+per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. 
 Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, é una maniera rapida e indolore per caricare il vostro checkpoint sull'hub:

 ```python
@ -754,27 +754,27 @@ brand_new_bert.push_to_hub(
 )
 ```

-Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero
-suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned.
+Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero 
+suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned. 
 O che su che genere di task il modello lavoro? E anche buona pratica includere del codice su come usare il modello correttamente.


 **13. (Opzionale) Aggiungere un notebook**

-É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o
+É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o 
 fine-tuned su specifiche task. Non é una cosa obbligatoria da avere nella vostra PR, ma é molto utile per la community.

 **14. Sottomettere la PR**

-L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato,
+L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato, 
 ma é ok prendere un po' di tempo per pulire la descirzione e commenti nel codice.


 ### Condividete il vostro lavoro!!

-É ora tempo di prendere un po' di credito dalla communità per il vostro lavoro! Caricare e implementare un nuovo modello
-é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara
-sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro
-traguardo con l'intera community :)
+É ora tempo di prendere un po' di credito dalla communità per il vostro lavoro! Caricare e implementare un nuovo modello 
+é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara 
+sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro 
+traguardo con l'intera community :) 

 ** Avete create un altro modello che é super facile da usare per tutti quanti nella community! 🤯**
--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@ -18,10 +18,10 @@ in modelli che possono essere caricati utilizzando i metodi `from_pretrained` de

 <Tip>

-A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers**), disponibile in ogni installazione
+A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione
 di transformers >=2.3.0.

-La seguente documentazione riflette il formato dei comandi di **transformers convert**.
+La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.

 </Tip>

@ -49,7 +49,7 @@ Questo è un esempio del processo di conversione per un modello `BERT-Base Uncas

 ```bash
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-transformers convert --model_type bert \
+transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -70,7 +70,7 @@ Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-

 ```bash
 export ALBERT_BASE_DIR=/path/to/albert/albert_base
-transformers convert --model_type albert \
+transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -84,7 +84,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenat
 sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)):
 ```bash
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-transformers convert --model_type gpt \
+transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
@ -97,7 +97,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allen

 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-transformers convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
@ -111,7 +111,7 @@ Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
 ```bash
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-transformers convert --model_type xlnet \
+transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -124,7 +124,7 @@ Ecco un esempio del processo di conversione di un modello XLM pre-allenato:

 ```bash
 export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-transformers convert --model_type xlm \
+transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
 [--config XML_CONFIG] \
@ -137,7 +137,7 @@ Ecco un esempio del processo di conversione di un modello T5 pre-allenato:

 ```bash
 export T5=/path/to/t5/uncased_L-12_H-768_A-12
-transformers convert --model_type t5 \
+transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/it/perf_train_cpu.md
+++ b/docs/source/it/perf_train_cpu.md
@ -19,7 +19,7 @@ Questa guida si concentra su come addestrare in maniera efficiente grandi modell

 ## Mixed precision con IPEX

-IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.
+IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.

 Il tipo di dati a bassa precisione BFloat16 è stato supportato in modo nativo su 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) con AVX512 e sarà supportata dalla prossima generazione di Intel® Xeon® Scalable Processors con Intel® Advanced Matrix Extensions (Intel® AMX) instruction set con prestazioni ulteriormente migliorate. L'Auto Mixed Precision per il backende della CPU è stato abilitato da PyTorch-1.10. allo stesso tempo, il supporto di Auto Mixed Precision con BFloat16 per CPU e l'ottimizzazione degli operatori BFloat16 è stata abilitata in modo massiccio in Intel® Extension per PyTorch, and parzialmente aggiornato al branch master di PyTorch. Gli utenti possono ottenere prestazioni migliori ed users experience con IPEX Auto Mixed Precision..

--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@ -312,7 +312,7 @@ cd transformers
 既存のモデル:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 モデルの基本情報を入力するためのアンケートが表示されます。
@ -517,7 +517,7 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,

 スクリプト内の変換スクリプトでは、ランダムに初期化された重みを、対応するチェックポイント内の正確な重みで埋める必要があります。例えば、以下のように翻訳します：

-
+ 
 ```python
 # retrieve matching layer weights, e.g. by
 # recursive algorithm
@ -747,3 +747,5 @@ brand_new_bert.push_to_hub("brand_new_bert")
 さあ、コミュニティからあなたの作業に対する評価を得る時が来ました！モデルの追加を完了することは、TransformersおよびNLPコミュニティにとって重要な貢献です。あなたのコードとポートされた事前学習済みモデルは、何百人、何千人という開発者や研究者によって確実に使用されるでしょう。あなたの仕事に誇りを持ち、コミュニティとあなたの成果を共有しましょう。

 **あなたはコミュニティの誰でも簡単にアクセスできる別のモデルを作成しました！ 🤯**
+
+
--- a/docs/source/ja/model_doc/beit.md
+++ b/docs/source/ja/model_doc/beit.md
@ -107,12 +107,6 @@ BEiT の使用を開始するのに役立つ公式 Hugging Face およびコミ
    - preprocess
    - post_process_semantic_segmentation

-## BeitImageProcessorFast
-
-[[autodoc]] BeitImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation
-
 ## BeitModel

 [[autodoc]] BeitModel
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -97,8 +97,6 @@
    sections:
    - local: generation_strategies
      title: 텍스트 생성 전략 사용자 정의
-    - local: serving
-      title: 모델 서빙하기
    title: 생성
  - isExpanded: false
    sections:
@ -125,8 +123,6 @@
    title: Amazon SageMaker에서 학습 실행하기
  - local: serialization
    title: ONNX로 내보내기
-  - local: gpu_selection
-    title: GPU 선택하기
  - local: tflite
    title: TFLite로 내보내기
  - local: torchscript
@ -358,8 +354,8 @@
        title: (번역중) DistilBERT
      - local: in_translation
        title: (번역중) DPR
-      - local: model_doc/electra
-        title: ELECTRA
+      - local: in_translation
+        title: (번역중) ELECTRA
      - local: model_doc/encoder-decoder
        title: 인코더 디코더 모델
      - local: in_translation
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@ -73,7 +73,7 @@ model.config  # model has access to its config
 5. 함수 시그니처에는 타입 주석을 사용해야 합니다. 그 외에는 타입 주석보다 변수 이름이 훨씬 읽기 쉽고 이해하기 쉽습니다.

 ### 토크나이저 개요 [[overview-of-tokenizers]]
-
+ 
 아직 준비되지 않았습니다 :-( 이 섹션은 곧 추가될 예정입니다!

 ## 🤗 Transformers에 모델 추가하는 단계별 방법  [[stepbystep-recipe-to-add-a-model-to-transformers]]
@ -272,7 +272,7 @@ cd transformers
 기존 모델:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 모델의 기본 정보를 입력하는 설문지가 표시됩니다.
--- a/Show More
+++ b/Show More