use config attr

2025-10-21 17:48:57 +08:00 · 2025-04-28 18:26:34 +02:00
692 changed files with 53440 additions and 53490 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -7,18 +7,6 @@ parameters:
    nightly:
        type: boolean
        default: false
-    GHA_Actor:
-        type: string
-        default: ""
-    GHA_Action:
-        type: string
-        default: ""
-    GHA_Event:
-        type: string
-        default: ""
-    GHA_Meta:
-        type: string
-        default: ""

 jobs:
    # Ensure running with CircleCI/huggingface
@ -43,12 +31,8 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: git branch
-            - run: git log -n 1
-            - run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
-            - run: echo $(cat pr_number.txt)
-            - run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
-            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
+            - run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
+            - run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
            - run: cat github.txt
            - run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
            - run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -28,8 +28,6 @@ COMMON_ENV_VARIABLES = {
    "TRANSFORMERS_IS_CI": True,
    "PYTEST_TIMEOUT": 120,
    "RUN_PIPELINE_TESTS": False,
-    # will be adjust in `CircleCIJob.to_dict`.
-    "RUN_FLAKY": True,
 }
 # Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
 COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "vvv": None, "rsfE":None}
@ -128,8 +126,6 @@ class CircleCIJob:

    def to_dict(self):
        env = COMMON_ENV_VARIABLES.copy()
-        # Do not run tests decorated by @is_flaky on pull requests
-        env['RUN_FLAKY'] = os.environ.get("CIRCLE_PULL_REQUEST", "") == ""
        env.update(self.additional_env)

        job = {
@ -397,12 +393,7 @@ def create_circleci_config(folder=None):
        "parameters": {
            # Only used to accept the parameters from the trigger
            "nightly": {"type": "boolean", "default": False},
-            # Only used to accept the parameters from GitHub Actions trigger
-            "GHA_Actor": {"type": "string", "default": ""},
-            "GHA_Action": {"type": "string", "default": ""},
-            "GHA_Event": {"type": "string", "default": ""},
-            "GHA_Meta": {"type": "string", "default": ""},
-            "tests_to_run": {"type": "string", "default": ""},
+            "tests_to_run": {"type": "string", "default": ''},
            **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
            **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
        },
--- a/.github/ISSUE_TEMPLATE/bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yml
@ -16,7 +16,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      placeholder: transformers version, platform, python version, ...
    validations:
      required: true
--- a/.github/ISSUE_TEMPLATE/migration.yml
+++ b/.github/ISSUE_TEMPLATE/migration.yml
@ -6,7 +6,7 @@ body:
    id: system-info
    attributes:
      label: System Info
-      description: Please share your system info with us. You can run the command `transformers env` and copy-paste its output below.
+      description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below.
      render: shell
      placeholder: transformers version, platform, python version, ...
    validations:
--- a/.github/workflows/add-model-like.yml
+++ b/.github/workflows/add-model-like.yml
@ -54,7 +54,7 @@ jobs:
      - name: Create model files
        run: |
          . ~/venv/bin/activate
-          transformers add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
+          transformers-cli add-new-model-like --config_file tests/fixtures/add_distilbert_like_config.json --path_to_repo .
          make style
          make fix-copies

--- a/.github/workflows/check_failed_model_tests.yml
+++ b/.github/workflows/check_failed_model_tests.yml
@ -29,7 +29,7 @@ jobs:
  run_models_gpu:
    name: " "
    runs-on:
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctest_job.yml
+++ b/.github/workflows/doctest_job.yml
@ -28,7 +28,7 @@ jobs:
      matrix:
        split_keys: ${{ fromJson(inputs.split_keys) }}
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@ -15,7 +15,7 @@ jobs:
  setup:
    name: Setup
    runs-on: 
-      group: aws-g4dn-4xlarge-cache
+      group: aws-g4dn-2xlarge-cache
    container:
      image: huggingface/transformers-all-latest-gpu
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
--- a/.github/workflows/model_jobs.yml
+++ b/.github/workflows/model_jobs.yml
@ -107,7 +107,7 @@ jobs:
        run: |
          echo "${{ inputs.machine_type }}"

-          if [ "${{ inputs.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/new_model_pr_merged_notification.yml
+++ b/.github/workflows/new_model_pr_merged_notification.yml
@ -59,7 +59,7 @@ jobs:
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
-                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
+                    "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
                  }
                }
              ]
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -145,7 +145,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          MODELS: ${{ needs.get-tests.outputs.models }}
-          BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
+          BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
        run: |
          gh api \
            --method POST \
@ -185,7 +185,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.models) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
       group: '${{ matrix.machine_type }}'
    container:
@ -239,7 +239,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -292,7 +292,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -338,7 +338,7 @@ jobs:
        shell: bash
        run: |
          echo "${{ matrix.machine_type }}"
-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -49,7 +49,7 @@ jobs:
    name: Setup
    strategy:
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -107,7 +107,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }}
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -125,7 +125,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
        slice_id: [0, 1]
    uses: ./.github/workflows/model_jobs.yml
    with:
@ -143,7 +143,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -177,7 +177,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -211,7 +211,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -246,7 +246,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -280,7 +280,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -314,7 +314,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -349,7 +349,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -411,7 +411,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
@ -448,7 +448,7 @@ jobs:
      fail-fast: false
      matrix:
        folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }}
-        machine_type: [aws-g4dn-4xlarge-cache, aws-g4dn-12xlarge-cache]
+        machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
    runs-on:
      group: '${{ matrix.machine_type }}'
    container:
@ -491,7 +491,7 @@ jobs:
        run: |
          echo "${{ matrix.machine_type }}"

-          if [ "${{ matrix.machine_type }}" = "aws-g4dn-4xlarge-cache" ]; then
+          if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
            machine_type=single-gpu
          elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
            machine_type=multi-gpu
--- a/.github/workflows/ssh-runner.yml
+++ b/.github/workflows/ssh-runner.yml
@ -35,7 +35,7 @@ jobs:
        shell: bash
        run: |
          if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
-            echo "RUNNER=aws-g4dn-4xlarge-cache" >> $GITHUB_ENV
+            echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then
            echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV
          elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then
--- a/.github/workflows/trigger_circleci.yml
+++ b/.github/workflows/trigger_circleci.yml
@ -1,16 +0,0 @@
-name: Trigger CircleCI
-
-on:
-  pull_request_target:
-    types: [ready_for_review]
-
-jobs:
-  trigger-circleci:
-    runs-on: ubuntu-22.04
-    steps:
-      - name: trigger CircleCI pipeline via GitHub Actions
-        uses: CircleCI-Public/trigger-circleci-pipeline-action@v1.0.5
-        with:
-          GHA_Meta: "Trigger via GitHub Actions"
-        env:
-          CCI_TOKEN: ${{ secrets.CIRCLECI_PAT }}
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -78,7 +78,7 @@ Once you've confirmed the bug hasn't already been reported, please include the f
 To get the OS and software versions automatically, run the following command:

 ```bash
-transformers env
+transformers-cli env
 ```

 You can also run the same command from the root of the repository:
--- a/2
+++ b/2
@ -79,7 +79,7 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
-	python utils/check_modular_conversion.py --fix_and_overwrite
+	python utils/check_modular_conversion.py  --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite
 	python utils/check_doctest_list.py --fix_and_overwrite
 	python utils/check_docstrings.py --fix_and_overwrite
--- a/README.md
+++ b/README.md
@ -78,6 +78,7 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
 # venv
 python -m venv .my-env
 source .my-env/bin/activate
+
 # uv
 uv venv .my-env
 source .my-env/bin/activate
@ -87,10 +88,10 @@ Install Transformers in your virtual environment.

 ```py
 # pip
-pip install "transformers[torch]"
+pip install transformers

 # uv
-uv pip install "transformers[torch]"
+uv pip install transformers
 ```

 Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -98,7 +99,7 @@ Install Transformers from source if you want the latest changes in the library o
 ```shell
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .[torch]
+pip install .
 ```

 ## Quickstart
@ -120,7 +121,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
 > [!TIP]
 > You can also chat with a model directly from the command line.
 > ```shell
-> transformers chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
+> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
 > ```

 ```py
--- a/docs/source/de/add_new_model.md
+++ b/docs/source/de/add_new_model.md
@ -95,7 +95,7 @@ wie der Code geschrieben werden sollte :-)
 1. Der Vorwärtsdurchlauf Ihres Modells sollte vollständig in die Modellierungsdatei geschrieben werden und dabei völlig unabhängig von anderen
   Modellen in der Bibliothek. Wenn Sie einen Block aus einem anderen Modell wiederverwenden möchten, kopieren Sie den Code und fügen ihn mit einem
   `# Kopiert von` ein (siehe [hier](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
-   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from).
+   für ein gutes Beispiel und [hier](pr_checks#check-copies) für weitere Dokumentation zu Copied from). 
 2. Der Code sollte vollständig verständlich sein, auch für einen Nicht-Muttersprachler. Das heißt, Sie sollten
   beschreibende Variablennamen wählen und Abkürzungen vermeiden. Ein Beispiel: `activation` ist `act` vorzuziehen.
   Von Variablennamen mit nur einem Buchstaben wird dringend abgeraten, es sei denn, es handelt sich um einen Index in einer for-Schleife.
@ -402,7 +402,7 @@ Andernfalls beginnen wir mit der Erstellung eines neuen Modells. Wir empfehlen d
 ein bestehendes Modell:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 Sie werden mit einem Fragebogen aufgefordert, die grundlegenden Informationen Ihres Modells einzugeben.
--- a/docs/source/de/contributing.md
+++ b/docs/source/de/contributing.md
@ -63,7 +63,7 @@ Wenn Sie sich vergewissert haben, dass der Fehler noch nicht gemeldet wurde, geb
 Um das Betriebssystem und die Softwareversionen automatisch auszugeben, führen Sie den folgenden Befehl aus:

 ```bash
-transformers env
+transformers-cli env
 ```

 Sie können denselben Befehl auch im Hauptverzeichnis des Repositorys ausführen:
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -21,8 +21,6 @@
      title: Adding a new model to Transformers
    - local: modular_transformers
      title: Modular Transformers
-    - local: auto_docstring
-      title: Document your models
    - local: task_summary
      title: What 🤗 Transformers can do
    - local: tasks_explained
@ -151,8 +149,6 @@
      title: TPU
    - local: perf_train_special
      title: Apple Silicon
-    - local: perf_train_gaudi
-      title: Intel Gaudi
    - local: perf_hardware
      title: Build your own machine
    title: Hardware
@ -497,16 +493,12 @@
        title: Granite
      - local: model_doc/granitemoe
        title: GraniteMoe
-      - local: model_doc/granitemoehybrid
-        title: GraniteMoeHybrid
      - local: model_doc/granitemoeshared
        title: GraniteMoeShared
      - local: model_doc/helium
        title: Helium
      - local: model_doc/herbert
        title: HerBERT
-      - local: model_doc/hgnet_v2
-        title: HGNet-V2
      - local: model_doc/ibert
        title: I-BERT
      - local: model_doc/jamba
@ -699,8 +691,6 @@
        title: ConvNeXTV2
      - local: model_doc/cvt
        title: CvT
-      - local: model_doc/d_fine
-        title: D-FINE
      - local: model_doc/dab-detr
        title: DAB-DETR
      - local: model_doc/deformable_detr
@ -827,8 +817,6 @@
        title: Bark
      - local: model_doc/clap
        title: CLAP
-      - local: model_doc/csm
-        title: CSM
      - local: model_doc/dac
        title: dac
      - local: model_doc/encodec
@ -1029,8 +1017,6 @@
        title: Qwen2VL
      - local: model_doc/sam
        title: Segment Anything
-      - local: model_doc/sam_hq
-        title: Segment Anything High Quality
      - local: model_doc/shieldgemma2
        title: ShieldGemma2
      - local: model_doc/siglip
--- a/docs/source/en/add_new_model.md
+++ b/docs/source/en/add_new_model.md
@ -161,7 +161,7 @@ The downside is that if you aren't used to them, it may take some time to get us
 Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 ## Create a pull request
@ -292,7 +292,7 @@ Once you're able to run the original checkpoint, you're ready to start adapting

 ## Adapt the model code

-The `transformers add-new-model-like` command should have generated a model and configuration file.
+The `transformers-cli add-new-model-like` command should have generated a model and configuration file.

 - `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
 - `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
@ -551,10 +551,10 @@ While this example doesn't include an image processor, you may need to implement

 If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.

-Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
+Run the following command (only if you haven't already created the fast image processor with the `transformers-cli add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.

 ```bash
-transformers add-fast-image-processor --model-name your_model_name
+transformers-cli add-fast-image-processor --model-name your_model_name
 ```

 This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -108,7 +108,7 @@ If in doubt about what args/kwargs a given model sends to the attention function
 ## Accessing current available implementations

 Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
-and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
+and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
 would expect from a usual Python dictionary:

 ```python
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -1,279 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Utilizing the @auto_docstring Decorator
-
-The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
-
---
-
-## 📜 How it Works
-
-The `@auto_docstring` decorator constructs docstrings by:
-
-1.  **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
-2.  **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
-3.  **Overriding or Adding Arguments Descriptions:**
-    * **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
-    * **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-4.  **Adding Classes and Functions Introduction:**
-    * **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
-    * **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
-5.  **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
-6.  **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
-7.  **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
-8.  **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
-
-
---
-
-## 🚀 How to Use @auto_docstring
-
-### 1. Importing the Decorator
-Import the decorator into your modeling file:
-
-```python
-from ...utils import auto_docstring
-```
-
-### 2. Applying to Classes
-Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
-
-```python
-from transformers.modeling_utils import PreTrainedModel
-from ...utils import auto_docstring
-
-@auto_docstring
-class MyAwesomeModel(PreTrainedModel):
-    def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
-        r"""
-        custom_parameter (`int`, *optional*, defaults to 10):
-            Description of the custom_parameter for MyAwesomeModel.
-        another_custom_arg (`str`, *optional*, defaults to "default"):
-            Documentation for another unique argument.
-        """
-        super().__init__(config)
-        self.custom_parameter = custom_parameter
-        self.another_custom_arg = another_custom_arg
-        # ... rest of your init
-
-    # ... other methods
-```
-
-#### Advanced Class Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control:
-
-```python
-@auto_docstring(
-    custom_intro="""This model performs specific synergistic operations.
-    It builds upon the standard Transformer architecture with unique modifications.""",
-    custom_args="""
-    custom_parameter (`type`, *optional*, defaults to `default_value`):
-        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
-    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
-    """
-)
-class MySpecialModel(PreTrainedModel):
-    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
-        # ...
-```
-
-Or:
-
-```python
-@auto_docstring(
-    custom_intro="""This model performs specific synergistic operations.
-    It builds upon the standard Transformer architecture with unique modifications.""",
-)
-class MySpecialModel(PreTrainedModel):
-    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
-        r"""
-        custom_parameter (`type`, *optional*, defaults to `default_value`):
-            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
-        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
-        """
-        # ...
-```
-
-### 3. Applying to Functions (e.g., `forward` method)
-Apply the decorator above method definitions, such as the `forward` method.
-
-```python
-    @auto_docstring
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        new_custom_argument: Optional[torch.Tensor] = None,
-        arg_documented_in_args_doc: Optional[torch.Tensor] = None,
-        # ... other arguments
-    ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
-        r"""
-        new_custom_argument (`torch.Tensor`, *optional*):
-            Description of this new custom argument and its expected shape or type.
-        """
-        # ...
-```
-
-#### Advanced Function Decoration:
-
-Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
-
-```python
-MODEL_COMMON_CUSTOM_ARGS = r"""
-    common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
-        Description of common_arg_1
-    common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
-        Description of common_arg_2
-    ...
-"""
-
-class MyModel(PreTrainedModel):
-    # ...
-    @auto_docstring(
-        custom_intro="""
-        This is a custom introduction for the function.
-        """
-        custom_args=MODEL_COMMON_CUSTOM_ARGS
-    )
-    def forward(
-        self,
-        input_ids: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        common_arg_1: Optional[torch.Tensor] = None,
-        common_arg_2: Optional[torch.Tensor] = None,
-        #...
-        function_specific_argument: Optional[torch.Tensor] = None,
-        # ... other arguments
-    ) -> torch.Tensor:
-        r"""
-        function_specific_argument (`torch.Tensor`, *optional*):
-            Description of an argument specific to this function
-
-        Returns:
-            `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
-
-        Example:
-
-        (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
-
-        ```python
-        ...
-        ```
-        """
-        # ...
-```
-
---
-
-### ✍️ Documenting Arguments: Approach & Priority
-
-1.  **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
-    * `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
-
-2.  **New or Custom Arguments:**
-    * **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
-    * **Format:**
-        ```
-        argument_name (`type`, *optional*, defaults to `X`):
-            Description of the argument.
-            Explain its purpose, expected shape/type if complex, and default behavior.
-            This can span multiple lines.
-        ```
-    * Include `type` in backticks.
-    * Add "*optional*" if the argument is not required (has a default value).
-    * Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
-
-3.  **Overriding Standard Arguments:**
-    * If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
-    * The `labels` argument is often customized per model and typically requires a specific docstring.
-
-4.  **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
-    * New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
-
---
-
-### Usage with [modular files](./modular_transformers)
-
-When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
-
- **For standalone models in modular files:**
-  Apply the `@auto_docstring` decorator just as you would in regular modeling files.
-
- **For models inheriting from other library models:**
-  - When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
-  - If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
-
-  > **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
-
-
-**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
-
---
-
-## ✅ Checking Your Docstrings with `check_auto_docstrings`
-
-The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
-
-#### What it Checks:
-
-* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
-* **Argument Completeness & Consistency:**
-    * Flags arguments in the signature that are not known standard arguments and lack a local description.
-    * Ensures documented arguments exist in the signature. (TODO)
-    * Verifies that types and default values in the docstring match the signature. (TODO)
-* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
-* **Formatting:** Adherence to the expected docstring style.
-
-#### Running the Check Locally:
-
-Run this check locally before committing. The common command is:
-
-```bash
-make fix-copies
-```
-
-Alternatively, to only perform docstrings and auto-docstring checks, you can use:
-
-```bash
-python utils/check_docstrings.py # to only check files included in the diff without fixing them
-# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
-# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
-```
-
-#### Workflow with the Checker:
-
-1.  Add `@auto_docstring(...)` to the class or method.
-2.  For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
-3.  Run `make fix-copies` (or the `check_docstrings.py` utility).
-    * For unrecognized arguments lacking documentation, the utility will create placeholder entries.
-4.  Manually edit these placeholders with accurate types and descriptions.
-5.  Re-run the check to ensure all issues are resolved.
-
---
-
-## 🔑 Key Takeaways & Best Practices
-
-* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features` etc.).
-* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
-* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
-* Document new or custom arguments clearly.
-* Run `check_docstrings` locally and iteratively.
-
-By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -25,12 +25,12 @@ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_

 This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].

-## transformers CLI
+## transformers-cli

 Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.

 ```bash
-transformers chat Qwen/Qwen2.5-0.5B-Instruct
+transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
 ```

 <div class="flex justify-center">
@ -40,7 +40,7 @@ transformers chat Qwen/Qwen2.5-0.5B-Instruct
 For a full list of options, run the command below.

 ```bash
-transformers chat -h
+transformers-cli chat -h
 ```

 The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating).
@ -76,16 +76,16 @@ print(response[0]["generated_text"][-1]["content"])
 (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
 alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!

-So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
-things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
-Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
-something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
+So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million 
+things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of 
+Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for 
+something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got 
 some wild stuff, like that Warhol guy's soup cans and all that jazz.

-And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
+And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for 
 those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.

-Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
+Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might 
 even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)

 And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
@ -107,9 +107,9 @@ print(response[0]["generated_text"][-1]["content"])
 ```

 ```txt
-(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
-It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
-like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
+(laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man! 
+It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's 
+like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!" 
 (sarcastically) Oh, yeah, real original, Andy.

 But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
--- a/docs/source/en/model_doc/aria.md
+++ b/docs/source/en/model_doc/aria.md
@ -102,10 +102,6 @@ response = processor.decode(output_ids, skip_special_tokens=True)

 [[autodoc]] AriaTextModel

-## AriaModel
-
-[[autodoc]] AriaModel
-
 ## AriaTextForCausalLM

 [[autodoc]] AriaTextForCausalLM
--- a/docs/source/en/model_doc/aya_vision.md
+++ b/docs/source/en/model_doc/aya_vision.md
@ -237,10 +237,6 @@ for i, output in enumerate(batch_outputs):

 [[autodoc]] AyaVisionConfig

-## AyaVisionModel
-
-[[autodoc]] AyaVisionModel
-
 ## AyaVisionForConditionalGeneration

 [[autodoc]] AyaVisionForConditionalGeneration
--- a/docs/source/en/model_doc/beit.md
+++ b/docs/source/en/model_doc/beit.md
@ -150,11 +150,6 @@ If you're interested in submitting a resource to be included here, please feel f
 [[autodoc]] BeitImageProcessor
    - preprocess
    - post_process_semantic_segmentation
-## BeitImageProcessorFast
-
-[[autodoc]] BeitImageProcessorFast
-    - preprocess
-    - post_process_semantic_segmentation

 <frameworkcontent>
 <pt>
--- a/docs/source/en/model_doc/bert.md
+++ b/docs/source/en/model_doc/bert.md
@ -81,10 +81,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model google-bert/bert-base-uncased --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
 ```

 </hfoption>
@ -256,4 +256,4 @@ echo -e "Plants create [MASK] through a process known as photosynthesis." | tran

 [[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput

-[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@ -35,7 +35,7 @@ The example below demonstrates how to generate code with [`Pipeline`], or the [`

 <hfoptions id="usage">
 <hfoption id="Pipeline">
-
+    
 ```py
 import torch
 from transformers import pipeline
@ -76,7 +76,7 @@ prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
 input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

 output = model.generate(
-    **input_ids,
+    **input_ids, 
    max_new_tokens=256,
    cache_implementation="static"
 )
@ -92,10 +92,10 @@ print(filled_text)
 ```

 </hfoption>
-<hfoption id="transformers CLI">
-
+<hfoption id="transformers-cli">
+    
 ```bash
-echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
+echo -e "# Function to calculate the factorial of a number\ndef factorial(n):" | transformers-cli run --task text-generation --model meta-llama/CodeLlama-7b-hf --device 0
 ```

 </hfoption>
@ -146,7 +146,7 @@ visualizer("""def func(a, b):
 - Use the `<FILL_ME>` token where you want your input to be filled. The tokenizer splits this token to create a formatted input string that follows the [original training pattern](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402). This is more robust than preparing the pattern yourself.
    ```py
    from transformers import LlamaForCausalLM, CodeLlamaTokenizer
-
+    
    tokenizer = CodeLlamaTokenizer.from_pretrained("meta-llama/CodeLlama-7b-hf")
    model = LlamaForCausalLM.from_pretrained("meta-llama/CodeLlama-7b-hf")
    PROMPT = '''def remove_non_ascii(s: str) -> str:
@ -155,7 +155,7 @@ visualizer("""def func(a, b):
    '''
    input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
    generated_ids = model.generate(input_ids, max_new_tokens=128)
-
+    
    filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
    print(PROMPT.replace("<FILL_ME>", filling))
    ```
--- a/docs/source/en/model_doc/cohere.md
+++ b/docs/source/en/model_doc/cohere.md
@ -49,9 +49,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
+    input_ids, 
+    max_new_tokens=100, 
+    do_sample=True, 
    temperature=0.3,
    cache_implementation="static",
 )
@ -59,11 +59,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
+transformers-cli chat --model_name_or_path CohereForAI/c4ai-command-r-v01 --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -85,9 +85,9 @@ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01", t
 messages = [{"role": "user", "content": "How do plants make energy?"}]
 input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
 output = model.generate(
-    input_ids,
-    max_new_tokens=100,
-    do_sample=True,
+    input_ids, 
+    max_new_tokens=100, 
+    do_sample=True, 
    temperature=0.3,
    cache_implementation="static",
 )
--- a/docs/source/en/model_doc/csm.md
+++ b/docs/source/en/model_doc/csm.md
@ -1,377 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Csm
-
-## Overview
-
-The Conversational Speech Model (CSM) is the first open-source contextual text-to-speech model [released by Sesame](https://www.sesame.com/research/crossing_the_uncanny_valley_of_voice). It is designed to generate natural-sounding speech with or without conversational context. This context typically consists of multi-turn dialogue between speakers, represented as sequences of text and corresponding spoken audio.
-
-**Model Architecture:**
-CSM is composed of two LLaMA-style auto-regressive transformer decoders: a backbone decoder that predicts the first codebook token and a depth decoder that generates the remaining tokens. It uses the pretrained codec model [Mimi](./mimi.md), introduced by Kyutai, to encode speech into discrete codebook tokens and decode them back into audio.
-
-The original csm-1b checkpoint is available under the [Sesame](https://huggingface.co/sesame/csm-1b) organization on Hugging Face.
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/eustlb/documentation-images/resolve/main/csm_architecture.png"/>
-</div>
-
-## Usage Tips
-
-### Without Conversational Context
-
-CSM can be used to simply generate speech from a text prompt:
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs
-text = "[0]The past is just a story we tell ourselves." # `[0]` for speaker id 0
-inputs = processor(text, add_special_tokens=True).to(device)
-
-# another equivalent way to prepare the inputs
-conversation = [
-    {"role": "0", "content": [{"type": "text", "text": "The past is just a story we tell ourselves."}]},
-]
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-# infer the model
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, "example_without_context.wav")
-```
-
-### With Conversational Context
-
-CSM can be used to generate speech given a conversation, allowing consistency in the voices and content-aware generation:
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-conversation = []
-
-# 1. context
-for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
-    conversation.append(
-        {
-            "role": f"{speaker_id}",
-            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
-        }
-    )
-
-# 2. text prompt
-conversation.append({"role": f"{ds[4]['speaker_id']}", "content": [{"type": "text", "text": ds[4]["text"]}]})
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-# infer the model
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, "example_with_context.wav")
-```
-
-### Batched Inference
-
-CSM supports batched inference!
-
-```python
-import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# prepare the inputs 
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-# here a batch with two prompts
-conversation = [
-    [
-        {
-            "role": f"{ds[0]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[0]["text"]},
-                {"type": "audio", "path": ds[0]["audio"]["array"]},
-            ],
-        },
-        {
-            "role": f"{ds[1]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[1]["text"]},
-            ],
-        },
-    ],
-    [
-        {
-            "role": f"{ds[0]['speaker_id']}",
-            "content": [
-                {"type": "text", "text": ds[0]["text"]},
-            ],
-        }
-    ],
-]
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-audio = model.generate(**inputs, output_audio=True)
-processor.save_audio(audio, [f"speech_batch_idx_{i}.wav" for i in range(len(audio))])
-```
-
-### Making The Model Go Brrr
-
-CSM supports full-graph compilation with CUDA graphs!
-
-```python
-import torch
-import copy
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset
-
-model_id = "eustlb/csm-1b"
-device = "cuda"
-
-# set logs to ensure no recompilation and graph breaks
-torch._logging.set_logs(graph_breaks=True, recompiles=True, cudagraphs=True)
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-
-# use static cache, enabling automatically torch compile with fullgraph and reduce-overhead
-model.generation_config.max_length = 250 # big enough to avoid recompilation
-model.generation_config.max_new_tokens = None # would take precedence over max_length
-model.generation_config.cache_implementation = "static"
-model.depth_decoder.generation_config.cache_implementation = "static"
-
-# generation kwargs
-gen_kwargs = {
-    "do_sample": False,
-    "depth_decoder_do_sample": False,
-    "temperature": 1.0,
-    "depth_decoder_temperature": 1.0,
-}
-
-# Define a timing context manager
-class TimerContext:
-    def __init__(self, name="Execution"):
-        self.name = name
-        self.start_event = None
-        self.end_event = None
-        
-    def __enter__(self):
-        # Use CUDA events for more accurate GPU timing
-        self.start_event = torch.cuda.Event(enable_timing=True)
-        self.end_event = torch.cuda.Event(enable_timing=True)
-        self.start_event.record()
-        return self
-
-    def __exit__(self, *args):
-        self.end_event.record()
-        torch.cuda.synchronize()
-        elapsed_time = self.start_event.elapsed_time(self.end_event) / 1000.0
-        print(f"{self.name} time: {elapsed_time:.4f} seconds")
-
-# prepare the inputs 
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-
-conversation = [
-    {
-        "role": f"{ds[0]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[0]["text"]},
-            {"type": "audio", "path": ds[0]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[1]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[1]["text"]},
-            {"type": "audio", "path": ds[1]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[2]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[2]["text"]},
-        ],
-    },
-]
-
-padded_inputs_1 = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-print("\n" + "="*50)
-print("First generation - compiling and recording CUDA graphs...")
-with TimerContext("First generation"):
-    _ = model.generate(**padded_inputs_1, **gen_kwargs)
-print("="*50)
-
-print("\n" + "="*50)
-print("Second generation - fast !!!")
-with TimerContext("Second generation"):
-    _ = model.generate(**padded_inputs_1, **gen_kwargs)
-print("="*50)
-
-# now with different inputs
-conversation = [
-    {
-        "role": f"{ds[0]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[2]["text"]},
-            {"type": "audio", "path": ds[2]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[1]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[3]["text"]},
-            {"type": "audio", "path": ds[3]["audio"]["array"]},
-        ],
-    },
-    {
-        "role": f"{ds[2]['speaker_id']}",
-        "content": [
-            {"type": "text", "text": ds[4]["text"]},
-        ],
-    },
-]
-padded_inputs_2 = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-).to(device)
-
-print("\n" + "="*50)
-print("Generation with other inputs!")
-with TimerContext("Generation with different inputs"):
-    _ = model.generate(**padded_inputs_2, **gen_kwargs)
-print("="*50)
-```
-
-### Training
-
-CSM Transformers integration supports training!
-
-```python
-from transformers import CsmForConditionalGeneration, AutoProcessor
-from datasets import load_dataset, Audio
-
-model_id = "eustlb/csm-1b"
-device = "cuda"
-
-# load the model and the processor
-processor = AutoProcessor.from_pretrained(model_id)
-model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=device)
-model.train()
-
-ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
-# ensure the audio is 24kHz
-ds = ds.cast_column("audio", Audio(sampling_rate=24000))
-conversation = []
-
-# context
-for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
-    conversation.append(
-        {
-            "role": f"{speaker_id}",
-            "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
-        }
-    )
-
-inputs = processor.apply_chat_template(
-    conversation,
-    tokenize=True,
-    return_dict=True,
-    output_labels=True,
-).to(device)
-
-out = model(**inputs)
-out.loss.backward()
-```
-
-This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
-The original code can be found [here](https://github.com/SesameAILabs/csm).
-
-
-## CsmConfig
-
-[[autodoc]] CsmConfig
-
-## CsmDepthDecoderConfig
-
-[[autodoc]] CsmDepthDecoderConfig
-
-## CsmProcessor
-
-[[autodoc]] CsmProcessor
-    - __call__
-
-## CsmForConditionalGeneration
-
-[[autodoc]] CsmForConditionalGeneration
-    - forward
-    - generate
-
-## CsmDepthDecoderForCausalLM
-
-[[autodoc]] CsmDepthDecoderForCausalLM
-
-## CsmDepthDecoderModel
-
-[[autodoc]] CsmDepthDecoderModel
-
-## CsmBackboneModel
-
-[[autodoc]] CsmBackboneModel
--- a/docs/source/en/model_doc/d_fine.md
+++ b/docs/source/en/model_doc/d_fine.md
@ -1,76 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# D-FINE
-
-## Overview
-
-The D-FINE model was proposed in [D-FINE: Redefine Regression Task in DETRs as Fine-grained Distribution Refinement](https://arxiv.org/abs/2410.13842) by
-Yansong Peng, Hebei Li, Peixi Wu, Yueyi Zhang, Xiaoyan Sun, Feng Wu.
-
-The abstract from the paper is the following:
-
-*We introduce D-FINE, a powerful real-time object detector that achieves outstanding localization precision by redefining the bounding box regression task in DETR models. D-FINE comprises two key components: Fine-grained Distribution Refinement (FDR) and Global Optimal Localization Self-Distillation (GO-LSD). 
-FDR transforms the regression process from predicting fixed coordinates to iteratively refining probability distributions, providing a fine-grained intermediate representation that significantly enhances localization accuracy. GO-LSD is a bidirectional optimization strategy that transfers localization knowledge from refined distributions to shallower layers through self-distillation, while also simplifying the residual prediction tasks for deeper layers. Additionally, D-FINE incorporates lightweight optimizations in computationally intensive modules and operations, achieving a better balance between speed and accuracy. Specifically, D-FINE-L / X achieves 54.0% / 55.8% AP on the COCO dataset at 124 / 78 FPS on an NVIDIA T4 GPU. When pretrained on Objects365, D-FINE-L / X attains 57.1% / 59.3% AP, surpassing all existing real-time detectors. Furthermore, our method significantly enhances the performance of a wide range of DETR models by up to 5.3% AP with negligible extra parameters and training costs. Our code and pretrained models: https://github.com/Peterande/D-FINE.*
-
-This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber). 
-The original code can be found [here](https://github.com/Peterande/D-FINE).
-
-## Usage tips 
-
-```python
->>> import torch
->>> from transformers.image_utils import load_image
->>> from transformers import DFineForObjectDetection, AutoImageProcessor
-
->>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
->>> image = load_image(url)
-
->>> image_processor = AutoImageProcessor.from_pretrained("ustc-community/dfine_x_coco")
->>> model = DFineForObjectDetection.from_pretrained("ustc-community/dfine_x_coco")
-
->>> inputs = image_processor(images=image, return_tensors="pt")
-
->>> with torch.no_grad():
-...     outputs = model(**inputs)
-
->>> results = image_processor.post_process_object_detection(outputs, target_sizes=[(image.height, image.width)], threshold=0.5)
-
->>> for result in results:
-...     for score, label_id, box in zip(result["scores"], result["labels"], result["boxes"]):
-...         score, label = score.item(), label_id.item()
-...         box = [round(i, 2) for i in box.tolist()]
-...         print(f"{model.config.id2label[label]}: {score:.2f} {box}")
-cat: 0.96 [344.49, 23.4, 639.84, 374.27]
-cat: 0.96 [11.71, 53.52, 316.64, 472.33]
-remote: 0.95 [40.46, 73.7, 175.62, 117.57]
-sofa: 0.92 [0.59, 1.88, 640.25, 474.74]
-remote: 0.89 [333.48, 77.04, 370.77, 187.3]
-```
-
-## DFineConfig
-
-[[autodoc]] DFineConfig
-
-## DFineModel
-
-[[autodoc]] DFineModel
-    - forward
-
-## DFineForObjectDetection
-
-[[autodoc]] DFineForObjectDetection
-    - forward
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@ -111,68 +111,33 @@ print("Predicted class:", model.config.id2label[predicted_class_idx])

 ## Notes

- The example below shows how to split the output tensor into:
-  - one embedding for the whole image, commonly referred to as a `CLS` token,
-    useful for classification and retrieval
-  - a set of local embeddings, one for each `14x14` patch of the input image,
-    useful for dense tasks, such as semantic segmentation
+- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speed up inference. However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.

-  ```py
-  from transformers import AutoImageProcessor, AutoModel
-  from PIL import Image
-  import requests
-  
-  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-  image = Image.open(requests.get(url, stream=True).raw)
-  print(image.height, image.width)  # [480, 640]
-  
-  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-  model = AutoModel.from_pretrained('facebook/dinov2-base')
-  patch_size = model.config.patch_size
-  
-  inputs = processor(images=image, return_tensors="pt")
-  print(inputs.pixel_values.shape)  # [1, 3, 224, 224]
-  batch_size, rgb, img_height, img_width = inputs.pixel_values.shape
-  num_patches_height, num_patches_width = img_height // patch_size, img_width // patch_size
-  num_patches_flat = num_patches_height * num_patches_width
-  
-  outputs = model(**inputs)
-  last_hidden_states = outputs[0]
-  print(last_hidden_states.shape)  # [1, 1 + 256, 768]
-  assert last_hidden_states.shape == (batch_size, 1 + num_patches_flat, model.config.hidden_size)
-  
-  cls_token = last_hidden_states[:, 0, :]
-  patch_features = last_hidden_states[:, 1:, :].unflatten(1, (num_patches_height, num_patches_width))
-  ```
+    ```py
+    import torch
+    from transformers import AutoImageProcessor, AutoModel
+    from PIL import Image
+    import requests

- Use [torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) to speedup inference.
-  However, it will produce some mismatched elements. The difference between the original and traced model is 1e-4.
+    url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+    image = Image.open(requests.get(url, stream=True).raw)

-  ```py
-  import torch
-  from transformers import AutoImageProcessor, AutoModel
-  from PIL import Image
-  import requests
-  
-  url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
-  image = Image.open(requests.get(url, stream=True).raw)
-  
-  processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
-  model = AutoModel.from_pretrained('facebook/dinov2-base')
-  
-  inputs = processor(images=image, return_tensors="pt")
-  outputs = model(**inputs)
-  last_hidden_states = outputs[0]
-  
-  # We have to force return_dict=False for tracing
-  model.config.return_dict = False
-  
-  with torch.no_grad():
-      traced_model = torch.jit.trace(model, [inputs.pixel_values])
-      traced_outputs = traced_model(inputs.pixel_values)
-  
-  print((last_hidden_states - traced_outputs[0]).abs().max())
-  ```
+    processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+    model = AutoModel.from_pretrained('facebook/dinov2-base')
+
+    inputs = processor(images=image, return_tensors="pt")
+    outputs = model(**inputs)
+    last_hidden_states = outputs[0]
+
+    # We have to force return_dict=False for tracing
+    model.config.return_dict = False
+
+    with torch.no_grad():
+        traced_model = torch.jit.trace(model, [inputs.pixel_values])
+        traced_outputs = traced_model(inputs.pixel_values)
+
+    print((last_hidden_states - traced_outputs[0]).abs().max())
+    ```

 ## Dinov2Config

--- a/docs/source/en/model_doc/distilbert.md
+++ b/docs/source/en/model_doc/distilbert.md
@ -83,10 +83,10 @@ print(f"Predicted label: {predicted_label}")

 </hfoption>

-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "I love using Hugging Face Transformers!" | transformers run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
+echo -e "I love using Hugging Face Transformers!" | transformers-cli run --task text-classification --model distilbert-base-uncased-finetuned-sst-2-english
 ```

 </hfoption>
@ -213,3 +213,7 @@ echo -e "I love using Hugging Face Transformers!" | transformers run --task text

 </jax>
 </frameworkcontent>
+
+
+
+
--- a/docs/source/en/model_doc/electra.md
+++ b/docs/source/en/model_doc/electra.md
@ -45,9 +45,9 @@ import torch
 from transformers import pipeline

 classifier = pipeline(
-    task="text-classification",
-    model="bhadresh-savani/electra-base-emotion",
-    torch_dtype=torch.float16,
+    task="text-classification", 
+    model="bhadresh-savani/electra-base-emotion", 
+    torch_dtype=torch.float16, 
    device=0
 )
 classifier("This restaurant has amazing food!")
@ -64,7 +64,7 @@ tokenizer = AutoTokenizer.from_pretrained(
    "bhadresh-savani/electra-base-emotion",
 )
 model = AutoModelForSequenceClassification.from_pretrained(
-    "bhadresh-savani/electra-base-emotion",
+    "bhadresh-savani/electra-base-emotion", 
    torch_dtype=torch.float16
 )
 inputs = tokenizer("ELECTRA is more efficient than BERT", return_tensors="pt")
@ -78,10 +78,10 @@ print(f"Predicted label: {predicted_label}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "This restaurant has amazing food." | transformers run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
+echo -e "This restaurant has amazing food." | transformers-cli run --task text-classification --model bhadresh-savani/electra-base-emotion --device 0
 ```

 </hfoption>
@ -96,12 +96,12 @@ echo -e "This restaurant has amazing food." | transformers run --task text-class

    ```py
    # Example of properly handling padding with attention masks
-    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"],
-                    padding=True,
+    inputs = tokenizer(["Short text", "This is a much longer text that needs padding"], 
+                    padding=True, 
                    return_tensors="pt")
    outputs = model(**inputs)  # automatically uses the attention_mask
    ```
-
+    
 - When using the discriminator for a downstream task, you can load it into any of the ELECTRA model classes ([`ElectraForSequenceClassification`], [`ElectraForTokenClassification`], etc.).

 ## ElectraConfig
--- a/docs/source/en/model_doc/emu3.md
+++ b/docs/source/en/model_doc/emu3.md
@ -174,10 +174,6 @@ for i, image in enumerate(images['pixel_values']):
 [[autodoc]] Emu3TextModel
    - forward

-## Emu3Model
-
-[[autodoc]] Emu3Model
-
 ## Emu3ForCausalLM

 [[autodoc]] Emu3ForCausalLM
--- a/docs/source/en/model_doc/falcon.md
+++ b/docs/source/en/model_doc/falcon.md
@ -41,7 +41,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    task="text-generation",
+    task="text-generation", 
    model="tiiuae/falcon-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -76,11 +76,11 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers-cli chat --model_name_or_path tiiuae/falcon-7b-instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -150,4 +150,4 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ## FalconForQuestionAnswering

 [[autodoc]] FalconForQuestionAnswering
-    - forward
+    - forward
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -39,7 +39,7 @@ import torch
 from transformers import pipeline

 pipeline = pipeline(
-    "text-generation",
+    "text-generation", 
    model="tiiuae/falcon-mamba-7b-instruct",
    torch_dtype=torch.bfloat16,
    device=0
@ -73,10 +73,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-transformers chat tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
+transformers-cli chat --model_name_or_path tiiuae/falcon-mamba-7b-instruct --torch_dtype auto --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/fuyu.md
+++ b/docs/source/en/model_doc/fuyu.md
@ -103,10 +103,6 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.

 [[autodoc]] FuyuConfig

-## FuyuModel
-
-[[autodoc]] FuyuModel
-
 ## FuyuForCausalLM

 [[autodoc]] FuyuForCausalLM
--- a/docs/source/en/model_doc/gemma.md
+++ b/docs/source/en/model_doc/gemma.md
@ -80,10 +80,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "LLMs generate text through a process known as" | transformers run --task text-generation --model google/gemma-2b --device 0
+echo -e "LLMs generate text through a process known as" | transformers-cli run --task text-generation --model google/gemma-2b --device 0
 ```

 </hfoption>
@ -114,8 +114,8 @@ model = AutoModelForCausalLM.from_pretrained(
 input_text = "LLMs generate text through a process known as."
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
 outputs = model.generate(
-    **input_ids,
-    max_new_tokens=50,
+    **input_ids, 
+    max_new_tokens=50, 
    cache_implementation="static"
 )
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -127,7 +127,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer

 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("LLMs generate text through a process known as")
+visualizer("LLMs generate text through a process known as") 
 ```

 <div class="flex justify-center">
--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -58,7 +58,7 @@ pipe("Explain quantum computing simply. ", max_new_tokens=50)

 </hfoption>
 <hfoption id="AutoModel">
-
+    
 ```python
 import torch
 from transformers import AutoTokenizer, AutoModelForCausalLM
@ -80,16 +80,16 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```
-echo -e "Explain quantum computing simply." | transformers run --task text-generation --model google/gemma-2-2b --device 0
+echo -e "Explain quantum computing simply." | transformers-cli run --task text-generation --model google/gemma-2-2b --device 0
 ```
 </hfoption>
 </hfoptions>

 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
-
+	
 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.

 ```python
@ -118,7 +118,7 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl
 ```python
 from transformers.utils.attention_visualizer import AttentionMaskVisualizer
 visualizer = AttentionMaskVisualizer("google/gemma-2b")
-visualizer("You are an assistant. Make sure you print me")
+visualizer("You are an assistant. Make sure you print me") 
 ```

 <div class="flex justify-center">
@ -137,7 +137,7 @@ visualizer("You are an assistant. Make sure you print me")

    inputs = tokenizer(text="My name is Gemma", return_tensors="pt")
    max_generated_length = inputs.input_ids.shape[1] + 10
-    past_key_values = HybridCache(config=model.config, max_batch_size=1,
+    past_key_values = HybridCache(config=model.config, max_batch_size=1, 
    max_cache_len=max_generated_length, device=model.device, dtype=model.dtype)
    outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    ```
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@ -28,7 +28,7 @@ rendered properly in your Markdown viewer.

 The instruction-tuned variant was post-trained with knowledge distillation and reinforcement learning.

-You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) release.
+You can find all the original Gemma 3 checkpoints under the [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) release.

 > [!TIP]
 > Click on the Gemma 3 models in the right sidebar for more examples of how to apply Gemma to different vision and language tasks.
@ -99,10 +99,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model google/gemma-3-1b-pt --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model google/gemma-3-1b-pt --device 0
 ```

 </hfoption>
@ -254,10 +254,6 @@ visualizer("<img>What is shown in this image?")
 [[autodoc]] Gemma3TextModel
    - forward

-## Gemma3Model
-
-[[autodoc]] Gemma3Model
-
 ## Gemma3ForCausalLM

 [[autodoc]] Gemma3ForCausalLM
--- a/docs/source/en/model_doc/got_ocr2.md
+++ b/docs/source/en/model_doc/got_ocr2.md
@ -277,10 +277,6 @@ alt="drawing" width="600"/>

 [[autodoc]] GotOcr2Processor

-## GotOcr2Model
-
-[[autodoc]] GotOcr2Model
-
 ## GotOcr2ForConditionalGeneration

 [[autodoc]] GotOcr2ForConditionalGeneration
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -64,21 +64,15 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Hello, I'm a language model" | transformers run --task text-generation --model openai-community/gpt2 --device 0
+echo -e "Hello, I'm a language model" | transformers-cli run --task text-generation --model openai-community/gpt2 --device 0
 ```

 </hfoption>
 </hfoptions>

-One can also serve the model using vLLM with the `transformers` backend.
-
-```
-vllm serve openai-community/gpt2 --model-impl transformers
-```
-
 Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

 The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
@ -88,16 +82,16 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

 quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype="float16",
-    bnb_4bit_use_double_quant=True
+    load_in_4bit=True,  
+    bnb_4bit_quant_type="nf4",  
+    bnb_4bit_compute_dtype="float16",  
+    bnb_4bit_use_double_quant=True 
 )

 model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2-xl",
    quantization_config=quantization_config,
-    device_map="auto"
+    device_map="auto"  
 )

 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
--- a/docs/source/en/model_doc/granitemoehybrid.md
+++ b/docs/source/en/model_doc/granitemoehybrid.md
@ -1,64 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# GraniteMoeHybrid
-
-## Overview
-
-
-The `GraniteMoeHybrid` model builds on top of `GraniteMoeSharedModel` and `Bamba`. Its decoding layers consist of state space layers or MoE attention layers with shared experts. By default, the attention layers do not use positional encoding.
-
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_path = "ibm-granite/granite-4.0-tiny-preview"
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-
-# drop device_map if running on CPU
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
-model.eval()
-
-# change input text as desired
-prompt = "Write a code to find the maximum value in a list of numbers."
-
-# tokenize the text
-input_tokens = tokenizer(prompt, return_tensors="pt")
-# generate output tokens
-output = model.generate(**input_tokens, max_new_tokens=100)
-# decode output tokens into text
-output = tokenizer.batch_decode(output)
-# loop over the batch to print, in this example the batch size is 1
-for i in output:
-    print(i)
-```
-
-This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co/SukritiSharma) and [Alexander Brooks](https://huggingface.co/abrooks9944).
-
-
-## GraniteMoeHybridConfig
-
-[[autodoc]] GraniteMoeHybridConfig
-
-## GraniteMoeHybridModel
-
-[[autodoc]] GraniteMoeHybridModel
-    - forward
-
-## GraniteMoeHybridForCausalLM
-
-[[autodoc]] GraniteMoeHybridForCausalLM
-    - forward
--- a/docs/source/en/model_doc/hgnet_v2.md
+++ b/docs/source/en/model_doc/hgnet_v2.md
@ -1,46 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# HGNet-V2
-
-## Overview
-
-A HGNet-V2 (High Performance GPU Net) image classification model.
-The HGNet architecture was proposed in [HGNET: A Hierarchical Feature Guided Network for Occupancy Flow Field Prediction](https://arxiv.org/abs/2407.01097) by
-Zhan Chen, Chen Tang, Lu Xiong.
-
-The abstract from the HGNET paper is the following:
-
-*Predicting the motion of multiple traffic participants has always been one of the most challenging tasks in autonomous driving. The recently proposed occupancy flow field prediction method has shown to be a more effective and scalable representation compared to general trajectory prediction methods. However, in complex multi-agent traffic scenarios, it remains difficult to model the interactions among various factors and the dependencies among prediction outputs at different time steps. In view of this, we propose a transformer-based hierarchical feature guided network (HGNET), which can efficiently extract features of agents and map information from visual and vectorized inputs, modeling multimodal interaction relationships. Second, we design the Feature-Guided Attention (FGAT) module to leverage the potential guiding effects between different prediction targets, thereby improving prediction accuracy. Additionally, to enhance the temporal consistency and causal relationships of the predictions, we propose a Time Series Memory framework to learn the conditional distribution models of the prediction outputs at future time steps from multivariate time series. The results demonstrate that our model exhibits competitive performance, which ranks 3rd in the 2024 Waymo Occupancy and Flow Prediction Challenge.*
-
-This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber). 
-The original code can be found [here](https://github.com/PaddlePaddle/PaddleDetection/blob/develop/ppdet/modeling/backbones/hgnet_v2.py).
-
-## HGNetV2Config
-
-[[autodoc]] HGNetV2Config
-
-
-## HGNetV2Backbone
-
-[[autodoc]] HGNetV2Backbone
-    - forward
-
-
-## HGNetV2ForImageClassification
-
-[[autodoc]] HGNetV2ForImageClassification
-    - forward
--- a/docs/source/en/model_doc/instructblip.md
+++ b/docs/source/en/model_doc/instructblip.md
@ -69,10 +69,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
 [[autodoc]] InstructBlipQFormerModel
    - forward

-## InstructBlipModel
-
-[[autodoc]] InstructBlipModel
-
 ## InstructBlipForConditionalGeneration

 [[autodoc]] InstructBlipForConditionalGeneration
--- a/docs/source/en/model_doc/instructblipvideo.md
+++ b/docs/source/en/model_doc/instructblipvideo.md
@ -73,10 +73,6 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
 [[autodoc]] InstructBlipVideoQFormerModel
    - forward

-## InstructBlipVideoModel
-[[autodoc]] InstructBlipVideoModel
-    - forward
-
 ## InstructBlipVideoForConditionalGeneration

 [[autodoc]] InstructBlipVideoForConditionalGeneration
--- a/docs/source/en/model_doc/internvl.md
+++ b/docs/source/en/model_doc/internvl.md
@ -340,11 +340,6 @@ This example showcases how to handle a batch of chat conversations with interlea
 [[autodoc]] InternVLVisionModel
    - forward

-## InternVLModel
-
-[[autodoc]] InternVLModel
-    - forward
-
 ## InternVLForConditionalGeneration

 [[autodoc]] InternVLForConditionalGeneration
--- a/docs/source/en/model_doc/jamba.md
+++ b/docs/source/en/model_doc/jamba.md
@ -75,10 +75,10 @@ output = model.generate(**input_ids, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```
 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model ai21labs/AI21-Jamba-Mini-1.6 --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model huggyllama/llama-7b --device 0
+echo -e "Plants create energy through a process known as" | transformers-cli run --task text-generation --model huggyllama/llama-7b --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -74,10 +74,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-transformers chat meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
+transformers-cli chat --model_name_or_path meta-llama/Llama-2-7b-chat-hf --torch_dtype auto --attn_implementation flash_attention_2
 ```

 </hfoption>
@ -175,3 +175,4 @@ visualizer("Plants create energy through a process known as")

 [[autodoc]] LlamaForSequenceClassification
    - forward
+
--- a/docs/source/en/model_doc/llava.md
+++ b/docs/source/en/model_doc/llava.md
@ -256,10 +256,6 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 [[autodoc]] LlavaProcessor

-## LlavaModel
-
-[[autodoc]] LlavaModel
-
 ## LlavaForConditionalGeneration

 [[autodoc]] LlavaForConditionalGeneration
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -315,10 +315,6 @@ model = AutoModelForImageTextToText.from_pretrained(

 [[autodoc]] LlavaNextProcessor

-## LlavaNextModel
-
-[[autodoc]] LlavaNextModel
-
 ## LlavaNextForConditionalGeneration

 [[autodoc]] LlavaNextForConditionalGeneration
--- a/docs/source/en/model_doc/llava_next_video.md
+++ b/docs/source/en/model_doc/llava_next_video.md
@ -262,10 +262,6 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(

 [[autodoc]] LlavaNextVideoImageProcessor

-## LlavaNextVideoModel
-
-[[autodoc]] LlavaNextVideoModel
-
 ## LlavaNextVideoForConditionalGeneration

 [[autodoc]] LlavaNextVideoForConditionalGeneration
--- a/docs/source/en/model_doc/llava_onevision.md
+++ b/docs/source/en/model_doc/llava_onevision.md
@ -313,10 +313,6 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(

 [[autodoc]] LlavaOnevisionVideoProcessor

-## LlavaOnevisionModel
-
-[[autodoc]] LlavaOnevisionModel
-
 ## LlavaOnevisionForConditionalGeneration

 [[autodoc]] LlavaOnevisionForConditionalGeneration
--- a/docs/source/en/model_doc/longformer.md
+++ b/docs/source/en/model_doc/longformer.md
@ -76,10 +76,10 @@ tokenizer.decode(predictions).split()
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers run --task fill-mask --model allenai/longformer-base-4096 --device 0
+echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
 ```

 </hfoption>
@ -147,42 +147,42 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t

 ## LongformerForMaskedLM

-[[autodoc]] LongformerForMaskedLM
+[[autodoc]] LongformerForMaskedLM 
    - forward

 ## LongformerForSequenceClassification

-[[autodoc]] LongformerForSequenceClassification
+[[autodoc]] LongformerForSequenceClassification 
    - forward

 ## LongformerForMultipleChoice

-[[autodoc]] LongformerForMultipleChoice
+[[autodoc]] LongformerForMultipleChoice 
    - forward

 ## LongformerForTokenClassification

-[[autodoc]] LongformerForTokenClassification
+[[autodoc]] LongformerForTokenClassification 
    - forward

 ## LongformerForQuestionAnswering

-[[autodoc]] LongformerForQuestionAnswering
+[[autodoc]] LongformerForQuestionAnswering 
    - forward

 ## TFLongformerModel

-[[autodoc]] TFLongformerModel
+[[autodoc]] TFLongformerModel    
    - call

 ## TFLongformerForMaskedLM

-[[autodoc]] TFLongformerForMaskedLM
+[[autodoc]] TFLongformerForMaskedLM 
    - call

 ## TFLongformerForQuestionAnswering

-[[autodoc]] TFLongformerForQuestionAnswering
+[[autodoc]] TFLongformerForQuestionAnswering 
    - call

 ## TFLongformerForSequenceClassification
@ -192,10 +192,10 @@ echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of t

 ## TFLongformerForTokenClassification

-[[autodoc]] TFLongformerForTokenClassification
+[[autodoc]] TFLongformerForTokenClassification 
    - call

 ## TFLongformerForMultipleChoice

-[[autodoc]] TFLongformerForMultipleChoice
+[[autodoc]] TFLongformerForMultipleChoice 
    - call
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -27,7 +27,7 @@ rendered properly in your Markdown viewer.

 # Mistral

-[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing
+[Mistral](https://huggingface.co/papers/2310.06825) is a 7B parameter language model, available as a pretrained and instruction-tuned variant, focused on balancing 
 the scaling costs of large models with performance and efficient inference. This model uses sliding window attention (SWA) trained with a 8K context length and a fixed cache size to handle longer sequences more effectively. Grouped-query attention (GQA) speeds up inference and reduces memory requirements. Mistral also features a byte-fallback BPE tokenizer to improve token handling and efficiency by ensuring characters are never mapped to out-of-vocabulary tokens.

 You can find all the original Mistral checkpoints under the [Mistral AI_](https://huggingface.co/mistralai) organization.
@ -78,10 +78,10 @@ The example below demonstrates how to chat with [`Pipeline`] or the [`AutoModel`
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```python
-echo -e "My favorite condiment is" | transformers chat mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
+echo -e "My favorite condiment is" | transformers-cli chat --model_name_or_path mistralai/Mistral-7B-v0.3 --torch_dtype auto --device 0 --attn_implementation flash_attention_2
 ```

 </hfoption>
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -227,9 +227,6 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati

 [[autodoc]] Mistral3Config

-## Mistral3Model
-
-[[autodoc]] Mistral3Model

 ## Mistral3ForConditionalGeneration

--- a/docs/source/en/model_doc/mllama.md
+++ b/docs/source/en/model_doc/mllama.md
@ -130,10 +130,6 @@ print(processor.decode(output[0], skip_special_tokens=True))
 [[autodoc]] MllamaTextModel
    - forward

-## MllamaModel
-
-[[autodoc]] MllamaModel
-
 ## MllamaForCausalLM

 [[autodoc]] MllamaForCausalLM
--- a/docs/source/en/model_doc/mobilebert.md
+++ b/docs/source/en/model_doc/mobilebert.md
@ -76,10 +76,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "The capital of France is [MASK]." | transformers run --task fill-mask --model google/mobilebert-uncased --device 0
+echo -e "The capital of France is [MASK]." | transformers-cli run --task fill-mask --model google/mobilebert-uncased --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/modernbert.md
+++ b/docs/source/en/model_doc/modernbert.md
@ -79,10 +79,10 @@ print(f"The predicted token is: {predicted_token}")
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers run --task fill-mask --model answerdotai/ModernBERT-base --device 0
+echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model answerdotai/ModernBERT-base --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/openai-gpt.md
+++ b/docs/source/en/model_doc/openai-gpt.md
@ -70,10 +70,10 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "The future of AI is" | transformers run --task text-generation --model openai-community/openai-gpt --device 0
+echo -e "The future of AI is" | transformers-cli run --task text-generation --model openai-community/openai-gpt --device 0

 ```
 </hfoption>
--- a/docs/source/en/model_doc/paligemma.md
+++ b/docs/source/en/model_doc/paligemma.md
@ -174,10 +174,6 @@ visualizer("<img> What is in this image?")

 [[autodoc]] PaliGemmaProcessor

-## PaliGemmaModel
-
-[[autodoc]] PaliGemmaModel
-
 ## PaliGemmaForConditionalGeneration

 [[autodoc]] PaliGemmaForConditionalGeneration
--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@ -65,10 +65,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers run --task text-classification --model microsoft/phi-1.5 --device 0
+echo -e "'''def print_prime(n): """ Print all primes between 1 and n"""'''" | transformers-cli run --task text-generation --model microsoft/phi-1.5 --device 0
 ```

 </hfoption>
@ -102,7 +102,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
    ```py
    import torch
    from transformers import AutoTokenizer, AutoModelForCausalLM
-
+    
    tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/phi-1",
@ -110,12 +110,12 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="sdpa")
-
+    
    input_ids = tokenizer('''def print_prime(n):
       """
       Print all primes between 1 and n
       """''', return_tensors="pt").to("cuda")
-
+    
    output = model.generate(**input_ids, cache_implementation="static")
    print(tokenizer.decode(output[0], skip_special_tokens=True))
    ```
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -64,7 +64,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
-    torch_dtype=torch.bfloat16,
+    torch_dtype=torch.bfloat16, 
    device_map="auto",
    attn_implementation="sdpa"
 )
@ -86,10 +86,10 @@ generated_ids = model.generate(
    model_inputs.input_ids,
    cache_implementation="static",
    max_new_tokens=512,
-    do_sample=True,
-    temperature=0.7,
-    top_k=50,
-    top_p=0.95
+    do_sample=True, 
+    temperature=0.7, 
+    top_k=50,        
+    top_p=0.95       
 )
 generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
@ -100,11 +100,11 @@ print(response)
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
 # pip install -U flash-attn --no-build-isolation
-transformers chat Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
+transformers-cli chat --model_name_or_path Qwen/Qwen2-7B-Instruct --torch_dtype auto --attn_implementation flash_attention_2 --device 0
 ```

 </hfoption>
@ -121,21 +121,21 @@ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

 quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.bfloat16,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_use_double_quant=True,
+    bnb_4bit_compute_dtype=torch.bfloat16, 
+    bnb_4bit_quant_type="nf4",             
+    bnb_4bit_use_double_quant=True,       
 )

-tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B")
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B") 
 model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-7B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quantization_config,
-    attn_implementation="flash_attention_2"
+    attn_implementation="flash_attention_2" 
 )

-inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda")
+inputs = tokenizer("The Qwen2 model family is", return_tensors="pt").to("cuda") 
 outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```
--- a/docs/source/en/model_doc/qwen2_5_vl.md
+++ b/docs/source/en/model_doc/qwen2_5_vl.md
@ -118,7 +118,7 @@ The example below uses [torchao](../quantization/torchao) to only quantize the w

 ```python
 import torch
-from transformers import TorchAoConfig, Qwen2_5_VLForConditionalGeneration, AutoProcessor
+from transformers import TorchAoConfig, Qwen2_5_VLForConditionalGeneration, AutoProcessor

 quantization_config = TorchAoConfig("int4_weight_only", group_size=128)
 model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
@ -240,10 +240,6 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(

 [[autodoc]] Qwen2_5_VLProcessor

-## Qwen2_5_VLTextModel
-
-[[autodoc]] Qwen2_5_VLTextModel
-    - forward

 ## Qwen2_5_VLModel

--- a/docs/source/en/model_doc/qwen2_vl.md
+++ b/docs/source/en/model_doc/qwen2_vl.md
@ -296,11 +296,6 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(

 [[autodoc]] Qwen2VLProcessor

-## Qwen2VLTextModel
-
-[[autodoc]] Qwen2VLTextModel
-    - forward
-    
 ## Qwen2VLModel

 [[autodoc]] Qwen2VLModel
--- a/docs/source/en/model_doc/sam_hq.md
+++ b/docs/source/en/model_doc/sam_hq.md
@ -1,127 +0,0 @@
-# SAM-HQ
-
-## Overview
-
-SAM-HQ (High-Quality Segment Anything Model) was proposed in [Segment Anything in High Quality](https://arxiv.org/pdf/2306.01567.pdf) by Lei Ke, Mingqiao Ye, Martin Danelljan, Yifan Liu, Yu-Wing Tai, Chi-Keung Tang, Fisher Yu.
-
-The model is an enhancement to the original SAM model that produces significantly higher quality segmentation masks while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability.
-
-![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)
-
-
-SAM-HQ introduces several key improvements over the original SAM model:
-
-1. High-Quality Output Token: A learnable token injected into SAM's mask decoder for higher quality mask prediction
-2. Global-local Feature Fusion: Combines features from different stages of the model for improved mask details
-3. Training Data: Uses a carefully curated dataset of 44K high-quality masks instead of SA-1B
-4. Efficiency: Adds only 0.5% additional parameters while significantly improving mask quality
-5. Zero-shot Capability: Maintains SAM's strong zero-shot performance while improving accuracy
-
-The abstract from the paper is the following:
-
-*The recent Segment Anything Model (SAM) represents a big leap in scaling up segmentation models, allowing for powerful zero-shot capabilities and flexible prompting. Despite being trained with 1.1 billion masks, SAM's mask prediction quality falls short in many cases, particularly when dealing with objects that have intricate structures. We propose HQ-SAM, equipping SAM with the ability to accurately segment any object, while maintaining SAM's original promptable design, efficiency, and zero-shot generalizability. Our careful design reuses and preserves the pre-trained model weights of SAM, while only introducing minimal additional parameters and computation. We design a learnable High-Quality Output Token, which is injected into SAM's mask decoder and is responsible for predicting the high-quality mask. Instead of only applying it on mask-decoder features, we first fuse them with early and final ViT features for improved mask details. To train our introduced learnable parameters, we compose a dataset of 44K fine-grained masks from several sources. HQ-SAM is only trained on the introduced dataset of 44k masks, which takes only 4 hours on 8 GPUs.*
-
-Tips:
-
- SAM-HQ produces higher quality masks than the original SAM model, particularly for objects with intricate structures and fine details
- The model predicts binary masks with more accurate boundaries and better handling of thin structures
- Like SAM, the model performs better with input 2D points and/or input bounding boxes
- You can prompt multiple points for the same image and predict a single high-quality mask
- The model maintains SAM's zero-shot generalization capabilities
- SAM-HQ only adds ~0.5% additional parameters compared to SAM
- Fine-tuning the model is not supported yet
-
-This model was contributed by [sushmanth](https://huggingface.co/sushmanth).
-The original code can be found [here](https://github.com/SysCV/SAM-HQ).
-
-Below is an example on how to run mask generation given an image and a 2D point:
-
-```python
-import torch
-from PIL import Image
-import requests
-from transformers import SamHQModel, SamHQProcessor
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
-processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
-
-img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
-raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
-input_points = [[[450, 600]]]  # 2D location of a window in the image
-
-inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
-with torch.no_grad():
-    outputs = model(**inputs)
-
-masks = processor.image_processor.post_process_masks(
-    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
-)
-scores = outputs.iou_scores
-```
-
-You can also process your own masks alongside the input images in the processor to be passed to the model:
-
-```python
-import torch
-from PIL import Image
-import requests
-from transformers import SamHQModel, SamHQProcessor
-
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
-processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
-
-img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
-raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
-mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
-segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("1")
-input_points = [[[450, 600]]]  # 2D location of a window in the image
-
-inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(device)
-with torch.no_grad():
-    outputs = model(**inputs)
-
-masks = processor.image_processor.post_process_masks(
-    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
-)
-scores = outputs.iou_scores
-```
-
-
-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM-HQ:
-
- Demo notebook for using the model (coming soon)
- Paper implementation and code: [SAM-HQ GitHub Repository](https://github.com/SysCV/SAM-HQ)
-
-## SamHQConfig
-
-[[autodoc]] SamHQConfig
-
-## SamHQVisionConfig
-
-[[autodoc]] SamHQVisionConfig
-
-## SamHQMaskDecoderConfig
-
-[[autodoc]] SamHQMaskDecoderConfig
-
-## SamHQPromptEncoderConfig
-
-[[autodoc]] SamHQPromptEncoderConfig
-
-## SamHQProcessor
-
-[[autodoc]] SamHQProcessor
-
-## SamHQVisionModel
-
-[[autodoc]] SamHQVisionModel
-
-
-## SamHQModel
-
-[[autodoc]] SamHQModel
-    - forward
--- a/docs/source/en/model_doc/swin2sr.md
+++ b/docs/source/en/model_doc/swin2sr.md
@ -50,11 +50,6 @@ A demo Space for image super-resolution with SwinSR can be found [here](https://
 [[autodoc]] Swin2SRImageProcessor
    - preprocess

-## Swin2SRImageProcessorFast
-
-[[autodoc]] Swin2SRImageProcessorFast
-    - preprocess
-
 ## Swin2SRConfig

 [[autodoc]] Swin2SRConfig
--- a/docs/source/en/model_doc/t5.md
+++ b/docs/source/en/model_doc/t5.md
@ -75,10 +75,10 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```

 </hfoption>
-<hfoption id="transformers CLI">
+<hfoption id="transformers-cli">

 ```bash
-echo -e "translate English to French: The weather is nice today." | transformers run --task text2text-generation --model google-t5/t5-base --device 0
+echo -e "translate English to French: The weather is nice today." | transformers-cli run --task text2text-generation --model google-t5/t5-base --device 0
 ```

 </hfoption>
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@ -215,10 +215,6 @@ model = VideoLlavaForConditionalGeneration.from_pretrained(

 [[autodoc]] VideoLlavaProcessor

-## VideoLlavaModel
-
-[[autodoc]] VideoLlavaModel
-
 ## VideoLlavaForConditionalGeneration

 [[autodoc]] VideoLlavaForConditionalGeneration
--- a/docs/source/en/model_doc/vipllava.md
+++ b/docs/source/en/model_doc/vipllava.md
@ -101,10 +101,6 @@ A chat between a curious human and an artificial intelligence assistant. The ass

 [[autodoc]] VipLlavaConfig

-## VipLlavaModel
-
-[[autodoc]] VipLlavaModel
-
 ## VipLlavaForConditionalGeneration

 [[autodoc]] VipLlavaForConditionalGeneration
--- a/docs/source/en/model_doc/vitmatte.md
+++ b/docs/source/en/model_doc/vitmatte.md
@ -53,11 +53,6 @@ The model expects both the image and trimap (concatenated) as input. Use [`ViTMa
 [[autodoc]] VitMatteImageProcessor
    - preprocess

-## VitMatteImageProcessorFast
-
-[[autodoc]] VitMatteImageProcessorFast
-    - preprocess
-
 ## VitMatteForImageMatting

 [[autodoc]] VitMatteForImageMatting
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -44,7 +44,7 @@ Place all inputs on the same device as the model.
 from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

 quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", quantization_config=quantization_config)

 prompt = "Hello, my llama is cute"
@ -196,7 +196,7 @@ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_m
 input_text = "Hello, my llama is cute"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

-with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
+with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
--- a/docs/source/en/perf_train_gaudi.md
+++ b/docs/source/en/perf_train_gaudi.md
@ -1,34 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Intel Gaudi
-
-The Intel Gaudi AI accelerator family includes [Intel Gaudi 1](https://habana.ai/products/gaudi/), [Intel Gaudi 2](https://habana.ai/products/gaudi2/), and [Intel Gaudi 3](https://habana.ai/products/gaudi3/). Each server is equipped with 8 devices, known as Habana Processing Units (HPUs), providing 128GB of memory on Gaudi 3, 96GB on Gaudi 2, and 32GB on the first-gen Gaudi. For more details on the underlying hardware architecture, check out the [Gaudi Architecture](https://docs.habana.ai/en/latest/Gaudi_Overview/Gaudi_Architecture.html) overview.
-
-[`TrainingArguments`], [`Trainer`] and [`Pipeline`] detect and set the backend device to `hpu` if an Intel Gaudi device is available. No additional changes are required to enable training and inference on your device.
-
-Some modeling code in Transformers is not optimized for HPU lazy mode. If you encounter any errors, set the environment variable below to use eager mode:
-```
-PT_HPU_LAZY_MODE=0
-```
-
-In some cases, you'll also need to enable int64 support to avoid casting issues with long integers:
-```
-PT_ENABLE_INT64_SUPPORT=1
-```
-Refer to the [Gaudi docs](https://docs.habana.ai/en/latest/index.html) for more details.
-
-> [!TIP]
-> For training and inference with Gaudi-optimized model implementations, we recommend using [Optimum for Intel Gaudi](https://huggingface.co/docs/optimum/main/en/habana/index).
--- a/docs/source/en/quantization/torchao.md
+++ b/docs/source/en/quantization/torchao.md
@ -40,8 +40,6 @@ torchao supports the [quantization techniques](https://github.com/pytorch/ao/blo
 - A16W4 Int4 Weight Only Quantization
 - Autoquantization

-torchao also supports module level configuration by specifying a dictionary from fully qualified name of module and its corresponding quantization config. This allows skip quantizing certain layers and using different quantization config for different modules.
-

 Check the table below to see if your hardware is compatible.

@ -91,7 +89,7 @@ We'll show examples for recommended quantization methods based on hardwares, e.g
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig
+from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, Float8WeightOnlyConfig

 quant_config = Float8DynamicActivationFloat8WeightConfig()
 # or float8 weight only quantization
@ -151,7 +149,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig

 quant_config = Int8DynamicActivationInt8WeightConfig()
 # or int8 weight only quantization
@ -181,7 +179,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import GemliteUIntXWeightOnlyConfig, Int4WeightOnlyConfig
+from torchao.quantization import GemliteUIntXWeightOnlyConfig

 # For batch size N, we recommend gemlite, which may require autotuning
 # default is 4 bit, 8 bit is also supported by passing `bit_width=8`
@ -218,7 +216,7 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 ```py
 import torch
 from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
-from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig
+from torchao.quantization import Int8DynamicActivationInt8WeightConfig, Int8WeightOnlyConfig

 quant_config = Int8DynamicActivationInt8WeightConfig()
 # quant_config = Int8WeightOnlyConfig()
@ -274,74 +272,6 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
 </hfoption>
 </hfoptions>

-### Per Module Quantization
-#### 1. Skip quantization for certain layers
-With `AOPerModuleConfig` we can specify a default configuration for all layers while skipping quantization for certain layers.
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
-
-model_id = "meta-llama/Llama-3.1-8B-Instruct"
-
-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig
-config = Int4WeightOnlyConfig(group_size=128)
-
-# set default to int4 (for linears), and skip quantizing `model.layers.0.self_attn.q_proj`
-quant_config = AOPerModuleConfig({"_default": config, "model.layers.0.self_attn.q_proj": None})
-quantization_config = TorchAoConfig(quant_type=quant_config)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-# lm_head is not quantized and model.layers.0.self_attn.q_proj is not quantized
-print("quantized model:", quantized_model)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Manual Testing
-prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
-generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
-output_text = tokenizer.batch_decode(
-    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
-```
-
-#### 2. Quantizing different layers with different quantization configs
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, TorchAoConfig
-
-model_id = "facebook/opt-125m"
-
-from torchao.quantization import Int4WeightOnlyConfig, AOPerModuleConfig, Int8DynamicActivationInt4WeightConfig, IntxWeightOnlyConfig, PerAxis, MappingType
-
-weight_dtype = torch.int8
-granularity = PerAxis(0)
-mapping_type = MappingType.ASYMMETRIC
-embedding_config = IntxWeightOnlyConfig(
-    weight_dtype=weight_dtype,
-    granularity=granularity,
-    mapping_type=mapping_type,
-)
-linear_config = Int8DynamicActivationInt4WeightConfig(group_size=128)
-quant_config = AOPerModuleConfig({"_default": linear_config, "model.decoder.embed_tokens": embedding_config, "model.decoder.embed_positions": None})
-# set `include_embedding` to True in order to include embedding in quantization
-# when `include_embedding` is True, we'll remove input embedding from `modules_not_to_convert` as well
-quantization_config = TorchAoConfig(quant_type=quant_config, include_embedding=True)
-quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cpu", torch_dtype=torch.bfloat16, quantization_config=quantization_config)
-print("quantized model:", quantized_model)
-# make sure embedding is quantized
-print("embed_tokens weight:", quantized_model.model.decoder.embed_tokens.weight)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-# Manual Testing
-prompt = "Hey, are you conscious? Can you talk to me?"
-inputs = tokenizer(prompt, return_tensors="pt").to("cpu")
-generated_ids = quantized_model.generate(**inputs, max_new_tokens=128, cache_implementation="static")
-output_text = tokenizer.batch_decode(
-    generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
-)
-print(output_text)
-```
-
 ### Autoquant

 If you want to automatically choose a quantization type for quantizable layers (`nn.Linear`) you can use the [autoquant](https://pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant) API.
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -51,7 +51,7 @@ By default, vLLM serves the native implementation and if it doesn't exist, it fa
 ```shell
 vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
-    --model-impl transformers
+    --model-impl transformers
 ```

 Add the `trust-remote-code` parameter to enable loading a remote code model.
@ -60,5 +60,5 @@ Add the `trust-remote-code` parameter to enable loading a remote code model.
 vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
-    --trust-remote-code
+    --trust-remote-code
 ```
--- a/docs/source/en/tasks/prompting.md
+++ b/docs/source/en/tasks/prompting.md
@ -78,62 +78,32 @@ Crafting a good prompt alone, also known as zero-shot prompting, may not be enou

 This section covers a few prompting techniques.

-### Few-shot prompting
+### Few-shot

-Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you’re looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance. The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
+Few-shot prompting improves accuracy and performance by including specific examples of what a model should generate given an input. The explicit examples give the model a better understanding of the task and the output format you're looking for. Try experimenting with different numbers of examples (2, 4, 8, etc.) to see how it affects performance.

-```python
+The example below provides the model with 1 example (1-shot) of the output format (a date in MM/DD/YYYY format) it should return.
+
+```py
 from transformers import pipeline
 import torch

 pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto")
 prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961.
 Date: 04/12/1961
-Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
+Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. 
 Date:"""

 outputs = pipeline(prompt, max_new_tokens=12, do_sample=True, top_k=10)
 for output in outputs:
    print(f"Result: {output['generated_text']}")
-# Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
-# Date: 04/12/1961
-# Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
-# Date: 09/28/1960
+# Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
+# Date: 04/12/1961
+# Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
+# Date: 09/28/1960
 ```

-The downside of few-shot prompting is that you need to create lengthier prompts which increases computation and latency. There is also a limit to prompt lengths. Finally, a model can learn unintended patterns from your examples, and it may not work well on complex reasoning tasks.
-
-To improve few-shot prompting for modern instruction-tuned LLMs, use a model's specific [chat template](../conversations). These models are trained on datasets with turn-based conversations between a "user" and "assistant". Structuring your prompt to align with this can improve performance.
-
-Structure your prompt as a turn-based conversation and use the [`apply_chat_template`] method to tokenize and format it.
-
-```python
-from transformers import pipeline
-import torch
-
-pipeline = pipeline(model="mistralai/Mistral-7B-Instruct-v0.1", torch_dtype=torch.bfloat16, device_map="auto")
-
-messages = [
-    {"role": "user", "content": "Text: The first human went into space and orbited the Earth on April 12, 1961."},
-    {"role": "assistant", "content": "Date: 04/12/1961"},
-    {"role": "user", "content": "Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon."}
-]
-
-prompt = pipeline.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-
-outputs = pipeline(prompt, max_new_tokens=12, do_sample=True, top_k=10)
-
-for output in outputs:
-    print(f"Result: {output['generated_text']}")
-```
-
-
-While the basic few-shot prompting approach embedded examples within a single text string, the chat template format offers the following benefits.
-
- The model may have a potentially improved understanding because it can better recognize the pattern and the expected roles of user input and assistant output.
- The model may more consistently output the desired output format because it is structured like its input during training.
-
-Always consult a specific instruction-tuned model's documentation to learn more about the format of their chat template so that you can structure your few-shot prompts accordingly.
+The downside of few-shot prompting is that you need to create lengthier prompts which increases computation and latency. There is also a limit to prompt lengths. Finally, a model can learn unintended patterns from your examples and it doesn't work well on complex reasoning tasks.

 ### Chain-of-thought

--- a/docs/source/en/torchscript.md
+++ b/docs/source/en/torchscript.md
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

 # TorchScript

-[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.
+[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.

 Transformers can export a model to TorchScript by:

--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@ -20,9 +20,9 @@ Te proporcionamos una interfaz de línea de comando (`CLI`, por sus siglas en in

 <Tip>

-Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers**) disponible en cualquier instalación de transformers >= 2.3.0.
+Desde 2.3.0, el script para convertir es parte de la CLI de transformers (**transformers-cli**) disponible en cualquier instalación de transformers >= 2.3.0.

-La siguiente documentación refleja el formato para el comando **transformers convert**.
+La siguiente documentación refleja el formato para el comando **transformers-cli convert**.

 </Tip>

@ -41,7 +41,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `BERT-Base Uncased` pr
 ```bash
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-transformers convert --model_type bert \
+transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -60,7 +60,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo `ALBERT Base` pre-entr
 ```bash
 export ALBERT_BASE_DIR=/path/to/albert/albert_base

-transformers convert --model_type albert \
+transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -75,7 +75,7 @@ Este es un ejemplo del proceso para convertir un modelo OpenAI GPT pre-entrenado
 ```bash
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-transformers convert --model_type gpt \
+transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
@ -89,7 +89,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo OpenAI GPT-2 pre-entre
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights

-transformers convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
@ -104,7 +104,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-transformers convert --model_type xlnet \
+transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -118,7 +118,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo XLM pre-entrenado:
 ```bash
 export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-transformers convert --model_type xlm \
+transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
 [--config XML_CONFIG] \
@ -132,7 +132,7 @@ Aquí hay un ejemplo del proceso para convertir un modelo T5 pre-entrenado:
 ```bash
 export T5=/path/to/t5/uncased_L-12_H-768_A-12

-transformers convert --model_type t5 \
+transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/it/add_new_model.md
+++ b/docs/source/it/add_new_model.md
@ -15,51 +15,51 @@ rendered properly in your Markdown viewer.

 # Come aggiungere un modello a 🤗 Transformers?

-Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche
-della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere
-modelli independentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo
-creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con
+Aggiungere un nuovo modello é spesso difficile e richiede una profonda conoscenza della libreria 🤗 Transformers e anche 
+della repository originale del modello. A Hugging Face cerchiamo di dare alla community sempre piú poteri per aggiungere 
+modelli indipendentemente. Quindi, per alcuni nuovi modelli che la community vuole aggiungere a 🤗 Transformers, abbiamo
+creato una specifica *call-for-model-addition* che spiega passo dopo passo come aggiungere il modello richiesto. Con 
 questo *call-for-model-addition* vogliamo insegnare a volenterosi e esperti collaboratori della community come implementare
 un modello in 🤗 Transformers.

 Se questo é qualcosa che può interessarvi, siete liberi di controllare l'attuale “calls-for-model-addition” [qui](https://github.com/huggingface/transformers/tree/main/templates/adding_a_new_model/open_model_proposals/README.md)
-e contattarci.
+e contattarci. 

 Se il modello sarà selezionato, allora potrete lavorare insieme a un membro di Hugging Face per integrare il modello in 🤗
-Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre,
+Transformers. Così facendo, ci guadagnerai in una comprensione totale, sia teorica che pratica, del modello proposto. Inoltre, 
 sarai l'artefice di un importante contributo open-source a 🤗 Transformers. Durante l'implementazione avrai l'opportunità di:

 - ottenere più comprensione delle best practices in open-source
- capire i principi di design di una della librerie NLP più popolari
+- capire i principi di design di una della librerie NLP più popolari 
 - capire come efficientemente testare complessi modelli NLP
- capire come integrare utilit Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito
+- capire come integrare utilità Python come `black`, `ruff`, `make fix-copies` in una libreria per garantire sempre di avere un codice leggibile e pulito

-Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”.
+Siamo anche contenti se vuoi aggiungere un modello che non può essere trovato nella cartella “calls-for-model-addition”. 
 Le seguenti sezioni spiegano in dettaglio come aggiungere un nuovo modello. Può anche essere molto utile controllare modelli
 già aggiunti [qui](https://github.com/huggingface/transformers/pulls?q=is%3Apr+label%3A%22PR+for+Model+Addition%22+is%3Aclosed),
-per capire se richiamano il modello che vorreste aggiungere.
+per capire se richiamano il modello che vorreste aggiungere. 

 Per cominciare, vediamo una panoramica general della libreria Transformers.

 ## Panoramica generale su 🤗 Transformers

 Prima di tutto, vediamo in generale 🤗 Transformers. 🤗 Transformers é una libreria molto strutturata, quindi
-puà essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza,
+può essere che a volte ci sia un disaccordo con alcune filosofie della libreria o scelte di design. Dalla nostra esperienza,
 tuttavia, abbiamo trovato che le scelte fondamentali di design della libreria sono cruciali per usare 🤗 Transformers efficacemente
-su larga scala, mantenendo i costi a un livello accettabile.
+su larga scala, mantenendo i costi a un livello accettabile.  

 Un buon primo punto di partenza per capire al meglio la libreria é leggere la [documentazione sulla nostra filosofia](filosofia)
 Da qui, ci sono alcune scelte sul modo di lavorare che cerchiamo di applicare a tutti i modelli:

 - La composizione é generalmente favorita sulla sovra-astrazione
 - Duplicare il codice non é sempre male, soprattutto se migliora notevolmente la leggibilità e accessibilità del modello
- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice
+- Tutti i files creati per il nuovo modello devono il piu possibile "compatti". Questo vuol dire che quando qualcuno leggerá il codice 
 di uno specifico modello, potrá vedere solo il corrispettivo file `modeling_....py` senza avere multiple dipendenze.


-La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità
-di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi
-un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno,
+La cosa piú importante, é che consideriamo la libreria non solo un mezzo per dare un prodotto, *per esempio* dare la possibilità 
+di usare BERT per inferenza, ma é anche il prodotto reale che noi vogliamo migliorare sempre più. Quindi, quando aggiungi 
+un modello, non sei solo la persona che userà il modello, ma rappresenti anche tutti coloro che leggeranno, 
 cercheranno di capire e modificare il tuo modello.

 Tenendo questi principi in mente, immergiamoci nel design generale della libreria.
@ -67,25 +67,25 @@ Tenendo questi principi in mente, immergiamoci nel design generale della libreri
 ### Panoramica sui modelli

 Per aggiungere con successo un modello, é importante capire l'interazione tra il tuo modello e la sua configurazione,
-[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers
+[`PreTrainedModel`], e [`PretrainedConfig`]. Per dare un esempio, chiameremo il modello da aggiungere a 🤗 Transformers  
 `BrandNewBert`.

 Diamo un'occhiata:

 <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers_overview.png"/>

-Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo
-assoluto.  Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita
-da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] -  semplice no?
+Come potete vedere, ci basiamo sull'ereditarietà in 🤗 Transformers, tenendo però il livello di astrazione a un minimo 
+assoluto.  Non ci sono mai più di due livelli di astrazione per ogni modello nella libreria. `BrandNewBertModel` eredita 
+da `BrandNewBertPreTrainedModel` che, a sua volta, eredita da [`PreTrainedModel`] -  semplice no? 
 Come regola generale, vogliamo essere sicuri che un nuovo modello dipenda solo da [`PreTrainedModel`]. Le funzionalità
 importanti che sono automaticamente conferite a ogni nuovo modello sono [`~PreTrainedModel.from_pretrained`]
-e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti
+e [`~PreTrainedModel.save_pretrained`], che sono usate per serializzazione e deserializzazione. Tutte le altre importanti 
 funzionalità, come ad esempio `BrandNewBertModel.forward` devono essere definite completamente nel nuovo script
-`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come
+`modeling_brand_new_bert.py`. Inoltre, vogliamo essere sicuri che un modello con uno specifico head layer, come 
 `BrandNewBertForMaskedLM` non erediti da `BrandNewBertModel`, ma piuttosto usi `BrandNewBertModel`
-come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni
-nuovo modello richieste una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre
-mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config`
+come componente che può essere chiamata nel passaggio forward per mantenere il livello di astrazione basso. Ogni 
+nuovo modello richiede una classe di configurazione, chiamata `BrandNewBertConfig`. Questa configurazione é sempre
+mantenuta come un attributo in [`PreTrainedModel`], e quindi può essere accessibile tramite l'attributo `config` 
 per tutte le classi che ereditano da `BrandNewBertPreTrainedModel`:

 ```python
@ -93,35 +93,35 @@ model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert")
 model.config  # il modello ha accesso al suo config
 ```

-Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da
-[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti -
-il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando
-[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il
+Analogamente al modello, la configurazione eredita le funzionalità base di serializzazione e deserializzazione da 
+[`PretrainedConfig`]. É da notare che la configurazione e il modello sono sempre serializzati in due formati differenti - 
+il modello é serializzato in un file *pytorch_model.bin* mentre la configurazione con *config.json*. Chiamando 
+[`~PreTrainedModel.save_pretrained`] automaticamente chiamerà [`~PretrainedConfig.save_pretrained`], cosicché sia il 
 modello che la configurazione siano salvati.


 ### Stile per il codice

-Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò
+Quando codifichi un nuovo modello, tieni presente che Transformers ha una sua struttura di fondo come libreria, perciò 
 ci sono alcuni fatti da considerare su come scrivere un codice :-)

-1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente
+1. Il forward pass del tuo modello dev'essere scritto completamente nel file del modello, mentre dev'essere indipendente 
   da altri modelli nella libreria. Se vuoi riutilizzare un blocco di codice da un altro modello, copia e incolla il codice con un commento `# Copied from` in cima al codice (guarda [qui](https://github.com/huggingface/transformers/blob/v4.17.0/src/transformers/models/roberta/modeling_roberta.py#L160)
   per un ottimo esempio).
-2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le
-   variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio
+2. Il codice dev'essere interamente comprensibile, anche da persone che non parlano in inglese. Questo significa che le 
+   variabili devono avere un nome descrittivo e bisogna evitare abbreviazioni. Per esempio, `activation` é molto meglio 
   che `act`. Le variabili con una lettera sono da evitare fortemente, almeno che non sia per un indce in un for loop.
 3. Generamente é meglio avere un codice esplicito e piú lungo che un codice corto e magico.
-4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché
-   chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points.
-5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile
+4. Evita di subclassare `nn.Sequential` in Pytorch, puoi subclassare `nn.Module` e scrivere il forward pass, cosicché 
+   chiunque può effettuare debug sul tuo codice, aggiungendo print o breaking points. 
+5. La tua function-signature dev'essere type-annoted. Per il resto, é meglio preferire variabili con un nome accettabile 
   piuttosto che annotazioni per aumentare la comprensione e leggibilità del codice.

 ### Panoramica sui tokenizers

 Questa sezione sarà creata al piu presto :-(

-## Aggiungere un modello a 🤗 Transformers passo dopo passo
+## Aggiungere un modello a 🤗 Transformers passo dopo passo 

 Ci sono differenti modi per aggiungere un modello a Hugging Face. Qui trovi una lista di blog posts da parte della community su come aggiungere un modello:

@ -141,11 +141,11 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge

 -  1. ☐ (Opzionale) Capire gli aspetti teorici del modello
 -  2. ☐ Preparare l'ambiente dev per transformers
-  3. ☐ Preparare l'ambiente debugging della repository originale
-  4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint
+-  3. ☐ Preparare l'ambiente debugging della repository originale 
+-  4. ☐ Create uno script che gestisca con successo il forward pass usando la repository originale e checkpoint 
 -  5. ☐ Aggiungere con successo lo scheletro del modello a Transformers
 -  6. ☐ Convertire i checkpoint original a Transformers checkpoint
-  7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale
+-  7. ☐ Effettuare con successo la forward pass in Transformers, di modo che dia un output identico al checkpoint originale 
 -  8. ☐ Finire i tests per il modello in Transformers
 -  9. ☐ Aggiungere con successo Tokenizer in Transformers
 -  10. ☐ Testare e provare gli integration tests da capo a fine
@ -156,22 +156,22 @@ La lista seguente é un sommario di tutto quello che é stato fatto per aggiunge

 Per cominciare di solito consigliamo `BrandNewBert`, partendo dalla teoria, di modo da avere una buona comprensione della teoria generale. TUttavia, se preferisci imparare l'aspetto teorico del modello mentre *lavori* sul modello é ok immergersi direttamente nel codice di `BrandNewBert`. Questa opzione puó essere buona se le tue skills ingegneristiche sono meglio che quelle teoriche, o se il paper `BrandNewBert` ti dá problemi, o se semplicemente ti piace programmare piú che leggere articoli scientifici.

-### 1. (Opzionale) Aspetti teorici di BrandNewBert
+### 1. (Opzionale) Aspetti teorici di BrandNewBert 

 Allora con calma, prendi un po' di tempo per leggere l'articolo su *BrandNewBert* . Sicuramente, alcune sezioni dell'articolo sono molto complesse, ma non preoccuparti! L'obiettivo non é avere una compresione immensa della teoria alla base, ma estrarre le informazioni necessarie per re-implementare con successo il modello in 🤗 Transformers. Quindi, non impazzire sugli aspetti teorici, ma piuttosto focalizzati su quelli pratici, ossia:

- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli
- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq?
- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART?
+- Che tipo di modello é *brand_new_bert*? É solo un encoder in stile BERT? O tipo decoder come GPT2? O encoder e decoder stile BART? Dai un'occhiata a [model_summary](model_summary) se non sei famigliare con le differenze tra questi modelli 
+- Quali sono le applicazioni di *brand_new_bert*? Classificazione di testo? Generazione di testo? O per tasks del genere seq2seq? 
+- Quali sono le nuove aggiunte al modello che lo rendono diverso da BERT/GPT-2/BART? 
 - Quali modelli estistenti in [🤗 Transformers models](https://huggingface.co/transformers/#contents) sono molto simili a *brand_new_bert*?
- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART?
+- Che tipo di tokenizer si usa in questo caso? Un sentencepiece tokenizer? O un word piece tokenizer? Il tokenizer é lo stesso di BERT o BART? 

-Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :)
+Una volta che senti che hai avuto una bella overview dell'architettura del modello, puoi scrivere senza problemi al team di Hugging Face per ogni domanda che tu hai. Questo puó includere domande sull'architettura del modello, o sull'attention layer, etc. Saremo molto felici di aiutarti :) 


 ### 2. Prepare il tuo ambiente

-1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub
+1. Forka la [repository](https://github.com/huggingface/transformers) cliccando sul tasto ‘Fork' nella pagina della repository. Questo crea una copia del codice nel tuo account GitHub 

 2. Clona il tuo fork `transfomers` sul tuo dico locale, e aggiungi la repository base come remota:

@ -190,7 +190,7 @@ source .env/bin/activate
 pip install -e ".[dev]"
 ```

-quindi torna alla directory principale:
+quindi torna alla directory principale: 

 ```bash
 cd ..
@ -205,7 +205,7 @@ cd ..
 5. Per trasferire *brand_new_bert* To port *brand_new_bert* avrai bisogno anche accesso alla sua repository originale:

 ```bash
-git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git
+git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git 
 cd brand_new_bert
 pip install -e .
 ```
@ -213,16 +213,16 @@ pip install -e .
 Ok, ora hai un ambiente di sviluppo per portare *brand_new_bert* in 🤗 Transformers.


-### 3.-4. Provare un pretrained checkpoint usando la repo originale
+### 3.-4. Provare un pretrained checkpoint usando la repo originale 

-Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**.
+Per cominciare, comincerai a lavorare sulla repo originale di *brand_new_bert*. Come spesso accade, l'implementazione originale é molto sullo stile "ricerca". Questo significa che a volte la documentazione non é al top, magari manca qualche cosa e il codice puó essere difficile da capire. Tuttavia, questa é e dev'essere la motivazione per reimplementare *brand_new_bert*. In Hugging Face, uno degli obiettivi principali é di *mettere le persone sulle spalle dei giganti*, il che si traduce, in questo contesto, di prendere un modello funzionante e riscriverlo e renderlo il piú possibile **accessibile, user-friendly, e leggibile**. Questa é la top motivazione per re-implementare modelli in 🤗 Transformers - cercare di creare nuove complesse tecnologie NLP accessibili a **chiunque**. 

 Riuscire a far girare il modello pretrained originale dalla repository ufficiale é spesso il passo **piu arduo**. Dalla nostra esperienza, é molto importante spendere un p' di tempo per diventare familiari con il codice base originale. Come test, prova a capire i seguenti punti:

- Dove si trovano i pretrained weights?
- Come caricare i pretrained weights nel modello corrispondente?
- Come girare un tokenizer independentemente dal modello?
- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta
+- Dove si trovano i pretrained weights? 
+- Come caricare i pretrained weights nel modello corrispondente? 
+- Come girare un tokenizer independentemente dal modello? 
+- Prova a tracciare un singolo forward pass, cosicché potrai sapere che classi e funzioni sono richieste per un semplice forward pass. Di solito, dovrai reimplementare queste funzioni e basta 
 - Prova a localizzare i componenti importanti del modello: Dove si trova la classe del modello? Ci sono sotto classi nel modello *per esempio* EngoderModel, DecoderMOdel? Dove si trova il self-attention layer? Ci sono molteplici differenti layer di attention, *per esempio * *self-attention*, *cross-attention*...?
 - Come puoi fare debug sul modello nell'ambiente originale della repo? Devi aggiungere dei *print* o puoi usare *ipdb* come debugger interattivo, o vabene anche un IDE efficiente per debug come PyCharm?

@ -230,14 +230,14 @@ Riuscire a far girare il modello pretrained originale dalla repository ufficiale

 A questo punto, sta a te decidere quale ambiente per debug vuoi usare. Noi consilgiamo di evitare setup con GPU, che potrebbero costare assai, lavorare su una CPU puó essere un ottimo punto di partenza per indagare la repository originale e per cominciare a scrivere il codice per 🤗 Transformers. Solo alla fine, quando il modello é stato portato con successo in  🤗 Transformers, allora si potrá verificare il suo funzionamento su GPU.

-In generale ci sono due possibili ambienti di debug per il testare il modello originale:
+In generale ci sono due possibili ambienti di debug per il testare il modello originale: 

 - [Jupyter notebooks](https://jupyter.org/) / [google colab](https://colab.research.google.com/notebooks/intro.ipynb)
- Scripts locali in Python
+- Scripts locali in Python 

 Il vantaggio dei Jupyter notebooks é la possibilità di eseguire cella per cella, il che può essere utile per decomporre tutte le componenti logiche, cosi da a vere un ciclo di debug più rapido, siccome si possono salvare i risultati da steps intermedi. Inoltre, i notebooks spesso sono molto facili da condividere con altri contributors, il che può essere molto utile se vuoi chiedere aiuto al team di Hugging Face. Se sei famigliare con Jupyter notebooks allora racommandiamo di lavorare in questa maniera.

-Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`.
+Ovviamente se non siete abituati a lavorare con i notebook, questo può essere uno svantaggio nell'usare questa tecnologia, sprecando un sacco di tempo per setup e portare tutto al nuovo ambiente, siccome non potreste neanche usare dei tools di debug come `ipdb`. 

 Per ogni pratica code-base, é sempre meglio come primo step caricare un **piccolo** checkpoint pretrained e cercare di riprodurre un singolo forward pass usando un vettore fittizio di IDs fatti da numeri interi. Un esempio per uno script simile, in pseudocodice é:

@ -249,42 +249,42 @@ original_output = model.predict(input_ids)

 Per quanto riguarda la strategia di debugging, si può scegliere tra:

- Decomporre il modello originario in piccole componenenti e testare ognuna di esse
- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi,
+- Decomporre il modello originario in piccole componenti e testare ognuna di esse 
+- Decomporre il modello originario nel *tokenizer* originale e nel *modello* originale, testare un forward pass su questi, 
 e usare dei print statement o breakpoints intermedi per verificare

-Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu
+Ancora una volta, siete liberi di scegliere quale strategia sia ottimale per voi. Spesso una strategia é piu 
 vantaggiosa di un'altra, ma tutto dipende dal code-base originario.

-Se il code-base vi permette di decomporre il modello in piccole sub-componenenti, *per esempio* se il code-base
-originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere.
-Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito:
+Se il code-base vi permette di decomporre il modello in piccole sub-componenti, *per esempio* se il code-base 
+originario può essere facilmente testato in eager mode, allora vale la pena effettuare un debugging di questo genere. 
+Ricordate che ci sono dei vantaggi nel decidere di prendere la strada piu impegnativa sin da subito: 

 - negli stage piu finali, quando bisognerà comparare il modello originario all'implementazione in Hugging Face, potrete verificare
 automaticamente ogni componente, individualmente, di modo che ci sia una corrispondenza 1:1
 - avrete l'opportunità di decomporre un problema molto grande in piccoli passi, così da strutturare meglio il vostro lavoro
- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore
-comprensione del modello stesso
+- separare il modello in componenti logiche vi aiuterà ad avere un'ottima overview sul design del modello, quindi una migliore 
+comprensione del modello stesso 
 - verso gli stage finali i test fatti componente per componente vi aiuterà ad essere sicuri di non andare avanti e indietro
 nell'implementazione, così da continuare la modifica del codice senza interruzione

-Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed)
+Un ottimo esempio di come questo può essere fatto é dato da [Lysandre](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) 
 per il modello ELECTRA

-Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo in tramite
-compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti.
-Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria
-é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare
+Tuttavia, se il code-base originale é molto complesso o le componenti intermedie possono essere testate solo tramite 
+compilazione, potrebbe richiedere parecchio tempo o addirittura essere impossibile separare il modello in piccole sotto-componenti. 
+Un buon esempio é [MeshTensorFlow di T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow). Questa libreria 
+é molto complessa e non offre un metodo semplice di decomposizione in sotto-componenti. Per simili librerie, potrete fare 
 affidamento ai print statements.

-In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal
-primo layer al layer finale.
+In ogni caso, indipendentemente da quale strategia scegliete, la procedura raccomandata é di cominciare a fare debug dal 
+primo layer al layer finale. 
 É consigliato recuperare gli output dai layers, tramite print o sotto-componenti, nel seguente ordine:

 1. Recuperare gli IDs di input dati al modello
 2. Recuperare i word embeddings
-3. Recuperare l'input del primo Transformer layer
-4. Recuperare l'output del primo Transformer layer
+3. Recuperare l'input del primo Transformer layer 
+4. Recuperare l'output del primo Transformer layer 
 5. Recuperare l'output dei seguenti `n - 1` Transformer layers
 6. Recuperare l'output dell'intero BrandNewBert Model

@ -303,36 +303,36 @@ Gli output dei seguenti layer di solito dovrebbero essere degli array di float m
 [-0.5334, -0.6403,  0.4271,  ..., -0.3339,  0.6533,  0.8694]]],
 ```

-Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo
-significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione
-di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente
-diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque,
-é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del
+Ci aspettiamo che ogni modello aggiunto a 🤗 Transformers passi con successo un paio di test d'integrazione. Questo 
+significa che il modello originale e la sua implementazione in 🤗 Transformers abbiano lo stesso output con una precisione 
+di 0.001! Siccome é normale che lo stesso esatto modello, scritto in librerie diverse, possa dare output leggermente 
+diversi, la tolleranza accettata é 1e-3 (0.001). Ricordate che i due modelli devono dare output quasi identici. Dunque, 
+é molto conveniente comparare gli output intermedi di 🤗 Transformers molteplici volte con gli output intermedi del 
 modello originale di *brand_new_bert*. Di seguito vi diamo alcuni consigli per avere un ambiente di debug il piu efficiente
 possibile:

 - Trovate la migliore strategia per fare debug dei risultati intermedi. Per esempio, é la repository originale scritta in PyTorch?
-Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il
-modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale
-é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print)
-per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit**
-quanto testate il foward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196).
- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro
-ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi.
+Se si, molto probabilmente dovrete dedicare un po' di tempo per scrivere degli script piu lunghi, così da decomporre il 
+modello originale in piccole sotto-componenti, in modo da poter recuperare i valori intermedi. Oppure, la repo originale 
+é scritta in Tensorflow 1? Se é così dovrete fare affidamento ai print di Tensorflow [tf.print](https://www.tensorflow.org/api_docs/python/tf/print) 
+per avere i valori intermedi. Altro caso, la repo é scritta in Jax? Allora assicuratevi che il modello non sia in **jit** 
+quando testate il forward pass, *per esempio* controllate [questo link](https://github.com/google/jax/issues/196). 
+- Usate i più piccoli pretrained checkpoint che potete trovare. Piu piccolo é il checkpoint, piu velocemente sarà il vostro 
+ciclo di debug. Non é efficiente avere un pretrained model così gigante che per il forward pass impieghi piu di 10 secondi. 
 Nel caso in cui i checkpoints siano molto grandi, e non si possa trovare di meglio, allora é buona consuetudine ricorrere
-a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comprare la versione 🤗 Transformers
+a fare un dummy model nel nuovo ambiente, con weights inizializzati random e salvare quei weights per comparare la versione 🤗 Transformers 
 con il vostro modello
- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare
-la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata
-`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici
+- Accertatevi di usare la via piu semplice per chiamare il forward pass nella repo originale. Sarebbe opportuno trovare 
+la funzione originaria che chiami **solo** un singolo forward pass, *per esempio* questa funzione spesso viene chiamata 
+`predict`, `evaluate`, `forward` o `__call__`. Siate sicuri di non fare debug su una funzione che chiami `forward` molteplici 
 volte, *per esempio* per generare testo, come `autoregressive_sample`, `generate`.
- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare
-come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il
-debug da questo punto. Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare
-gli input al modello, anziche delle stringhe in input.
- Assicuratevi che il debugging **non** sia in training mode. Spesso questo potra il modello a dare degli output random, per
-via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicche
-i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione
+- Cercate di separare la tokenization dal forward pass del modello. Se la repo originaria mostra esempio dove potete dare 
+come input una stringa, provate a cercare dove nella forward call la stringa viene cambiata in input ids e cominciate il 
+debug da questo punto. Questo vi garantisce un ottimo punto di partenza per scrivere un piccolo script personale dove dare 
+gli input al modello, anziche delle stringhe in input. 
+- Assicuratevi che il debugging **non** sia in training mode. Spesso questo porta il modello a dare degli output random, per 
+via dei molteplici dropout layers. Assicuratevi che il forward pass nell'ambiente di debug sia **deterministico**, cosicché 
+i dropout non siano usati. Alternativamente, potete usare *transformers.utils.set_seed* se la vecchia e nuova implementazione 
 sono nello stesso framework.

 La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare tutto questo per *brand_new_bert*.
@ -343,7 +343,7 @@ La seguente sezione vi da ulteriori dettagli e accorgimenti su come potete fare
 Allora cominciamo ad aggiungere un nuovo codice in 🤗 Transformers. Andate nel vostro fork clone di 🤗 Transformers:


-```bash
+```bash 
 cd transformers
 ```

@ -355,52 +355,52 @@ Se questo non é il caso, cominciamo con il generare un nuovo modello. Ti consig
 un modello esistente:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 Ti verrà richiesto con un questionario di compilare le informazioni di base del tuo modello.

 **Aprire una Pull Request in main huggingface/transformers repo**

-Prime di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)",
+Prima di cominciare ad adattare il codice automaticamente generato, aprite una nuova PR come "Work in progress (WIP)", 
 *per esempio* "[WIP] Aggiungere *brand_new_bert*", cosicché il team di Hugging Face possa lavorare al vostro fianco nell'
 integrare il modello in 🤗 Transformers.

 Questi sarebbero gli step generali da seguire:

-1. Creare un branch dal main branch con un nome descrittivo
+1. Creare un branch dal main branch con un nome descrittivo 

-```bash
-git checkout -b add_brand_new_bert
+```bash 
+git checkout -b add_brand_new_bert 
 ```

-2. Commit del codice automaticamente generato
+2. Commit del codice automaticamente generato 

-```bash
-git add .
-git commit
+```bash 
+git add . 
+git commit 
 ```

 3. Fare fetch e rebase del main esistente

-```bash
-git fetch upstream
-git rebase upstream/main
+```bash 
+git fetch upstream 
+git rebase upstream/main 
 ```

-4. Push dei cambiamenti al proprio account:
+4. Push dei cambiamenti al proprio account: 

 ```bash
 git push -u origin a-descriptive-name-for-my-changes
 ```

-5. Una volte che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request".
-Assiuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riguardo alla destra della pagina della PR, cosicche il team
-Hugging Face verrà notificato anche per i futuri cambiamenti.
+5. Una volta che siete soddisfatti dei nuovi cambiamenti, andate sulla webpage del vostro fork su GitHub. Cliccate "Pull request". 
+Assicuratevi di aggiungere alcuni membri di Hugging Face come reviewers, nel riquadro alla destra della pagina della PR, cosicché il team 
+Hugging Face verrà notificato anche per i futuri cambiamenti. 

 6. Cambiare la PR a draft, cliccando su "Convert to draft" alla destra della pagina della PR

-Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre,
+Da quel punto in poi, ricordate di fare commit di ogni progresso e cambiamento, cosicche venga mostrato nella PR. Inoltre, 
 ricordatevi di tenere aggiornato il vostro lavoro con il main esistente:

 ```bash
@ -408,39 +408,39 @@ git fetch upstream
 git merge upstream/main
 ```

-In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR
-e discusse/risolte nella PR stessa. In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit
-di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento
-nella domanda, cosicche il team potra facilmente capire il problema o la domanda.
+In generale, tutte le domande che avrete riguardo al modello o l'implementazione dovranno essere fatte nella vostra PR 
+e discusse/risolte nella PR stessa. In questa maniera, il team di Hugging Face sarà sempre notificato quando farete commit 
+di un nuovo codice o se avrete qualche domanda. É molto utile indicare al team di Hugging Face il codice a cui fate riferimento 
+nella domanda, cosicche il team potra facilmente capire il problema o la domanda. 

-Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea
-dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema
+Per fare questo andate sulla tab "Files changed", dove potrete vedere tutti i vostri cambiamenti al codice, andate sulla linea 
+dove volete chiedere una domanda, e cliccate sul simbolo "+" per aggiungere un commento. Ogni volta che una domanda o problema 
 é stato risolto, cliccate sul bottone "Resolve".

-In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più
-domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi
+In questa stessa maniera, Hugging Face aprirà domande o commenti nel rivedere il vostro codice. Mi raccomando, chiedete più 
+domande possibili nella pagina della vostra PR. Se avete domande molto generali, non molto utili per il pubblico, siete liberi 
 di chiedere al team Hugging Face direttamente su slack o email.


 **5. Adattare i codici per brand_new_bert**

-Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relative dovrebbe trovarsi in
+Per prima cosa, ci focalizzeremo sul modello e non sui tokenizer. Tutto il codice relativo dovrebbe trovarsi in  
 `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` e
 `src/transformers/models/brand_new_bert/configuration_brand_new_bert.py`.

-Ora potete finalmente cominciare il codice :). Il codice generato in
-`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un
-modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo
-agli aspetti teorici del modello: *In che maniera il modello che sto implmementando é diverso da BERT o BART?*. Implementare
-questi cambi  spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via...
-Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere
-un'idea migliore su come implementare il modello.
+Ora potete finalmente cominciare il codice :). Il codice generato in 
+`src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` avrà sia la stessa architettura di BERT se é un 
+modello encoder-only o BART se é encoder-decoder. A questo punto, ricordatevi cio che avete imparato all'inizio, riguardo 
+agli aspetti teorici del modello: *In che maniera il modello che sto implementando é diverso da BERT o BART?*. Implementare 
+questi cambi  spesso vuol dire cambiare il layer *self-attention*, l'ordine dei layer di normalizzazione e così via... 
+Ancora una volta ripetiamo, é molto utile vedere architetture simili di modelli gia esistenti in Transformers per avere 
+un'idea migliore su come implementare il modello. 

-**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un
-codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`
-fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza
-del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente
-instanza:
+**Notate** che a questo punto non dovete avere subito un codice tutto corretto o pulito. Piuttosto, é consigliato cominciare con un 
+codice poco pulito, con copia-incolla del codice originale in `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py` 
+fino a che non avrete tutto il codice necessario. In base alla nostra esperienza, é molto meglio aggiungere una prima bozza 
+del codice richiesto e poi correggere e migliorare iterativamente. L'unica cosa essenziale che deve funzionare qui é la seguente 
+istanza: 

 ```python
 from transformers import BrandNewBertModel, BrandNewBertConfig
@ -448,23 +448,23 @@ from transformers import BrandNewBertModel, BrandNewBertConfig
 model = BrandNewBertModel(BrandNewBertConfig())
 ```

-Questo comando creerà un modello con i parametri di default definiti in `BrandNewBergConfig()` e weights random. Questo garantisce
+Questo comando creerà un modello con i parametri di default definiti in `BrandNewBertConfig()` e weights random. Questo garantisce 
 che `init()` di tutte le componenti funzioni correttamente.


 **6. Scrivere uno script di conversione**

-Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella
-repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere
+Il prossimo step é scrivere uno script per convertire il checkpoint che avete usato per fare debug su *brand_new_berts* nella 
+repo originale in un checkpoint per la nuova implementazione di *brand_new_bert* in 🤗 Transformers. Non é consigliato scrivere 
 lo script di conversione da zero, ma piuttosto cercate e guardate script gia esistenti in 🤗 Transformers, così da trovarne
-uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso.
+uno simile al vostro modello. Di solito basta fare una copia di uno script gia esistente e adattarlo al vostro caso. 
 Non esitate a chiedere al team di Hugging Face a riguardo.

 - Se state convertendo un modello da TensorFlow a PyTorch, un ottimo inizio é vedere [questo script di conversione per BERT](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91)
 - Se state convertendo un modello da PyTorch a PyTorch, [lo script di conversione di BART può esservi utile](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py)

-Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch,
-il nomde del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch,
+Qui di seguito spiegheremo come i modelli PyTorch salvano i weights per ogni layer e come i nomi dei layer sono definiti. In PyTorch, 
+il nome del layer é definito dal nome della class attribute che date al layer. Definiamo un modello dummy in PyTorch, 
 chiamato `SimpleModel`:

 ```python
@ -497,7 +497,7 @@ SimpleModel(
 )
 ```

-Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno
+Si può vedere come i nomi dei layers siano definiti dal nome della class attribute in PyTorch. I valori dei weights di uno 
 specifico layer possono essere visualizzati:


@ -530,7 +530,7 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,
          0.2220,  0.2358]]).
 ```

-Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente
+Nello script di conversione, dovreste riempire quei valori di inizializzazione random con gli stessi weights del corrispondente 
 layer nel checkpoint. *Per esempio*

 ```python
@ -544,8 +544,8 @@ model_pointer = getattr(model, "dense")
 model_pointer.weight.data = torch.from_numpy(pretrained_weight)
 ```

-Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint
-siano esattamente gli stessi e uguali in **dimensione/shape e nome**. Per fare questo, é **necessario** aggiungere un `assert`
+Così facendo, dovete verificare che ogni inizializzazione random di un peso del modello PyTorch e il suo corrispondente peso nel pretrained checkpoint 
+siano esattamente gli stessi e uguali in **dimensione/shape e nome**. Per fare questo, é **necessario** aggiungere un `assert` 
 per la dimensione/shape e nome:

 ```python
@ -560,19 +560,19 @@ Inoltre, dovrete fare il print sia dei nomi che dei weights per essere sicuri ch
 logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
 ```

-Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di
+Se la dimensione o il nome non sono uguali, probabilmente avete sbagliato ad assegnare il peso nel checkpoint o nel layer costrutture di 
 🤗 Transformers.

-Una dimensione sbagliata può essere dovuta ad un errore nei parameteri in `BrandNewBertConfig()`. Tuttavia, può essere anche
-che l'implementazione del layer in PyTorch richieda di fare una transposizione della matrice dei weights.
+Una dimensione sbagliata può essere dovuta ad un errore nei parametri in `BrandNewBertConfig()`. Tuttavia, può essere anche 
+che l'implementazione del layer in PyTorch richieda di fare una trasposizione della matrice dei weights. 

-Infine, controllate **tutti** che tutti i weights inizializzati e fate print di tutti i weights del checkpoint che non sono stati
-usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano
-errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers,
-o un bug in `init()`.
+Infine, controllate che **tutti** i weights siano inizializzati e fate print di tutti i weights del checkpoint che non sono stati 
+usati per l'inizializzazione, di modo da essere sicuri che il modello sia correttamente convertito. É normale che ci siano 
+errori nel test di conversione, fai per un errore in `BrandNewBertConfig()`, o un errore nell'architettura in 🤗 Transformers, 
+o un bug in `init()`. 

-Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che
-il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta
+Questo step dev'essere fatto tramite iterazioni fino a che non si raggiungano gli stessi valori per i weights. Una volta che 
+il checkpoint é stato correttamente caricato in 🤗 Transformers, potete salvare il modello in una cartella di vostra scelta 
 `/path/to/converted/checkpoint/folder` che contenga sia
 `pytorch_model.bin` che `config.json`:

@ -583,9 +583,9 @@ model.save_pretrained("/path/to/converted/checkpoint/folder")

 **7. Implementare il forward pass**

-Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass
+Una volta che i weights pretrained sono stati correttamente caricati in 🤗 Transformers, dovrete assicurarvi che il forward pass 
 sia correttamente implementato. [Qui](#3-4-provare-un-pretrained-checkpoint-usando-la-repo-originale), avete già creato e provato
-uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo
+uno script che testi il forward pass del modello usando la repo originaria. Ora dovrete fare lo stesso con uno script analogo 
 usando l'implementazione in 🤗 Transformers anziché l'originale. Piu o meno lo script dovrebbe essere:

 ```python
@ -594,27 +594,27 @@ input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
 output = model(input_ids).last_hidden_states
 ```

-Di solito l'output da 🤗 Transformers non é uguale uguale all'output originario, sopratto la prima volta. Non vi abbattete -
-é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass.
-Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`.
+Di solito l'output da 🤗 Transformers non é esattamente uguale all'output originario, soprattutto la prima volta. Non vi abbattete - 
+é normale! Prima di tutto assicuratevi che non ci siano errori o che non vengano segnalati degli errori nella forward pass. 
+Spesso capita che ci siano dimensioni sbagliate o data type sbagliati, *ad esempio* `torch.long` anziche `torch.float32`. 
 Non esitate a chiedere al team Hugging Face!

-Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output
-siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione
-originaria. Poi, controllate che i valori in output siano identici. Questa é sicuramente la parte più difficile, qui una serie
+Nella parte finale assicuratevi che l'implementazione 🤗 Transformers funzioni correttamente cosi da testare che gli output 
+siano equivalenti a una precisione di `1e-3`. Controllate che `outputs.shape` siano le stesse tra 🤗 Transformers e l'implementazione 
+originaria. Poi, controllate che i valori in output siano identici. Questa é sicuramente la parte più difficile, qui una serie 
 di errori comuni quando gli output non sono uguali:

- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione
- La matrice del word embedding non é stata ripareggiata
- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset
- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che
+- Alcuni layers non sono stati aggiunti, *ad esempio* un *activation* layer non é stato aggiunto, o ci si é scordati di una connessione 
+- La matrice del word embedding non é stata ripareggiata 
+- Ci sono degli embeddings posizionali sbagliati perché l'implementazione originaria ha un offset 
+- Il dropout é in azione durante il forward pass. Per sistemare questo errore controllate che *model.training = False* e che 
 il dropout non sia stato attivato nel forward pass, *per esempio* passate *self.training* a [PyTorch's functional dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout)

-La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers
-fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le
-implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra
-i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino
-all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto.
+La miglior maniera per sistemare il problema é di vedere all'implementazione originaria del forward pass e in 🤗 Transformers 
+fianco a fianco e vedere se ci sono delle differenze. In teoria, con debug e print degli output intermedie di entrambe le 
+implementazioni nel forward pass nell'esatta posizione del network dovrebbe aiutarvi a vedere dove ci sono differenze tra 
+i due frameworks. Come prima mossa controllate che `input_ids` siano identici in entrambi gli scripts. Da lì andate fino 
+all'ultimo layer. Potrete notare una differenza tra le due implementazioni a quel punto. 

 Una volta che lo stesso output é stato raggiunto, verificate gli output con `torch.allclose(original_output, output, atol=1e-3)`.
 A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una passeggiata 😊.
@ -622,9 +622,9 @@ A questo punto se é tutto a posto: complimenti! Le parti seguenti saranno una p

 **8. Aggiungere i test necessari per il modello**

-A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia
+A questo punto avete aggiunto con successo il vostro nuovo modello. Tuttavia, é molto probabile che il modello non sia 
 del tutto ok con il design richiesto. Per essere sicuri che l'implementazione sia consona e compatibile con 🤗 Transformers é
-necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello,
+necessario implementare dei tests. Il Cookiecutter dovrebbe fornire automaticamente dei file per test per il vostro modello, 
 di solito nella folder `tests/test_modeling_brand_new_bert.py`. Provate questo per verificare l'ok nei test piu comuni:

 ```bash
@ -636,8 +636,8 @@ Una volta sistemati i test comuni, bisogna assicurarsi che il vostro lavoro sia
 - a) La community puo capire in maniera semplice il vostro lavoro controllando tests specifici del modello *brand_new_bert*,
 - b) Implementazioni future del vostro modello non rompano alcune feature importante del modello.

-Per prima cosa agguingete dei test d'integrazione. Questi sono essenziali perche fanno la stessa funzione degli scripts di
-debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`,
+Per prima cosa aggiungete dei test d'integrazione. Questi sono essenziali perché fanno la stessa funzione degli scripts di 
+debug usati precedentemente. Un template per questi tests esiste gia nel Cookiecutter ed é sotto il nome di `BrandNewBertModelIntegrationTests`, 
 voi dovrete solo completarlo. Una volta che questi tests sono OK, provate:

 ```bash
@ -650,7 +650,7 @@ Nel caso siate su Windows, sostituite `RUN_SLOW=1` con `SET RUN_SLOW=1`

 </Tip>

-Di seguito, tutte le features che sono utili e necessarire per *brand_new_bert* devono essere testate in test separati,
+Di seguito, tutte le features che sono utili e necessarie per *brand_new_bert* devono essere testate in test separati, 
 contenuti in `BrandNewBertModelTester`/ `BrandNewBertModelTest`. spesso la gente si scorda questi test, ma ricordate che sono utili per:


@ -664,7 +664,7 @@ A questo punto avremo bisogno un tokenizer per *brand_new_bert*. Di solito il to

 É importante che troviate il file con il tokenizer originale e che lo carichiate in 🤗 Transformers.

-Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input
+Per controllare che il tokenizer funzioni in modo corretto, create uno script nella repo originaria che riceva come input 
 una stringa e ritorni gli `input_ids`. Piu o meno questo potrebbe essere il codice:

 ```python
@ -673,8 +673,8 @@ model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/")
 input_ids = model.tokenize(input_str)
 ```

-Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer.
-A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`.
+Potrebbe richiedere un po' di tempo, ma guardate ancora alla repo originaria per trovare la funzione corretta del tokenizer. 
+A volte capita di dover riscrivere il tokenizer nella repo originaria, di modo da avere come output gli `input_ids`. 
 A quel punto uno script analogo é necessario in 🤗 Transformers:

 ```python
@ -687,7 +687,7 @@ tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/")
 input_ids = tokenizer(input_str).input_ids
 ```

-Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer.
+Una volta che `input_ids` sono uguali, bisogna aggiungere un test per il tokenizer. 

 Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-coded test d'integrazione.

@ -696,22 +696,22 @@ Il file test per tokenizer di *brand_new_brand* dovrebbe avere un paio di hard-c

 Ora che avete il tokenizer, dovrete aggiungere dei test d'integrazione per l'intero workflow in `tests/test_modeling_brand_new_bert.py` in 🤗 Transformer.
 Questi test devono mostrare che un significante campione text-to-text funzioni come ci si aspetta nell'implementazione di  🤗 Transformers.
-*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via.
-Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti.
-Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU.
+*Per esempio* potreste usare dei source-to-target-translation, o un sommario di un articolo, o un domanda-risposta e cosi via. 
+Se nessuno dei checkpoints é stato ultra parametrizzato per task simili, allora i tests per il modello sono piu che sufficienti. 
+Nello step finale dovete assicurarvi che il modello sia totalmente funzionale, e consigliamo anche di provare a testare su GPU. 
 Puo succedere che ci si scordi un `.to(self.device)` ad esempio. Se non avete accesso a GPU, il team Hugging Face puo provvedere
-a testare questo aspetto per voi.
+a testare questo aspetto per voi. 

 **11. Aggiungere una Docstring**

-Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già
-un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà
-per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto
-utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face
-riguardo alle docstirng.
+Siete quasi alla fine! L'ultima cosa rimasta é avere una bella docstring e una pagina doc. Il Cookiecutter dovrebbe provvedere già 
+un template chiamato `docs/source/model_doc/brand_new_bert.rst`, che dovrete compilare. La prima cosa che un utente farà 
+per usare il vostro modello sarà dare una bella lettura al doc. Quindi proponete una documentazione chiara e concisa. É molto 
+utile per la community avere anche delle *Tips* per mostrare come il modello puo' essere usato. Non esitate a chiedere a Hugging Face 
+riguardo alle docstring. 

-Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`.
-Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. Abbiamo una guida dettagliata per
+Quindi, assicuratevi che la docstring sia stata aggiunta a `src/transformers/models/brand_new_bert/modeling_brand_new_bert.py`. 
+Assicuratevi che la docstring sia corretta e che includa tutti i necessari input e output. Abbiamo una guida dettagliata per 
 scrivere la documentazione e docstring.


@ -729,8 +729,8 @@ E che il codice passi i quality check:
 make quality
 ```

-A volte capita che manchino delle informazioninella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra.
-Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi.
+A volte capita che manchino delle informazioni nella docstring o alcuni nomi sbagliati, questo farà fallire i tests sopra. 
+Ripetiamo: chiedete pure a Hugging Face, saremo lieti di aiutarvi. 

 Per ultimo, fare del refactoring del codice una volta che é stato creato.

@ -738,10 +738,10 @@ Avete finito con il codice, congratulazioni! 🎉 Siete fantasticiiiiiii! 😎

 **12. Caricare il modello sul model hub**

-In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una
-model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per
-avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok
-per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*.
+In questa ultima parte dovrete convertire e caricare il modello, con tutti i checkpoints, nel model hub e aggiungere una 
+model card per ogni checkpoint caricato. Leggete la nostra guida [Model sharing and uploading Page](model_sharing) per 
+avere familiarità con l'hub. Di solito in questa parte lavorate a fianco di Hugging face per decidere un nome che sia ok 
+per ogni checkpoint, per ottenere i permessi necessari per caricare il modello nell'organizzazione dell'autore di *brand_new_bert*. 
 Il metodo `push_to_hub`, presente in tutti i modelli `transformers`, é una maniera rapida e indolore per caricare il vostro checkpoint sull'hub:

 ```python
@ -754,27 +754,27 @@ brand_new_bert.push_to_hub(
 )
 ```

-Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero
-suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned.
+Vale la pena spendere un po' di tempo per creare una model card ad-hoc per ogni checkpoint. Le model cards dovrebbero 
+suggerire le caratteristiche specifiche del checkpoint, *per esempio* su che dataset il checkpoint é stato pretrained o fine-tuned. 
 O che su che genere di task il modello lavoro? E anche buona pratica includere del codice su come usare il modello correttamente.


 **13. (Opzionale) Aggiungere un notebook**

-É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o
+É molto utile aggiungere un notebook, che dimostri in dettaglio come *brand_new_bert* si utilizzi per fare inferenza e/o 
 fine-tuned su specifiche task. Non é una cosa obbligatoria da avere nella vostra PR, ma é molto utile per la community.

 **14. Sottomettere la PR**

-L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato,
+L'ultimissimo step! Ovvero il merge della PR nel main. Di solito il team Hugging face a questo punto vi avrà gia aiutato, 
 ma é ok prendere un po' di tempo per pulire la descirzione e commenti nel codice.


 ### Condividete il vostro lavoro!!

-É ora tempo di prendere un po' di credito dalla communità per il vostro lavoro! Caricare e implementare un nuovo modello
-é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara
-sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro
-traguardo con l'intera community :)
+É ora tempo di prendere un po' di credito dalla comunità per il vostro lavoro! Caricare e implementare un nuovo modello 
+é un grandissimo contributo per Transformers e l'intera community NLP. Il codice e la conversione dei modelli pre-trained sara 
+sicuramente utilizzato da centinaia o migliaia di sviluppatori e ricercatori. Siate fieri e orgogliosi di condividere il vostro 
+traguardo con l'intera community :) 

 ** Avete create un altro modello che é super facile da usare per tutti quanti nella community! 🤯**
--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@ -18,10 +18,10 @@ in modelli che possono essere caricati utilizzando i metodi `from_pretrained` de

 <Tip>

-A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers**), disponibile in ogni installazione
+A partire dalla versione 2.3.0 lo script di conversione è parte di transformers CLI (**transformers-cli**), disponibile in ogni installazione
 di transformers >=2.3.0.

-La seguente documentazione riflette il formato dei comandi di **transformers convert**.
+La seguente documentazione riflette il formato dei comandi di **transformers-cli convert**.

 </Tip>

@ -49,7 +49,7 @@ Questo è un esempio del processo di conversione per un modello `BERT-Base Uncas

 ```bash
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
-transformers convert --model_type bert \
+transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -70,7 +70,7 @@ Ecco un esempio del procedimento di conversione di un modello `ALBERT Base` pre-

 ```bash
 export ALBERT_BASE_DIR=/path/to/albert/albert_base
-transformers convert --model_type albert \
+transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -84,7 +84,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT pre-allenat
 sia salvato nello stesso formato dei modelli pre-allenati OpenAI (vedi [qui](https://github.com/openai/finetune-transformer-lm)):
 ```bash
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
-transformers convert --model_type gpt \
+transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
@ -97,7 +97,7 @@ Ecco un esempio del processo di conversione di un modello OpenAI GPT-2 pre-allen

 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights
-transformers convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
@ -111,7 +111,7 @@ Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
 ```bash
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config
-transformers convert --model_type xlnet \
+transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -124,7 +124,7 @@ Ecco un esempio del processo di conversione di un modello XLM pre-allenato:

 ```bash
 export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
-transformers convert --model_type xlm \
+transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
 [--config XML_CONFIG] \
@ -137,7 +137,7 @@ Ecco un esempio del processo di conversione di un modello T5 pre-allenato:

 ```bash
 export T5=/path/to/t5/uncased_L-12_H-768_A-12
-transformers convert --model_type t5 \
+transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/it/perf_train_cpu.md
+++ b/docs/source/it/perf_train_cpu.md
@ -19,7 +19,7 @@ Questa guida si concentra su come addestrare in maniera efficiente grandi modell

 ## Mixed precision con IPEX

-IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.
+IPEX è ottimizzato per CPU con AVX-512 o superiore, e funziona per le CPU con solo AVX2. Pertanto, si prevede che le prestazioni saranno più vantaggiose per le CPU Intel con AVX-512 o superiori, mentre le CPU con solo AVX2 (ad esempio, le CPU AMD o le CPU Intel più vecchie) potrebbero ottenere prestazioni migliori con IPEX, ma non sono garantite. IPEX offre ottimizzazioni delle prestazioni per l'addestramento della CPU sia con Float32 che con BFloat16. L'uso di BFloat16 è l'argomento principale delle seguenti sezioni.

 Il tipo di dati a bassa precisione BFloat16 è stato supportato in modo nativo su 3rd Generation Xeon® Scalable Processors (aka Cooper Lake) con AVX512 e sarà supportata dalla prossima generazione di Intel® Xeon® Scalable Processors con Intel® Advanced Matrix Extensions (Intel® AMX) instruction set con prestazioni ulteriormente migliorate. L'Auto Mixed Precision per il backende della CPU è stato abilitato da PyTorch-1.10. allo stesso tempo, il supporto di Auto Mixed Precision con BFloat16 per CPU e l'ottimizzazione degli operatori BFloat16 è stata abilitata in modo massiccio in Intel® Extension per PyTorch, and parzialmente aggiornato al branch master di PyTorch. Gli utenti possono ottenere prestazioni migliori ed users experience con IPEX Auto Mixed Precision..

--- a/docs/source/ja/add_new_model.md
+++ b/docs/source/ja/add_new_model.md
@ -312,7 +312,7 @@ cd transformers
 既存のモデル:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 モデルの基本情報を入力するためのアンケートが表示されます。
@ -517,7 +517,7 @@ tensor([[-0.0818,  0.2207, -0.0749, -0.0030,  0.0045, -0.1569, -0.1598,  0.0212,

 スクリプト内の変換スクリプトでは、ランダムに初期化された重みを、対応するチェックポイント内の正確な重みで埋める必要があります。例えば、以下のように翻訳します：

-
+ 
 ```python
 # retrieve matching layer weights, e.g. by
 # recursive algorithm
@ -747,3 +747,5 @@ brand_new_bert.push_to_hub("brand_new_bert")
 さあ、コミュニティからあなたの作業に対する評価を得る時が来ました！モデルの追加を完了することは、TransformersおよびNLPコミュニティにとって重要な貢献です。あなたのコードとポートされた事前学習済みモデルは、何百人、何千人という開発者や研究者によって確実に使用されるでしょう。あなたの仕事に誇りを持ち、コミュニティとあなたの成果を共有しましょう。

 **あなたはコミュニティの誰でも簡単にアクセスできる別のモデルを作成しました！ 🤯**
+
+
--- a/docs/source/ja/model_doc/beit.md
+++ b/docs/source/ja/model_doc/beit.md
@ -105,11 +105,6 @@ BEiT の使用を開始するのに役立つ公式 Hugging Face およびコミ

 [[autodoc]] BeitImageProcessor
    - preprocess
-
-## BeitImageProcessorFast
-
-[[autodoc]] BeitImageProcessorFast
-    - preprocess
    - post_process_semantic_segmentation

 ## BeitModel
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -97,8 +97,6 @@
    sections:
    - local: generation_strategies
      title: 텍스트 생성 전략 사용자 정의
-    - local: serving
-      title: 모델 서빙하기
    title: 생성
  - isExpanded: false
    sections:
@ -125,8 +123,6 @@
    title: Amazon SageMaker에서 학습 실행하기
  - local: serialization
    title: ONNX로 내보내기
-  - local: gpu_selection
-    title: GPU 선택하기
  - local: tflite
    title: TFLite로 내보내기
  - local: torchscript
@ -358,8 +354,8 @@
        title: (번역중) DistilBERT
      - local: in_translation
        title: (번역중) DPR
-      - local: model_doc/electra
-        title: ELECTRA
+      - local: in_translation
+        title: (번역중) ELECTRA
      - local: model_doc/encoder-decoder
        title: 인코더 디코더 모델
      - local: in_translation
--- a/docs/source/ko/add_new_model.md
+++ b/docs/source/ko/add_new_model.md
@ -73,7 +73,7 @@ model.config  # model has access to its config
 5. 함수 시그니처에는 타입 주석을 사용해야 합니다. 그 외에는 타입 주석보다 변수 이름이 훨씬 읽기 쉽고 이해하기 쉽습니다.

 ### 토크나이저 개요 [[overview-of-tokenizers]]
-
+ 
 아직 준비되지 않았습니다 :-( 이 섹션은 곧 추가될 예정입니다!

 ## 🤗 Transformers에 모델 추가하는 단계별 방법  [[stepbystep-recipe-to-add-a-model-to-transformers]]
@ -272,7 +272,7 @@ cd transformers
 기존 모델:

 ```bash
-transformers add-new-model-like
+transformers-cli add-new-model-like
 ```

 모델의 기본 정보를 입력하는 설문지가 표시됩니다.
--- a/docs/source/ko/contributing.md
+++ b/docs/source/ko/contributing.md
@ -63,7 +63,7 @@ limitations under the License.
 운영체제와 소프트웨어 버전을 자동으로 가져오려면 다음 명령을 실행하세요:

 ```bash
-transformers env
+transformers-cli env
 ```

 저장소의 루트 디렉터리에서도 같은 명령을 실행할 수 있습니다:
--- a/docs/source/ko/deepspeed.md
+++ b/docs/source/ko/deepspeed.md
@ -1165,7 +1165,7 @@ python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'

 ### DeepSpeed 프로세스가 시작 단계에서 종료되었을 경우[[deepspeed-process-killed-at-startup]]

-실행 중에 트레이스백 없이 DeepSpeed 프로세스가 종료되면 일반적으로 프로그램이 시스템보다 많은 CPU 메모리를 할당하려고 시도했거나 프로세스가 허용된 것보다 많은 CPU 메모리를 할당하려고 시도하여 OS 커널이 프로세스를 종료했음을 의미합니다. 이 경우 구성 파일에 `offload_optimizer`, `offload_param` 또는 둘 다 CPU로 오프로드하도록 구성되어 있는지 확인하세요.
+실행 중에 트레이스백 없이 DeepSpeed 프로세스가 종료되면 일반적으로 프로그램이 시스템보다 많은 CPU 메모리를 할당하려고 시도했거나 프로세스가 허용된 것보다 많은 CPU 메모리를 할당하려고 시도하여 OS 커널이 프로세스를 종료했음을 의미합니다. 이 경우 구성 파일에 `offload_optimizer`, `offload_param` 또는 둘 다 CPU로 오프로드하도록 구성되어 있는지 확인하세요.  

 NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(모델의 메모리 요구 사항을 [확인](https://deepspeed.readthedocs.io/en/latest/memory.html)하세요).

@ -1211,7 +1211,7 @@ NVMe 및 ZeRO-3를 설정한 경우 NVMe로 오프로드를 실험해 보세요(

 ## 리소스[[resources]]

-DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/deepspeedai/DeepSpeed)를 참조하세요.
+DeepSpeed ZeRO는 제한된 GPU 리소스로 추론을 위해 매우 큰 모델을 훈련하고 로드하는 강력한 기술로, 누구나 쉽게 사용할 수 있습니다. DeepSpeed에 대해 자세히 알아보려면 [블로그 포스트](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [공식 문서](https://www.deepspeed.ai/getting-started/), [깃허브 리포지토리](https://github.com/deepspeedai/DeepSpeed)를 참조하세요. 

 다음 문서도 ZeRO에 대해 자세히 알아볼 수 있는 훌륭한 자료입니다:

--- a/docs/source/ko/gpu_selection.md
+++ b/docs/source/ko/gpu_selection.md
@ -1,96 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# GPU 선택하기 [[gpu-selection]]
-
-분산 학습 과정에서 사용할 GPU의 개수와 순서를 정할 수 있습니다. 이 방법은 서로 다른 연산 성능을 가진 GPU가 있을 때 더 빠른 GPU를 우선적으로 사용하거나, 사용 가능한 GPU 중 일부만 선택하여 활용하고자 할 때 유용합니다. 이 선택 과정은 [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html)과 [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html)에서 모두 작동합니다. Accelerate나 [DeepSpeed 통합](./main_classes/deepspeed)은 필요하지 않습니다.
-
-이 가이드는 사용할 GPU의 개수를 선택하는 방법과 사용 순서를 설정하는 방법을 설명합니다.
-
-## GPU 개수 지정 [[number-of-gpus]]
-
-예를 들어, GPU가 4개 있고 그중 처음 2개만 사용하려는 경우, 아래 명령어를 실행하세요.
-
-<hfoptions id="select-gpu">
-<hfoption id="torchrun">
-
-사용할 GPU 개수를 정하기 위해 `--nproc_per_node` 옵션을 사용하세요.
-
-```bash
-torchrun --nproc_per_node=2  trainer-program.py ...
-```
-
-</hfoption>
-<hfoption id="Accelerate">
-
-사용할 GPU 개수를 정하기 위해 `--num_processes` 옵션을 사용하세요.
-
-```bash
-accelerate launch --num_processes 2 trainer-program.py ...
-```
-
-</hfoption>
-<hfoption id="DeepSpeed">
-
-사용할 GPU 개수를 정하기 위해 `--num_gpus` 옵션을 사용하세요.
-
-```bash
-deepspeed --num_gpus 2 trainer-program.py ...
-```
-
-</hfoption>
-</hfoptions>
-
-### GPU 순서 [[order-of-gpus]]
-
-사용할 GPU와 그 순서를 지정하려면 `CUDA_VISIBLE_DEVICES` 환경 변수를 설정하세요. 가장 쉬운 방법은 `~/bashrc` 또는 다른 시작 설정 파일에서 해당 변수를 설정하는 것입니다. `CUDA_VISIBLE_DEVICES`는 사용할 GPU를 매핑하는 데 사용됩니다. 예를 들어, GPU가 4개 (0, 1, 2, 3) 있고 그중에서 0번과 2번 GPU만 사용하고 싶을 경우, 다음과 같이 설정할 수 있습니다:
-
-```bash
-CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
-```
-
-오직 두 개의 물리적 GPU(0, 2)만 PyTorch에서 "보이는" 상태가 되며, 각각 `cuda:0`과 `cuda:1`로 매핑됩니다. 또한, GPU 사용 순서를 반대로 설정할 수도 있습니다. 이 경우, GPU 0이 `cuda:1`, GPU 2가 `cuda:0`으로 매핑됩니다."
-
-```bash
-CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
-```
-
-`CUDA_VISIBLE_DEVICES` 환경 변수를 빈 값으로 설정하여 GPU가 없는 환경을 만들 수도 있습니다.
-
-```bash
-CUDA_VISIBLE_DEVICES= python trainer-program.py ...
-```
-
-> [!WARNING]
-> 다른 환경 변수와 마찬가지로, CUDA_VISIBLE_DEVICES를 커맨드 라인에 추가하는 대신 export하여 설정할 수도 있습니다. 그러나 이 방식은 환경 변수가 어떻게 설정되었는지를 잊어버릴 경우, 잘못된 GPU를 사용할 위험이 있기 때문에 권장하지 않습니다. 특정 학습 실행에 대해 동일한 커맨드 라인에서 환경 변수를 설정하는 것이 일반적인 방법입니다.
-
-`CUDA_DEVICE_ORDER`는 GPU의 순서를 제어하는 데 사용할 수 있는 대체 환경 변수입니다. 이 변수를 사용하면 다음과 같은 방식으로 GPU 순서를 지정할 수 있습니다:
-
-1. NVIDIA 및 AMD GPU의 PCIe 버스 ID는 각각 [nvidia-smi](https://developer.nvidia.com/nvidia-system-management-interface)와 [rocm-smi](https://rocm.docs.amd.com/projects/rocm_smi_lib/en/latest/.doxygen/docBin/html/index.html)의 순서와 일치합니다.
-
-```bash
-export CUDA_DEVICE_ORDER=PCI_BUS_ID
-```
-
-2. GPU 연산 능력
-
-```bash
-export CUDA_DEVICE_ORDER=FASTEST_FIRST
-```
-
-The `CUDA_DEVICE_ORDER` is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first. In this case, set `CUDA_DEVICE_ORDER=FASTEST_FIRST` to always use the newer and faster GPU first (`nvidia-smi` or `rocm-smi` still reports the GPUs in their PCIe order). Or you could also set `export CUDA_VISIBLE_DEVICES=1,0`.
-
-`CUDA_DEVICE_ORDER`는 구형 GPU와 신형 GPU가 혼합된 환경에서 특히 유용합니다. 예를 들어, 구형 GPU가 먼저 표시되지만 물리적으로 교체할 수 없는 경우, `CUDA_DEVICE_ORDER=FASTEST_FIRST`를 설정하면 항상 신형 및 더 빠른 GPU를 우선적으로 사용(nvidia-smi 또는 rocm-smi는 PCIe 순서대로 GPU를 표시함)할 수 있습니다. 또는, `export CUDA_VISIBLE_DEVICES=1,0`을 설정하여 GPU 사용 순서를 직접 지정할 수도 있습니다.
--- a/docs/source/ko/model_doc/electra.md
+++ b/docs/source/ko/model_doc/electra.md
@ -1,196 +0,0 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# ELECTRA[[electra]]
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQu
hYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-</div>
-
-## 개요[[overview]]
-
-ELECTRA 모델은 [ELECTRA: Pre-training Text Encoders as Discriminators Rather Than
-Generators](https://openreview.net/pdf?id=r1xMH1BtvB) 논문에서 제안되었습니다. ELECTRA는 두가지 트랜스포머 모델인 생성 모델과 판별 모델을 학습시키는 새로운 사전학습 접근법입니다. 생성 모델의 역할은 시퀀스에 있는 토큰을 대체하는 것이며 마스킹된 언어 모델로 학습됩니다. 우리가 관심을 가진 판별 모델은 시퀀스에서 어떤 토큰이 생성 모델에 의해 대체되었는지 식별합니다. 
-
-논문의 초록은 다음과 같습니다:
-
-*BERT와 같은 마스킹된 언어 모델(MLM) 사전학습 방법은 일부 토큰을 [MASK] 토큰으로 바꿔 손상시키고 난 뒤, 모델이 다시 원본 토큰을 복원하도록 학습합니다. 이런 방식은 다운스트림 NLP 작업을 전이할 때 좋은 성능을 내지만, 효과적으로 사용하기 위해서는 일반적으로 많은 양의 연산이 필요합니다. 따라서 대안으로, 대체 토큰 탐지라고 불리는 샘플-효과적인 사전학습을 제안합니다. 우리의 방법론은 입력에 마스킹을 하는 대신에 소형 생성 모델의 그럴듯한 대안 토큰으로 손상시킵니다. 그리고 나서, 모델이 손상된 토큰의 원래 토큰을 예측하도록 훈련시키는 대신, 판별 모델을 각각의 토큰이 생성 모델의 샘플로 손상되었는지 아닌지 학습합니다. 실험들은 통해 이 새로운 사전학습 방식은 마스킹된 일부 토큰에만 적용되는 기존 방식과 달리 모든 입력 토큰에 대해 학습이 이뤄지기 때문에 마스킹된 언어 모델(MLM)보다 더 효율적임을 입증하였습니다. 결과적으로 소개된 방식이 같은 모델 크기, 데이터, 연산량을 가진 BERT모델로 학습한 결과를 압도하는 문맥 표현 학습을 할 수 있다는 것을 확인했습니다. 특히 작은 모델에서 성능 향상이 두드러지며, 예를 들어 GPU 한 대로 4일간 학습한 모델이 30배 더 많은 계산 자원을 사용한 GPT보다 GLUE 자연어 이해 벤치마크에서 더 나은 성능을 보입니다. 대규모 환경에서도 유효하며 더 적은 연산량으로 RoBERTa와 XLNet과 비슷한 성능을 낼 수 있으며, 동일한 연산량을 가질 경우 이들의 성능을 능가합니다.*
-
-
-이 모델은 [lysandre](https://huggingface.co/lysandre)이 기여했습니다. 원본 코드는 [이곳](https://github.com/google-research/electra)에서 찾아보실 수 있습니다.
-
-## 사용 팁[[usage-tips]]
-
- ELECTRA는 사전학습 방법으로 기본 모델인 BERT의 구조와 거의 차이가 없습니다. 유일한 차이는 임베딩 크기와 히든 크기를 구분했다는 점입니다. 임베딩 크기는 일반적으로 더 작고, 히든 크기는 더 큽니다. 임베딩에서 임베딩 크기를 히든 크기로 변환하기 위해 추가로 선형 변환 층이 사용됩니다. 임베딩 크기와 히든 크기가 동일할 경우에는 이 선형 변환 층이 필요하지 않습니다. 
- ELECTRA는 또 다른 (작은) 마스킹된 언어 모델을 사용해 사전학습 된 트랜스포머 모델입니다.  작은 언어 모델이 입력 텍스트의 일부를 무작위로 마스킹하고, 그 자리에 새로운 토큰을 삽입합니다. ELECTRA는 원래 토큰과 대체된 토큰을 구분하는 역할을 수행합니다. GAN 훈련과 비슷하지만, 생성 모델은 ELECTRA 모델을 속이는 것이 아니라 원래 텍스트를 복원하는 목표로 몇 단계 학습합니다. 그 후 ELECTRA가 학습을 하게 됩니다.
- [구글 리서치의 구현](https://github.com/google-research/electra)으로 저장된 ELECTRA checkpoints는 생성 모델과 판별 모델을 포함합니다. 변환 스크립트에서는 사용자가 어떤 모델을 어떤 아키텍처로 내보낼지 명시해야 합니다. 일단 Hugging Face 포맷으로 변환되면, 이 체크포인트들은 모든 ELECTRA 모델에서 불러올 수 있습니다. 즉, 판별 모델은 [`ElectraForMaskedLM`] 모델에, 생성 모델은 [`ElectraForPreTraining`]모델에 불러올 수 있다는 의미입니다. (단, 생성 모델에는 분류 헤드가 존재하지 않기 때문에, 해당 부분은 무작위로 초기화됩니다.)
-
-## 참고 자료[[resources]]
-
- [텍스트 분류 가이드](../tasks/sequence_classification)
- [토큰 분류 가이드](../tasks/token_classification)
- [질의 응답 가이드](../tasks/question_answering)
- [인과 언어 모델링 가이드](../tasks/language_modeling)
- [마스킹된 언어 모델링 가이드](../tasks/masked_language_modeling)
- [객관식 문제 가이드](../tasks/multiple_choice)
-
-## ElectraConfig
-
-[[autodoc]] ElectraConfig
-
-## ElectraTokenizer
-
-[[autodoc]] ElectraTokenizer
-
-## ElectraTokenizerFast
-
-[[autodoc]] ElectraTokenizerFast
-
-## Electra specific outputs
-
-[[autodoc]] models.electra.modeling_electra.ElectraForPreTrainingOutput
-
-[[autodoc]] models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
-
-<frameworkcontent>
-<pt>
-
-## ElectraModel
-
-[[autodoc]] ElectraModel
-    - forward
-
-## ElectraForPreTraining
-
-[[autodoc]] ElectraForPreTraining
-    - forward
-
-## ElectraForCausalLM
-
-[[autodoc]] ElectraForCausalLM
-    - forward
-
-## ElectraForMaskedLM
-
-[[autodoc]] ElectraForMaskedLM
-    - forward
-
-## ElectraForSequenceClassification
-
-[[autodoc]] ElectraForSequenceClassification
-    - forward
-
-## ElectraForMultipleChoice
-
-[[autodoc]] ElectraForMultipleChoice
-    - forward
-
-## ElectraForTokenClassification
-
-[[autodoc]] ElectraForTokenClassification
-    - forward
-
-## ElectraForQuestionAnswering
-
-[[autodoc]] ElectraForQuestionAnswering
-    - forward
-
-</pt>
-<tf>
-
-## TFElectraModel
-
-[[autodoc]] TFElectraModel
-    - call
-
-## TFElectraForPreTraining
-
-[[autodoc]] TFElectraForPreTraining
-    - call
-
-## TFElectraForMaskedLM
-
-[[autodoc]] TFElectraForMaskedLM
-    - call
-
-## TFElectraForSequenceClassification
-
-[[autodoc]] TFElectraForSequenceClassification
-    - call
-
-## TFElectraForMultipleChoice
-
-[[autodoc]] TFElectraForMultipleChoice
-    - call
-
-## TFElectraForTokenClassification
-
-[[autodoc]] TFElectraForTokenClassification
-    - call
-
-## TFElectraForQuestionAnswering
-
-[[autodoc]] TFElectraForQuestionAnswering
-    - call
-
-</tf>
-<jax>
-
-## FlaxElectraModel
-
-[[autodoc]] FlaxElectraModel
-    - __call__
-
-## FlaxElectraForPreTraining
-
-[[autodoc]] FlaxElectraForPreTraining
-    - __call__
-
-## FlaxElectraForCausalLM
-
-[[autodoc]] FlaxElectraForCausalLM
-    - __call__
-
-## FlaxElectraForMaskedLM
-
-[[autodoc]] FlaxElectraForMaskedLM
-    - __call__
-
-## FlaxElectraForSequenceClassification
-
-[[autodoc]] FlaxElectraForSequenceClassification
-    - __call__
-
-## FlaxElectraForMultipleChoice
-
-[[autodoc]] FlaxElectraForMultipleChoice
-    - __call__
-
-## FlaxElectraForTokenClassification
-
-[[autodoc]] FlaxElectraForTokenClassification
-    - __call__
-
-## FlaxElectraForQuestionAnswering
-
-[[autodoc]] FlaxElectraForQuestionAnswering
-    - __call__
-
-</jax>
-</frameworkcontent>
--- a/docs/source/ko/serving.md
+++ b/docs/source/ko/serving.md
@ -1,64 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# 모델 서빙 [[Serving]]
-
-Text Generation Inference (TGI) 및 vLLM과 같은 특수한 라이브러리를 사용해 Transformer 모델을 추론에 사용할 수 있습니다. 이러한 라이브러리는 LLM의 성능을 최적화하도록 설계되었으며, Transformers에는 포함되지 않은 고유한 최적화 기능을 다양하게 제공합니다.
-
-## TGI [[TGI]]
-
-[네이티브로 구현된 모델](https://huggingface.co/docs/text-generation-inference/supported_models)이 아니더라도 TGI로 Transformers 구현 모델을 서빙할 수 있습니다. TGI에서 제공하는 일부 고성능 기능은 지원하지 않을 수 있지만 연속 배칭이나 스트리밍과 같은 기능들은 사용할 수 있습니다.
-
-> [!TIP]
-> 더 자세한 내용은 [논-코어 모델 서빙](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) 가이드를 참고하세요.
-
-TGI 모델을 서빙하는 방식과 동일한 방식으로 Transformers 구현 모델을 서빙할 수 있습니다.
-
-```docker
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
-```
-
-커스텀 Transformers 모델을 서빙하려면 `--trust-remote-code`를 명령어에 추가하세요.
-
-```docker
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
-```
-
-## vLLM [[vLLM]]
-
-[vLLM](https://docs.vllm.ai/en/latest/index.html)은 특정 모델이 vLLM에서 [네이티브로 구현된 모델](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models)이 아닐 경우, Transformers 구현 모델을 서빙할 수도 있습니다. 
-
-Transformers 구현에서는 양자화, LoRA 어댑터, 분산 추론 및 서빙과 같은 다양한 기능이 지원됩니다.
-
-> [!TIP]
-> [Transformers fallback](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers-fallback) 섹션에서 더 자세한 내용을 확인할 수 있습니다.
-
-기본적으로 vLLM은 네이티브 구현을 서빙할 수 있지만, 해당 구현이 존재하지 않으면 Transformers 구현을 사용합니다. 하지만 `--model-impl transformers` 옵션을 설정하면 명시적으로 Transformers 모델 구현을 사용할 수 있습니다.
-
-```shell
-vllm serve Qwen/Qwen2.5-1.5B-Instruct \
-    --task generate \
-    --model-impl transformers
-```
-
-`trust-remote-code` 파라미터를 추가해 원격 코드 모델 로드를 활성화할 수 있습니다.
-
-```shell
-vllm serve Qwen/Qwen2.5-1.5B-Instruct \
-    --task generate \
-    --model-impl transformers \
-    --trust-remote-code
-```
--- a/docs/source/pt/converting_tensorflow_models.md
+++ b/docs/source/pt/converting_tensorflow_models.md
@ -21,10 +21,10 @@ que podem ser carregados usando os métodos `from_pretrained` da biblioteca.

 <Tip>

-A partir da versão 2.3.0 o script de conversão agora faz parte do transformers CLI (**transformers**) disponível em qualquer instalação
+A partir da versão 2.3.0 o script de conversão agora faz parte do transformers CLI (**transformers-cli**) disponível em qualquer instalação
 transformers >= 2.3.0.

-A documentação abaixo reflete o formato do comando **transformers convert**.
+A documentação abaixo reflete o formato do comando **transformers-cli convert**.

 </Tip>

@ -49,7 +49,7 @@ Aqui está um exemplo do processo de conversão para um modelo `BERT-Base Uncase
 ```bash
 export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12

-transformers convert --model_type bert \
+transformers-cli convert --model_type bert \
  --tf_checkpoint $BERT_BASE_DIR/bert_model.ckpt \
  --config $BERT_BASE_DIR/bert_config.json \
  --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin
@ -71,7 +71,7 @@ Aqui está um exemplo do processo de conversão para o modelo `ALBERT Base` pré
 ```bash
 export ALBERT_BASE_DIR=/path/to/albert/albert_base

-transformers convert --model_type albert \
+transformers-cli convert --model_type albert \
  --tf_checkpoint $ALBERT_BASE_DIR/model.ckpt-best \
  --config $ALBERT_BASE_DIR/albert_config.json \
  --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin
@ -88,7 +88,7 @@ foi salvo com o mesmo formato do modelo pré-treinado OpenAI (veja [aqui](https:
 ```bash
 export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights

-transformers convert --model_type gpt \
+transformers-cli convert --model_type gpt \
  --tf_checkpoint $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT_CONFIG] \
@ -102,7 +102,7 @@ Aqui está um exemplo do processo de conversão para um modelo OpenAI GPT-2 pré
 ```bash
 export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/openai-community/gpt2/pretrained/weights

-transformers convert --model_type gpt2 \
+transformers-cli convert --model_type gpt2 \
  --tf_checkpoint $OPENAI_GPT2_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config OPENAI_GPT2_CONFIG] \
@ -117,7 +117,7 @@ Aqui está um exemplo do processo de conversão para um modelo XLNet pré-treina
 export TRANSFO_XL_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
 export TRANSFO_XL_CONFIG_PATH=/path/to/xlnet/config

-transformers convert --model_type xlnet \
+transformers-cli convert --model_type xlnet \
  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_PATH \
  --config $TRANSFO_XL_CONFIG_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
@ -131,7 +131,7 @@ Aqui está um exemplo do processo de conversão para um modelo XLM pré-treinado
 ```bash
 export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint

-transformers convert --model_type xlm \
+transformers-cli convert --model_type xlm \
  --tf_checkpoint $XLM_CHECKPOINT_PATH \
  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
  [--config XLM_CONFIG] \
@ -145,7 +145,7 @@ Aqui está um exemplo do processo de conversão para um modelo T5 pré-treinado:
 ```bash
 export T5=/path/to/t5/uncased_L-12_H-768_A-12

-transformers convert --model_type t5 \
+transformers-cli convert --model_type t5 \
  --tf_checkpoint $T5/t5_model.ckpt \
  --config $T5/t5_config.json \
  --pytorch_dump_output $T5/pytorch_model.bin
--- a/docs/source/zh/contributing.md
+++ b/docs/source/zh/contributing.md
@ -63,7 +63,7 @@ limitations under the License.
 想要自动获取操作系统和软件版本，请运行以下命令：

 ```bash
-transformers env
+transformers-cli env
 ```

 你也可以从代码仓库的根目录下运行相同的命令：
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@ -99,7 +99,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=False,
        metadata={
-            "help": "Will use the token generated when running `transformers login` (necessary to use this script "
+            "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script "
            "with private models)."
        },
    )
--- a/examples/legacy/multiple_choice/utils_multiple_choice.py
+++ b/examples/legacy/multiple_choice/utils_multiple_choice.py
@ -539,7 +539,7 @@ def convert_examples_to_features(
            if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
                logger.info(
                    "Attention! you are cropping tokens (swag task is ok). "
-                    "If you are training ARC and RACE and you are popping question + options, "
+                    "If you are training ARC and RACE and you are poping question + options, "
                    "you need to try to use a bigger max seq length!"
                )

--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@ -745,7 +745,7 @@ def main():
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
-        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        args.model_name_or_path,
@ -795,7 +795,7 @@ def main():
        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForQuestionAnswering.from_pretrained(args.output_dir)  # , force_download=True)

-        # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+        # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
        # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case, use_fast=False)
        model.to(args.device)
--- a/examples/legacy/question-answering/run_squad_trainer.py
+++ b/examples/legacy/question-answering/run_squad_trainer.py
@ -122,7 +122,7 @@ def main():
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
-        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handling
+        use_fast=False,  # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling
    )
    model = AutoModelForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@ -71,7 +71,7 @@ def main():
    # You can also build the corpus yourself using TransfoXLCorpus methods
    # The pre-processing involve computing word frequencies to prepare the Adaptive input and SoftMax
    # and tokenizing the dataset
-    # The pre-processed corpus is a conversion (using the conversion script )
+    # The pre-processed corpus is a convertion (using the conversion script )
    corpus = TransfoXLCorpus.from_pretrained(args.model_name)

    va_iter = corpus.get_iterator("valid", args.batch_size, args.tgt_len, device=device, ext_len=args.ext_len)
--- a/Show More
+++ b/Show More