Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-21 01:23:56 +08:00)

Compare commits: pixtral_pr...v4.48.0 (232 commits)
SHA1 | Author | Date |
---|---|---|
6bc0fbcfa7 | |||
59e28c30fa | |||
7cf6230e25 | |||
d6f446ffa7 | |||
8ce1e9578a | |||
af2d7caff3 | |||
42b8e7916b | |||
e39c9f7a78 | |||
8de7b1ba8d | |||
1e3ddcb2d0 | |||
e97d7a5be5 | |||
c9c682d19c | |||
3a4ae6eace | |||
32e0db8a69 | |||
46276f9a7f | |||
d3fe9fa3fe | |||
395b114bd1 | |||
82dd6c14bb | |||
eb4579cf43 | |||
320512df46 | |||
633da1b10e | |||
832c6191ed | |||
1b2f942af7 | |||
4adc415b6d | |||
c8ab6ce6ce | |||
487c31a21f | |||
965a2fb320 | |||
137965ca7d | |||
8cad65a698 | |||
2e2f8015c0 | |||
a6256ec098 | |||
b32938aeee | |||
76da6ca034 | |||
3f483beab9 | |||
4c2c12b3de | |||
854dc7941b | |||
8c555ca3d7 | |||
8490d3159c | |||
4349a0e401 | |||
88e18b3c63 | |||
29e74b7cbc | |||
657bb14f98 | |||
1a6c1d3a9a | |||
59e5b3f01b | |||
f1639ea51d | |||
bd39b0627b | |||
651cfb400f | |||
430d3d43a5 | |||
3c1895aa65 | |||
3fde88b19d | |||
ebdd1ad400 | |||
0e0516c119 | |||
d1681ec2b6 | |||
7176e06b52 | |||
b05df6611e | |||
a7d1441d65 | |||
cdca3cf9e3 | |||
7f7677307c | |||
628cd838a3 | |||
665a4942e4 | |||
f408d55448 | |||
96bf3d6cc5 | |||
ed73ae210b | |||
02ed609285 | |||
9fd123ac31 | |||
bd442c6d3a | |||
12ba96aa3c | |||
ca00950057 | |||
1650e0e514 | |||
3b1be043cd | |||
3951da1a6b | |||
86fa3cedad | |||
44a26c871c | |||
18e896bd8f | |||
a821b9c7ab | |||
203e978826 | |||
c451a72cd7 | |||
9895f7df81 | |||
32aa2db04a | |||
b2f2977533 | |||
e5fd865eba | |||
1fe2d53d4e | |||
30a9971632 | |||
cba49cb2a6 | |||
42865860ec | |||
b2b04e86e7 | |||
6b1e86fd4d | |||
5b516b06c8 | |||
919220dab1 | |||
eb2b452432 | |||
d5aebc6465 | |||
b5f97977ed | |||
5cabc75b4b | |||
90f256c90c | |||
5c75087aee | |||
3b0a94ef9e | |||
f63da20a9f | |||
7f97d01675 | |||
4eb17b26e7 | |||
24c91f095f | |||
6e0515e99c | |||
d8c1db2f56 | |||
ccc4a5a59b | |||
93aafdc620 | |||
82fcac0a7e | |||
a1780b7ba5 | |||
64c05eecd6 | |||
ef1f54a0a7 | |||
59178780a6 | |||
3a4ced9ab4 | |||
3cd3cd50ac | |||
f5264a86ee | |||
e10be82b71 | |||
2bb60982ac | |||
5e7aedebeb | |||
401aa39d7b | |||
05260a1fc1 | |||
8f38f58f3d | |||
608e163b52 | |||
94fe0b915b | |||
c96cc039c3 | |||
504c4d3692 | |||
0fc2970363 | |||
6fae2a84ae | |||
34ad1bd287 | |||
40292aa4e9 | |||
05de764e9c | |||
4567ee8057 | |||
c3a43594b7 | |||
0d51d65905 | |||
eafbb0eca7 | |||
b5a557e5fe | |||
4e27a4009d | |||
5a2aedca1e | |||
ff9141bb85 | |||
f42084e641 | |||
0ade1caa35 | |||
1fa807fa63 | |||
667ed5635e | |||
56ff1e92fd | |||
4592cc9e98 | |||
d19b11f59b | |||
9613933b02 | |||
9a94dfe123 | |||
2c47618c1a | |||
75be5a0a5b | |||
69e31eb1bf | |||
da334bcfa8 | |||
f1b7634fc8 | |||
c7e48053aa | |||
1eee1cedfd | |||
0531d7513b | |||
77080f023f | |||
8bfd7eeeef | |||
a7feae190f | |||
927c3e39ec | |||
4302b27719 | |||
deac971c46 | |||
d29a06e39a | |||
e0ae9b5974 | |||
6eb00dd2f0 | |||
747f361da1 | |||
6c08b3b6e5 | |||
f33a0cebb3 | |||
a7f5479b45 | |||
f5620a7634 | |||
eb92bc44b7 | |||
886f690e76 | |||
22834eeba1 | |||
9feae5fb01 | |||
d5b81e1ca1 | |||
d0f32212ed | |||
85eb339231 | |||
14910281a7 | |||
66531a1ec3 | |||
5615a39369 | |||
ca03842cdc | |||
add53e25ff | |||
7237b3ecfc | |||
6009642459 | |||
e94083bf90 | |||
bc6ae0d55e | |||
8096161b76 | |||
bdd4201fdb | |||
3d213b57fe | |||
64478c7631 | |||
e4e404fdd0 | |||
31f9a289a6 | |||
11ba1d472c | |||
a691ccb0c2 | |||
e3ee49fcfb | |||
63766abe36 | |||
5cf11e5ab9 | |||
3db8e27816 | |||
a9ccdfd8e3 | |||
6181c6b095 | |||
33c12e4d80 | |||
7d303efa5f | |||
5fcf6286bf | |||
bcc50cc7ce | |||
d363e71d0e | |||
9094b87dd4 | |||
10feacd88a | |||
e8508924fd | |||
5290f6a62d | |||
91b8ab18b7 | |||
217c47e31b | |||
52d135426f | |||
425af6cdc2 | |||
e5c45a6679 | |||
3e2769a3c9 | |||
5fba3f99c0 | |||
6acb4e43a7 | |||
80f2b1610f | |||
0938b57770 | |||
dada0fd85f | |||
34f4080ff5 | |||
fa8763ce17 | |||
4bc39de5c3 | |||
8e806a336f | |||
7238387f67 | |||
de8a0b7547 | |||
1452dc2514 | |||
9e420e0269 | |||
1ccca8f48c | |||
c8c8dffbe4 | |||
7f95372c62 | |||
9ad4c93536 | |||
15ab310c3a | |||
98e8062df3 | |||
44f88d8ccb | |||
66ab300aaf |
@@ -58,14 +58,14 @@ jobs:
            name: "Prepare pipeline parameters"
            command: |
                python utils/process_test_artifacts.py

        # To avoid too long generated_config.yaml on the continuation orb, we pass the links to the artifacts as parameters.
        # Otherwise the list of tests was just too big. Explicit is good but for that it was a limitation.
        # We used:
        #   https://circleci.com/docs/api/v2/index.html#operation/getJobArtifacts : to get the job artifacts
        # We could not pass a nested dict, which is why we create the test_file_... parameters for every single job

        - store_artifacts:
            path: test_preparation/transformed_artifacts.json
        - store_artifacts:
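As a rough illustration of what this parameter-passing looks like on the continuation side, the generated config declares one string parameter per job and each job curls its own test list from the artifact URL it was given. A minimal sketch, assuming a job named `torch` (the job name and artifact URL are placeholders, not values taken from this diff):

```yaml
version: "2.1"
parameters:
  torch_test_list:
    type: string
    default: ""

jobs:
  torch:
    docker:
      - image: huggingface/transformers-torch-light
    steps:
      - checkout
      # Download the list of test files that the setup workflow uploaded as a
      # CircleCI artifact and passed in here as a pipeline parameter.
      - run: curl -L -o torch_test_list.txt << pipeline.parameters.torch_test_list >>
```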
@@ -32,7 +32,7 @@ COMMON_ENV_VARIABLES = {
    "RUN_PT_FLAX_CROSS_TESTS": False,
}
# Disable the use of {"s": None} as the output is way too long, causing the navigation on CircleCI impractical
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsf":None}
COMMON_PYTEST_OPTIONS = {"max-worker-restart": 0, "dist": "loadfile", "vvv": None, "rsfE":None}
DEFAULT_DOCKER_IMAGE = [{"image": "cimg/python:3.8.12"}]
@@ -40,9 +40,23 @@ class EmptyJob:
    job_name = "empty"

    def to_dict(self):
        steps = [{"run": 'ls -la'}]
        if self.job_name == "collection_job":
            steps.extend(
                [
                    "checkout",
                    {"run": "pip install requests || true"},
                    {"run": """while [[ $(curl --location --request GET "https://circleci.com/api/v2/workflow/$CIRCLE_WORKFLOW_ID/job" --header "Circle-Token: $CCI_TOKEN"| jq -r '.items[]|select(.name != "collection_job")|.status' | grep -c "running") -gt 0 ]]; do sleep 5; done || true"""},
                    {"run": 'python utils/process_circleci_workflow_test_reports.py --workflow_id $CIRCLE_WORKFLOW_ID || true'},
                    {"store_artifacts": {"path": "outputs"}},
                    {"run": 'echo "All required jobs have now completed"'},
                ]
            )

        return {
            "docker": copy.deepcopy(DEFAULT_DOCKER_IMAGE),
            "steps":["checkout"],
            "resource_class": "small",
            "steps": steps,
        }
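For reference, once this `to_dict()` output is serialized into `generated_config.yml`, the aggregation job should look roughly like the sketch below (a rendering of the code above, not a line taken from this diff; the long polling command is abbreviated):

```yaml
collection_job:
  docker:
    - image: cimg/python:3.8.12
  resource_class: small
  steps:
    - run: ls -la
    - checkout
    - run: pip install requests || true
    # Poll the CircleCI API until every job other than collection_job has finished,
    # then aggregate the per-job test reports and store them as artifacts.
    - run: while [[ $(curl ... | jq -r '...' | grep -c "running") -gt 0 ]]; do sleep 5; done || true
    - run: python utils/process_circleci_workflow_test_reports.py --workflow_id $CIRCLE_WORKFLOW_ID || true
    - store_artifacts:
        path: outputs
    - run: echo "All required jobs have now completed"
```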
@@ -54,9 +68,9 @@ class CircleCIJob:
    install_steps: List[str] = None
    marker: Optional[str] = None
    parallelism: Optional[int] = 0
    pytest_num_workers: int = 12
    pytest_num_workers: int = 8
    pytest_options: Dict[str, Any] = None
    resource_class: Optional[str] = "2xlarge"
    resource_class: Optional[str] = "xlarge"
    tests_to_run: Optional[List[str]] = None
    num_test_files_per_worker: Optional[int] = 10
    # This should be only used for doctest job!
@@ -133,7 +147,7 @@ class CircleCIJob:
                "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""}
            },
            {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}},
            {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <<pipeline.parameters.{self.job_name}_test_list>>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}},
            {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <<pipeline.parameters.{self.job_name}_test_list>> --header "Circle-Token: $CIRCLE_TOKEN"' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}},
            {"run": {"name": "Split tests across parallel nodes: show current parallel tests",
                "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt"
            }
@ -185,7 +199,6 @@ torch_job = CircleCIJob(
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
marker="not generate",
|
||||
parallelism=6,
|
||||
pytest_num_workers=8
|
||||
)
|
||||
|
||||
generate_job = CircleCIJob(
|
||||
@ -193,28 +206,24 @@ generate_job = CircleCIJob(
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
marker="generate",
|
||||
parallelism=6,
|
||||
pytest_num_workers=8
|
||||
)
|
||||
|
||||
tokenization_job = CircleCIJob(
|
||||
"tokenization",
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
parallelism=8,
|
||||
pytest_num_workers=16
|
||||
)
|
||||
|
||||
processor_job = CircleCIJob(
|
||||
"processors",
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
parallelism=8,
|
||||
pytest_num_workers=6
|
||||
)
|
||||
|
||||
tf_job = CircleCIJob(
|
||||
"tf",
|
||||
docker_image=[{"image":"huggingface/transformers-tf-light"}],
|
||||
parallelism=6,
|
||||
pytest_num_workers=16,
|
||||
)
|
||||
|
||||
|
||||
@ -222,7 +231,8 @@ flax_job = CircleCIJob(
|
||||
"flax",
|
||||
docker_image=[{"image":"huggingface/transformers-jax-light"}],
|
||||
parallelism=6,
|
||||
pytest_num_workers=16
|
||||
pytest_num_workers=16,
|
||||
resource_class="2xlarge",
|
||||
)
|
||||
|
||||
|
||||
@ -231,7 +241,7 @@ pipelines_torch_job = CircleCIJob(
|
||||
additional_env={"RUN_PIPELINE_TESTS": True},
|
||||
docker_image=[{"image":"huggingface/transformers-torch-light"}],
|
||||
marker="is_pipeline_test",
|
||||
parallelism=4
|
||||
parallelism=4,
|
||||
)
|
||||
|
||||
|
||||
@ -240,7 +250,7 @@ pipelines_tf_job = CircleCIJob(
|
||||
additional_env={"RUN_PIPELINE_TESTS": True},
|
||||
docker_image=[{"image":"huggingface/transformers-tf-light"}],
|
||||
marker="is_pipeline_test",
|
||||
parallelism=4
|
||||
parallelism=4,
|
||||
)
|
||||
|
||||
|
||||
@ -257,7 +267,6 @@ examples_torch_job = CircleCIJob(
|
||||
docker_image=[{"image":"huggingface/transformers-examples-torch"}],
|
||||
# TODO @ArthurZucker remove this once docker is easier to build
|
||||
install_steps=["uv venv && uv pip install . && uv pip install -r examples/pytorch/_tests_requirements.txt"],
|
||||
pytest_num_workers=8,
|
||||
)
|
||||
|
||||
|
||||
@ -265,7 +274,6 @@ examples_tensorflow_job = CircleCIJob(
|
||||
"examples_tensorflow",
|
||||
additional_env={"OMP_NUM_THREADS": 8},
|
||||
docker_image=[{"image":"huggingface/transformers-examples-tf"}],
|
||||
pytest_num_workers=16,
|
||||
)
|
||||
|
||||
|
||||
@ -280,6 +288,7 @@ hub_job = CircleCIJob(
|
||||
],
|
||||
marker="is_staging_test",
|
||||
pytest_num_workers=2,
|
||||
resource_class="medium",
|
||||
)
|
||||
|
||||
|
||||
@ -292,13 +301,13 @@ onnx_job = CircleCIJob(
|
||||
],
|
||||
pytest_options={"k onnx": None},
|
||||
pytest_num_workers=1,
|
||||
resource_class="small",
|
||||
)
|
||||
|
||||
|
||||
exotic_models_job = CircleCIJob(
|
||||
"exotic_models",
|
||||
docker_image=[{"image":"huggingface/transformers-exotic-models"}],
|
||||
pytest_num_workers=12,
|
||||
parallelism=4,
|
||||
pytest_options={"durations": 100},
|
||||
)
|
||||
@ -317,7 +326,6 @@ non_model_job = CircleCIJob(
|
||||
docker_image=[{"image": "huggingface/transformers-torch-light"}],
|
||||
marker="not generate",
|
||||
parallelism=6,
|
||||
pytest_num_workers=8,
|
||||
)
|
||||
|
||||
|
||||
@ -352,6 +360,7 @@ REPO_UTIL_TESTS = [repo_utils_job]
|
||||
DOC_TESTS = [doc_test_job]
|
||||
ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job] # fmt: skip
|
||||
|
||||
|
||||
def create_circleci_config(folder=None):
|
||||
if folder is None:
|
||||
folder = os.getcwd()
|
||||
@ -361,7 +370,13 @@ def create_circleci_config(folder=None):
|
||||
|
||||
if len(jobs) == 0:
|
||||
jobs = [EmptyJob()]
|
||||
print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs})
|
||||
else:
|
||||
print("Full list of job name inputs", {j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs})
|
||||
# Add a job waiting all the test jobs and aggregate their test summary files at the end
|
||||
collection_job = EmptyJob()
|
||||
collection_job.job_name = "collection_job"
|
||||
jobs = [collection_job] + jobs
|
||||
|
||||
config = {
|
||||
"version": "2.1",
|
||||
"parameters": {
|
||||
@ -371,9 +386,14 @@ def create_circleci_config(folder=None):
|
||||
**{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
|
||||
**{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
|
||||
},
|
||||
"jobs" : {j.job_name: j.to_dict() for j in jobs},
|
||||
"workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}}
|
||||
"jobs": {j.job_name: j.to_dict() for j in jobs}
|
||||
}
|
||||
if "CIRCLE_TOKEN" in os.environ:
|
||||
# For private forked repo. (e.g. new model addition)
|
||||
config["workflows"] = {"version": 2, "run_tests": {"jobs": [{j.job_name: {"context": ["TRANSFORMERS_CONTEXT"]}} for j in jobs]}}
|
||||
else:
|
||||
# For public repo. (e.g. `transformers`)
|
||||
config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}}
|
||||
with open(os.path.join(folder, "generated_config.yml"), "w") as f:
|
||||
f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>"))
|
||||
|
||||
|
2  .github/workflows/benchmark.yml  vendored
@@ -63,7 +63,7 @@ jobs:
            commit_id=$GITHUB_SHA
          fi
          commit_msg=$(git show -s --format=%s | cut -c1-70)
          python3 benchmark/llama.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
          python3 benchmark/benchmarks_entrypoint.py "${{ github.head_ref || github.ref_name }}" "$commit_id" "$commit_msg"
        env:
          HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
          # Enable this to see debug logs
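For a local dry run of the same entrypoint outside this workflow, the invocation would look something like the sketch below. The branch and commit values are derived locally here and a reachable PostgreSQL database named `metrics` (see `benchmark/init_db.sql`) plus the benchmark dependencies (psycopg2, gpustat, psutil, torch) are assumed; the three positional arguments match `parse_arguments()` in `benchmarks_entrypoint.py`.

```sh
cd benchmark
commit_id=$(git rev-parse HEAD)
commit_msg=$(git show -s --format=%s | cut -c1-70)
python3 benchmarks_entrypoint.py "$(git branch --show-current)" "$commit_id" "$commit_msg"
```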
7
.github/workflows/push-important-models.yml
vendored
7
.github/workflows/push-important-models.yml
vendored
@ -134,10 +134,3 @@ jobs:
|
||||
slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}
|
||||
slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
|
||||
waitForSSH: true
|
||||
|
||||
benchmark:
|
||||
name: Benchmark workflow
|
||||
needs: get_modified_models
|
||||
if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }}
|
||||
uses: ./.github/workflows/benchmark.yml
|
||||
secrets: inherit
|
||||
|
@ -21,39 +21,6 @@ jobs:
|
||||
echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')"
|
||||
echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT
|
||||
|
||||
run_past_ci_pytorch_1-13:
|
||||
name: PyTorch 1.13
|
||||
needs: get_number
|
||||
if: needs.get_number.outputs.run_number == 0 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
|
||||
uses: ./.github/workflows/self-past-caller.yml
|
||||
with:
|
||||
framework: pytorch
|
||||
version: "1.13"
|
||||
sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
run_past_ci_pytorch_1-12:
|
||||
name: PyTorch 1.12
|
||||
needs: get_number
|
||||
if: needs.get_number.outputs.run_number == 1 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
|
||||
uses: ./.github/workflows/self-past-caller.yml
|
||||
with:
|
||||
framework: pytorch
|
||||
version: "1.12"
|
||||
sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
run_past_ci_pytorch_1-11:
|
||||
name: PyTorch 1.11
|
||||
needs: get_number
|
||||
if: needs.get_number.outputs.run_number == 2 && (cancelled() != true) && ((github.event_name == 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')))
|
||||
uses: ./.github/workflows/self-past-caller.yml
|
||||
with:
|
||||
framework: pytorch
|
||||
version: "1.11"
|
||||
sha: ${{ github.sha }}
|
||||
secrets: inherit
|
||||
|
||||
run_past_ci_tensorflow_2-11:
|
||||
name: TensorFlow 2.11
|
||||
needs: get_number
|
||||
|
151
.github/workflows/self-pr-slow-ci.yml
vendored
151
.github/workflows/self-pr-slow-ci.yml
vendored
@ -1,151 +0,0 @@
|
||||
name: PR slow CI
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
paths:
|
||||
- "src/transformers/models/*/modeling_*.py"
|
||||
- "tests/**/test_*.py"
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
env:
|
||||
HF_HOME: /mnt/cache
|
||||
TRANSFORMERS_IS_CI: yes
|
||||
OMP_NUM_THREADS: 8
|
||||
MKL_NUM_THREADS: 8
|
||||
RUN_SLOW: yes
|
||||
# For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
|
||||
# This token is created under the bot `hf-transformers-bot`.
|
||||
HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
|
||||
SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
|
||||
TF_FORCE_GPU_ALLOW_GROWTH: true
|
||||
RUN_PT_TF_CROSS_TESTS: 1
|
||||
CUDA_VISIBLE_DEVICES: 0,1
|
||||
|
||||
jobs:
|
||||
find_models_to_run:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Find models to run slow tests
|
||||
# Triggered only if the required label `run-slow` is added
|
||||
if: ${{ contains(github.event.pull_request.labels.*.name, 'run-slow') }}
|
||||
outputs:
|
||||
models: ${{ steps.models_to_run.outputs.models }}
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: "0"
|
||||
ref: ${{ github.event.pull_request.head.sha }}
|
||||
|
||||
- name: Get commit message
|
||||
run: |
|
||||
echo "commit_message=$(git show -s --format=%s)" >> $GITHUB_ENV
|
||||
|
||||
- name: Get models to run slow tests
|
||||
run: |
|
||||
echo "${{ env.commit_message }}"
|
||||
python -m pip install GitPython
|
||||
python utils/pr_slow_ci_models.py --commit_message "${{ env.commit_message }}" | tee output.txt
|
||||
echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV
|
||||
|
||||
- name: Models to run slow tests
|
||||
id: models_to_run
|
||||
run: |
|
||||
echo "${{ env.models }}"
|
||||
echo "models=${{ env.models }}" >> $GITHUB_OUTPUT
|
||||
|
||||
run_models_gpu:
|
||||
name: Run all tests for the model
|
||||
# Triggered only `find_models_to_run` is triggered (label `run-slow` is added) which gives the models to run
|
||||
# (either a new model PR or via a commit message)
|
||||
if: ${{ needs.find_models_to_run.outputs.models != '[]' }}
|
||||
needs: find_models_to_run
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
folders: ${{ fromJson(needs.find_models_to_run.outputs.models) }}
|
||||
machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache]
|
||||
runs-on:
|
||||
group: '${{ matrix.machine_type }}'
|
||||
container:
|
||||
image: huggingface/transformers-all-latest-gpu
|
||||
options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
|
||||
steps:
|
||||
- name: Echo input and matrix info
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
|
||||
- name: Echo folder ${{ matrix.folders }}
|
||||
shell: bash
|
||||
# For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
|
||||
# set the artifact folder names (because the character `/` is not allowed).
|
||||
run: |
|
||||
echo "${{ matrix.folders }}"
|
||||
matrix_folders=${{ matrix.folders }}
|
||||
matrix_folders=${matrix_folders/'models/'/'models_'}
|
||||
echo "$matrix_folders"
|
||||
echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
|
||||
|
||||
- name: Update clone
|
||||
working-directory: /transformers
|
||||
run: git fetch && git fetch origin pull/${{ github.event.pull_request.number }}/head:pull/${{ github.event.pull_request.number }}/merge && git checkout pull/${{ github.event.pull_request.number }}/merge
|
||||
|
||||
- name: Reinstall transformers in edit mode (remove the one installed during docker image build)
|
||||
working-directory: /transformers
|
||||
run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . && python3 -m pip install --upgrade torch torchaudio torchvision
|
||||
|
||||
- name: NVIDIA-SMI
|
||||
run: |
|
||||
nvidia-smi
|
||||
|
||||
- name: Set `machine_type` for report and artifact names
|
||||
working-directory: /transformers
|
||||
shell: bash
|
||||
run: |
|
||||
echo "${{ matrix.machine_type }}"
|
||||
if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then
|
||||
machine_type=single-gpu
|
||||
elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then
|
||||
machine_type=multi-gpu
|
||||
else
|
||||
machine_type=${{ matrix.machine_type }}
|
||||
fi
|
||||
echo "$machine_type"
|
||||
echo "machine_type=$machine_type" >> $GITHUB_ENV
|
||||
|
||||
- name: Environment
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
python3 utils/print_env.py
|
||||
|
||||
- name: Show installed libraries and their versions
|
||||
working-directory: /transformers
|
||||
run: pip freeze
|
||||
|
||||
- name: Run all tests on GPU
|
||||
working-directory: /transformers
|
||||
run: |
|
||||
export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})"
|
||||
echo $CUDA_VISIBLE_DEVICES
|
||||
python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }}
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
continue-on-error: true
|
||||
run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt
|
||||
|
||||
- name: Make sure report directory exists
|
||||
shell: bash
|
||||
run: |
|
||||
mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
||||
echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt
|
||||
echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports"
|
||||
|
||||
- name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports"
|
||||
if: ${{ always() }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports
|
||||
path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports
|
50
.github/workflows/self-push-amd-mi210-caller.yml
vendored
50
.github/workflows/self-push-amd-mi210-caller.yml
vendored
@ -1,25 +1,25 @@
|
||||
name: Self-hosted runner (AMD mi210 CI caller)
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["Self-hosted runner (push-caller)"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi210
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi210
|
||||
secrets: inherit
|
||||
name: Self-hosted runner (AMD mi210 CI caller)
|
||||
|
||||
on:
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi210
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi210
|
||||
secrets: inherit
|
||||
|
50
.github/workflows/self-push-amd-mi250-caller.yml
vendored
50
.github/workflows/self-push-amd-mi250-caller.yml
vendored
@ -1,25 +1,25 @@
|
||||
name: Self-hosted runner (AMD mi250 CI caller)
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["Self-hosted runner (push-caller)"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi250
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi250
|
||||
secrets: inherit
|
||||
name: Self-hosted runner (AMD mi250 CI caller)
|
||||
|
||||
on:
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
paths:
|
||||
- "src/**"
|
||||
- "tests/**"
|
||||
- ".github/**"
|
||||
- "templates/**"
|
||||
- "utils/**"
|
||||
|
||||
jobs:
|
||||
run_amd_ci:
|
||||
name: AMD mi250
|
||||
if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
|
||||
uses: ./.github/workflows/self-push-amd.yml
|
||||
with:
|
||||
gpu_flavor: mi250
|
||||
secrets: inherit
|
||||
|
@ -1,10 +1,10 @@
|
||||
name: Self-hosted runner (AMD mi300 CI caller)
|
||||
|
||||
on:
|
||||
workflow_run:
|
||||
workflows: ["Self-hosted runner (push-caller)"]
|
||||
branches: ["main"]
|
||||
types: [completed]
|
||||
#workflow_run:
|
||||
# workflows: ["Self-hosted runner (push-caller)"]
|
||||
# branches: ["main"]
|
||||
# types: [completed]
|
||||
push:
|
||||
branches:
|
||||
- run_amd_push_ci_caller*
|
||||
|
27  CODEOWNERS  Normal file
@@ -0,0 +1,27 @@
# These owners will be the default owners for everything in
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
* @Rocketknight1 @ArthurZucker # if no one is pinged based on the other rules, he will do the dispatch
**.md @stevhliu
docs/ @stevhliu
/benchmark/ @McPatate
/utils/modular_model_converter.py @Cyrilvallez @ArthurZucker
/src/transformers/models/*/*processing* @molbap @yonigozlan @qubvel
/src/transformers/models/*/image_processing* @qubvel
/src/transformers/models/*/image_processing_*_fast* @yonigozlan
/src/transformers/models/*/*_modeling* @Rocketknight1
/src/transformers/**/*_tokenization* @ArthurZucker
/src/transformers/generation/ @gante
trainer.py @muellerzr @SunMarc
/src/transformers/pipeline @Rocketknight1 @yonigozlan
/src/transformers/integrations @SunMarc @MekkCyber @muellerzr
/src/transformers/quantizers @SunMarc @MekkCyber
/src/transformers/tests @ydshieh
/src/transformers/models/auto @ArthurZucker
/src/transformers/utils @ArthurZucker @Rocketknight1
/docker @ydshieh @ArthurZucker
/src/transformers/loss @ArthurZucker
/src/transformers/onnx @michaelbenayoun
/.circleci/config.yml @ArthurZucker @ydshieh
/utils/tests_fetcher.py @ydshieh
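To illustrate how these rules interact (this worked example is not part of the committed file): a PR touching `src/transformers/models/llama/image_processing_llama_fast.py` matches both the `image_processing*` and the `image_processing_*_fast*` patterns; because the later match takes precedence, @yonigozlan is requested rather than @qubvel, while a file that matches no specific rule falls back to the catch-all `*` owners on the first rule.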
@@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

### With pip

This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+.
This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 2.0+, and TensorFlow 2.6+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
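As a quick illustration of that recommendation (standard Python tooling, not text from this diff), setting up a virtual environment and installing the library looks like:

```sh
# Create and activate a virtual environment, then install Transformers from PyPI
python -m venv .env
source .env/bin/activate
pip install transformers
```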
49  benchmark/README.md  Normal file
@@ -0,0 +1,49 @@
# Benchmarks

You might want to add new benchmarks.

You will need to define a python function named `run_benchmark` in your python file and the file must be located in this `benchmark/` directory.

The expected function signature is the following:

```py
def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
```
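A minimal sketch of such a module, assuming it lives at a hypothetical `benchmark/noop.py` (the body is illustrative; only the signature above is prescribed by this contract):

```py
from logging import Logger


def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
    # benchmarks_entrypoint.py discovers this file by scanning benchmark/ for .py files
    # and calls run_benchmark with the branch / commit information parsed from the CLI.
    logger.info(f"no-op benchmark on {branch} @ {commit_id}: {commit_msg}")
```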
## Writing metrics to the database

`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements.

cf [`llama.py`](./llama.py) to see an example of this in practice.

```py
from benchmarks_entrypoint import MetricsRecorder
import psycopg2

def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
    metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
    benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
    # To collect device measurements
    metrics_recorder.collect_device_measurements(
        benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
    )
    # To collect your model measurements
    metrics_recorder.collect_model_measurements(
        benchmark_id,
        {
            "model_load_time": model_load_time,
            "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
            "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
            "first_eager_generate_time_secs": first_eager_generate_time,
            "second_eager_generate_time_secs": second_eager_generate_time,
            "time_to_first_token_secs": time_to_first_token,
            "time_to_second_token_secs": time_to_second_token,
            "time_to_third_token_secs": time_to_third_token,
            "time_to_next_token_mean_secs": mean_time_to_next_token,
            "first_compile_generate_time_secs": first_compile_generate_time,
            "second_compile_generate_time_secs": second_compile_generate_time,
            "third_compile_generate_time_secs": third_compile_generate_time,
            "fourth_compile_generate_time_secs": fourth_compile_generate_time,
        },
    )
```
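Tying the two halves together, a rough sketch of the background-thread pattern mentioned above, mirroring what `llama.py` does elsewhere in this diff. It assumes `benchmark_id` and `metrics_recorder` from the snippet above, and uses `psutil`/`gpustat` for the readings; the 0.01 s sampling interval is illustrative.

```py
from threading import Event, Thread
from time import sleep

import gpustat
import psutil


def collect_metrics(benchmark_id, stop_event, metrics_recorder):
    # Runs on a background thread; MetricsRecorder is safe to share with the main thread.
    process = psutil.Process()
    while not stop_event.is_set():
        cpu_util = process.cpu_percent()
        mem_megabytes = process.memory_info().rss / 1024 / 1024
        gpu = gpustat.GPUStatCollection.new_query()[0]
        metrics_recorder.collect_device_measurements(
            benchmark_id, cpu_util, mem_megabytes, gpu["utilization.gpu"], gpu["memory.used"]
        )
        sleep(0.01)


stop_event = Event()
Thread(target=collect_metrics, args=[benchmark_id, stop_event, metrics_recorder]).start()
# ... run the model measurements on the main thread, then stop the sampler ...
stop_event.set()
```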
144  benchmark/benchmarks_entrypoint.py  Normal file
@@ -0,0 +1,144 @@
import argparse
import importlib.util
import logging
import os
from typing import Dict
import psycopg2
import sys

from psycopg2.extras import Json
from psycopg2.extensions import register_adapter


register_adapter(dict, Json)


class ImportModuleException(Exception):
    pass


class MetricsRecorder:
    def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str):
        self.conn = connection
        self.conn.autocommit = True
        self.logger = logger
        self.branch = branch
        self.commit_id = commit_id
        self.commit_msg = commit_msg

    def initialise_benchmark(self, metadata: Dict[str, str]) -> int:
        """
        Creates a new benchmark, returns the benchmark id
        """
        # gpu_name: str, model_id: str
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
                (self.branch, self.commit_id, self.commit_msg, metadata),
            )
            benchmark_id = cur.fetchone()[0]
            logger.debug(f"initialised benchmark #{benchmark_id}")
            return benchmark_id

    def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
        """
        Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function.
        """
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
            )
        self.logger.debug(
            f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]"
        )

    def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]):
        with self.conn.cursor() as cur:
            cur.execute(
                """
                INSERT INTO model_measurements (
                    benchmark_id,
                    measurements
                ) VALUES (%s, %s)
                """,
                (
                    benchmark_id,
                    measurements,
                ),
            )
        self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}")

    def close(self):
        self.conn.close()


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
handler.setFormatter(formatter)
logger.addHandler(handler)


def parse_arguments():
    """
    Parse command line arguments for the benchmarking CLI.
    """
    parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")

    parser.add_argument(
        "branch",
        type=str,
        help="The branch name on which the benchmarking is performed.",
    )

    parser.add_argument(
        "commit_id",
        type=str,
        help="The commit hash on which the benchmarking is performed.",
    )

    parser.add_argument(
        "commit_msg",
        type=str,
        help="The commit message associated with the commit, truncated to 70 characters.",
    )

    args = parser.parse_args()

    return args.branch, args.commit_id, args.commit_msg


def import_from_path(module_name, file_path):
    try:
        spec = importlib.util.spec_from_file_location(module_name, file_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        return module
    except Exception as e:
        raise ImportModuleException(f"failed to load python module: {e}")


if __name__ == "__main__":
    benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__))

    branch, commit_id, commit_msg = parse_arguments()

    for entry in os.scandir(benchmarks_folder_path):
        try:
            if not entry.name.endswith(".py"):
                continue
            if entry.path == __file__:
                continue
            logger.debug(f"loading: {entry.name}")
            module = import_from_path(entry.name.split(".")[0], entry.path)
            logger.info(f"runnning benchmarks in: {entry.name}")
            module.run_benchmark(logger, branch, commit_id, commit_msg)
        except ImportModuleException as e:
            logger.error(e)
        except Exception as e:
            logger.error(f"error running benchmarks for {entry.name}: {e}")
10  benchmark/default.yml  Normal file
@@ -0,0 +1,10 @@
apiVersion: 1

providers:
  - name: 'Transformers Benchmarks'
    orgId: 1
    type: file
    updateIntervalSeconds: 10
    allowUiUpdates: true
    options:
      path: /etc/grafana/dashboards
@ -30,7 +30,7 @@
|
||||
"title": "Go to data",
|
||||
"tooltip": "Go to data",
|
||||
"type": "link",
|
||||
"url": "http://transformers-benchmarks.huggingface.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
|
||||
"url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}"
|
||||
}
|
||||
],
|
||||
"liveNow": true,
|
||||
@ -77,7 +77,7 @@
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.width",
|
||||
"value": 196
|
||||
"value": 202
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -101,7 +101,7 @@
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.width",
|
||||
"value": 581
|
||||
"value": 524
|
||||
}
|
||||
]
|
||||
},
|
||||
@ -113,7 +113,19 @@
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.width",
|
||||
"value": 379
|
||||
"value": 353
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"matcher": {
|
||||
"id": "byName",
|
||||
"options": "model_id"
|
||||
},
|
||||
"properties": [
|
||||
{
|
||||
"id": "custom.width",
|
||||
"value": 216
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -143,12 +155,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"type": "grafana-postgresql-datasource"
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT commit_id as commit_id, commit_message, gpu_name, created_at AS date FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -306,13 +320,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -431,13 +446,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -565,13 +581,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -686,13 +703,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -807,13 +825,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -928,13 +947,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -1062,13 +1082,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -1183,13 +1204,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -1304,13 +1326,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -1425,13 +1448,14 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
"rawQuery": true,
|
||||
"rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND gpu_name = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};",
|
||||
"refId": "A",
|
||||
"sql": {
|
||||
"columns": [
|
||||
@ -1480,11 +1504,7 @@
|
||||
"id": 15,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"datasource": {},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1528,8 +1548,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@ -1563,8 +1582,9 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
@ -1665,11 +1685,7 @@
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"datasource": {},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1713,8 +1729,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@ -1748,8 +1763,9 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
@ -1850,11 +1866,7 @@
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"datasource": {},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -1898,8 +1910,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@ -1933,8 +1944,9 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
@ -2035,11 +2047,7 @@
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"datasource": {},
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"color": {
|
||||
@ -2083,8 +2091,7 @@
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{
|
||||
"color": "green",
|
||||
"value": null
|
||||
"color": "green"
|
||||
},
|
||||
{
|
||||
"color": "red",
|
||||
@ -2118,8 +2125,9 @@
|
||||
"targets": [
|
||||
{
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "bdz2yss7sxo1sc"
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"editorMode": "code",
|
||||
"format": "table",
|
||||
@ -2224,7 +2232,6 @@
|
||||
"type": "row"
|
||||
}
|
||||
],
|
||||
"refresh": "",
|
||||
"schemaVersion": 39,
|
||||
"tags": [],
|
||||
"templating": {
|
||||
@ -2236,6 +2243,7 @@
|
||||
"value": "main"
|
||||
},
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
@ -2248,7 +2256,7 @@
|
||||
"name": "branch",
|
||||
"options": [],
|
||||
"query": "SELECT DISTINCT branch FROM benchmarks;",
|
||||
"refresh": 2,
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
@ -2261,6 +2269,7 @@
|
||||
"value": "1729701492845"
|
||||
},
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
@ -2281,10 +2290,11 @@
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"text": "1730120430069",
|
||||
"value": "1730120430069"
|
||||
"text": "1730393397577",
|
||||
"value": "1730393397577"
|
||||
},
|
||||
"datasource": {
|
||||
"default": true,
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
@ -2312,15 +2322,16 @@
|
||||
"type": "grafana-postgresql-datasource",
|
||||
"uid": "be28nkzirtb0gd"
|
||||
},
|
||||
"definition": "SELECT DISTINCT gpu_name FROM benchmarks;",
|
||||
"definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
|
||||
"description": "",
|
||||
"hide": 0,
|
||||
"includeAll": false,
|
||||
"label": "GPU",
|
||||
"multi": false,
|
||||
"name": "gpu_name",
|
||||
"options": [],
|
||||
"query": "SELECT DISTINCT gpu_name FROM benchmarks;",
|
||||
"refresh": 2,
|
||||
"query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;",
|
||||
"refresh": 1,
|
||||
"regex": "",
|
||||
"skipUrlSync": false,
|
||||
"sort": 0,
|
||||
@ -2328,7 +2339,7 @@
|
||||
},
|
||||
{
|
||||
"current": {
|
||||
"selected": false,
|
||||
"selected": true,
|
||||
"text": "10",
|
||||
"value": "10"
|
||||
},
|
||||
@ -2359,6 +2370,6 @@
|
||||
"timezone": "browser",
|
||||
"title": "Transformers benchmarks",
|
||||
"uid": "fdz33iyzln9c0a",
|
||||
"version": 4,
|
||||
"version": 10,
|
||||
"weekStart": ""
|
||||
}
|
||||
|
17  benchmark/grafana_datasource.yaml  Normal file
@@ -0,0 +1,17 @@
apiVersion: 1
datasources:
  - name: grafana-postgresql-datasource
    uid: be28nkzirtb0gd
    type: postgres
    url: $GRAFANA_POSTGRES_DATASOURCE_URL
    user: $GRAFANA_POSTGRES_DATASOURCE_USER
    secureJsonData:
      password: $GRAFANA_POSTGRES_DATASOURCE_PWD
    jsonData:
      database: metrics
      maxOpenConns: 100
      maxIdleConns: 100
      maxIdleConnsAuto: true
      connMaxLifetime: 14400
      postgresVersion: 1000
      timescaledb: false
@@ -3,7 +3,7 @@ CREATE TABLE IF NOT EXISTS benchmarks (
    branch VARCHAR(255),
    commit_id VARCHAR(72),
    commit_message VARCHAR(70),
    gpu_name VARCHAR(255),
    metadata jsonb,
    created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC')
);
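Since the per-benchmark GPU and model now live in the `metadata` jsonb column rather than a dedicated `gpu_name` column, queries filter on the JSON field, as the updated Grafana panels below do. A small illustrative query (the GPU name is a placeholder):

```sql
-- Latest benchmarks for one GPU, pulling model_id out of the jsonb metadata
SELECT benchmark_id, commit_id, metadata->>'model_id' AS model_id, created_at
FROM benchmarks
WHERE branch = 'main' AND metadata->>'gpu_name' = 'NVIDIA A10G'
ORDER BY benchmark_id DESC
LIMIT 10;
```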
@ -1,71 +1,25 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from logging import Logger
|
||||
import os
|
||||
import sys
|
||||
from statistics import mean
|
||||
from threading import Event, Thread
|
||||
from time import perf_counter, sleep
|
||||
from typing import Optional
|
||||
from benchmarks_entrypoint import MetricsRecorder
|
||||
import gpustat
|
||||
import psutil
|
||||
import psycopg2
|
||||
import torch
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache
|
||||
from psycopg2.extras import Json
|
||||
from psycopg2.extensions import register_adapter
|
||||
|
||||
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
handler = logging.StreamHandler(sys.stdout)
|
||||
handler.setLevel(logging.INFO)
|
||||
formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s")
|
||||
handler.setFormatter(formatter)
|
||||
logger.addHandler(handler)
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "1"
|
||||
torch.set_float32_matmul_precision("high")
|
||||
register_adapter(dict, Json)
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
"""
|
||||
Parse command line arguments for the benchmarking CLI.
|
||||
"""
|
||||
parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.")
|
||||
|
||||
parser.add_argument(
|
||||
"branch",
|
||||
type=str,
|
||||
help="The branch name on which the benchmarking is performed.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"commit_id",
|
||||
type=str,
|
||||
help="The commit hash on which the benchmarking is performed.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
"commit_msg",
|
||||
type=str,
|
||||
help="The commit message associated with the commit, truncated to 70 characters.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
return args.branch, args.commit_id, args.commit_msg
|
||||
|
||||
|
||||
def collect_metrics(benchmark_id, continue_metric_collection):
|
||||
def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder):
|
||||
p = psutil.Process(os.getpid())
|
||||
conn = psycopg2.connect("dbname=metrics")
|
||||
cur = conn.cursor()
|
||||
while not continue_metric_collection.is_set():
|
||||
with p.oneshot():
|
||||
cpu_util = p.cpu_percent()
|
||||
@ -73,47 +27,41 @@ def collect_metrics(benchmark_id, continue_metric_collection):
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_util = gpu_stats[0]["utilization.gpu"]
|
||||
gpu_mem_megabytes = gpu_stats[0]["memory.used"]
|
||||
cur.execute(
|
||||
"INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)",
|
||||
(benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
|
||||
metrics_recorder.collect_device_measurements(
|
||||
benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes
|
||||
)
|
||||
sleep(0.01)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
|
||||
def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
|
||||
def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100):
|
||||
continue_metric_collection = Event()
|
||||
metrics_thread = None
|
||||
model_id = "meta-llama/Llama-2-7b-hf"
|
||||
metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)
|
||||
try:
|
||||
gpu_stats = gpustat.GPUStatCollection.new_query()
|
||||
gpu_name = gpu_stats[0]["name"]
|
||||
conn = psycopg2.connect("dbname=metrics")
|
||||
cur = conn.cursor()
|
||||
cur.execute(
|
||||
"INSERT INTO benchmarks (branch, commit_id, commit_message, gpu_name) VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
|
||||
(branch, commit_id, commit_msg, gpu_name),
|
||||
benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id})
|
||||
logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}")
|
||||
metrics_thread = Thread(
|
||||
target=collect_metrics,
|
||||
args=[benchmark_id, continue_metric_collection, metrics_recorder],
|
||||
)
|
||||
conn.commit()
|
||||
benchmark_id = cur.fetchone()[0]
|
||||
logger.info(f"running benchmark #{benchmark_id} on {gpu_name}")
|
||||
metrics_thread = Thread(target=collect_metrics, args=[benchmark_id, continue_metric_collection])
|
||||
metrics_thread.start()
|
||||
logger.info("started background thread to fetch device metrics")
|
||||
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling
|
||||
|
||||
device = "cuda"
|
||||
ckpt = "meta-llama/Llama-2-7b-hf"
|
||||
|
||||
logger.info("downloading weights")
|
||||
# This is to avoid counting download in model load time measurement
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)
|
||||
gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1)
|
||||
logger.info("loading model")
|
||||
start = perf_counter()
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
ckpt, torch_dtype=torch.float16, generation_config=gen_config
|
||||
model_id, torch_dtype=torch.float16, generation_config=gen_config
|
||||
).eval()
|
||||
model.to(device)
|
||||
torch.cuda.synchronize()
|
||||
@ -121,7 +69,7 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
|
||||
model_load_time = end - start
|
||||
logger.info(f"loaded model in: {model_load_time}s")
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
|
||||
prompt = "Why dogs are so cute?"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(device)
|
||||
@ -368,41 +316,27 @@ def run_benchmark(branch: str, commit_id: str, commit_msg: str, num_tokens_to_ge
|
||||
logger.info(f"completed second compile generation in: {fourth_compile_generate_time}s")
|
||||
logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}")
|
||||
|
||||
cur.execute(
|
||||
"""
|
||||
INSERT INTO model_measurements (
|
||||
benchmark_id,
|
||||
measurements
|
||||
) VALUES (%s, %s)
|
||||
""",
|
||||
(
|
||||
benchmark_id,
|
||||
{
|
||||
"model_load_time": model_load_time,
|
||||
"first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
|
||||
"second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
|
||||
"first_eager_generate_time_secs": first_eager_generate_time,
|
||||
"second_eager_generate_time_secs": second_eager_generate_time,
|
||||
"time_to_first_token_secs": time_to_first_token,
|
||||
"time_to_second_token_secs": time_to_second_token,
|
||||
"time_to_third_token_secs": time_to_third_token,
|
||||
"time_to_next_token_mean_secs": mean_time_to_next_token,
|
||||
"first_compile_generate_time_secs": first_compile_generate_time,
|
||||
"second_compile_generate_time_secs": second_compile_generate_time,
|
||||
"third_compile_generate_time_secs": third_compile_generate_time,
|
||||
"fourth_compile_generate_time_secs": fourth_compile_generate_time,
|
||||
},
|
||||
),
|
||||
metrics_recorder.collect_model_measurements(
|
||||
benchmark_id,
|
||||
{
|
||||
"model_load_time": model_load_time,
|
||||
"first_eager_forward_pass_time_secs": first_eager_fwd_pass_time,
|
||||
"second_eager_forward_pass_time_secs": second_eager_fwd_pass_time,
|
||||
"first_eager_generate_time_secs": first_eager_generate_time,
|
||||
"second_eager_generate_time_secs": second_eager_generate_time,
|
||||
"time_to_first_token_secs": time_to_first_token,
|
||||
"time_to_second_token_secs": time_to_second_token,
|
||||
"time_to_third_token_secs": time_to_third_token,
|
||||
"time_to_next_token_mean_secs": mean_time_to_next_token,
|
||||
"first_compile_generate_time_secs": first_compile_generate_time,
|
||||
"second_compile_generate_time_secs": second_compile_generate_time,
|
||||
"third_compile_generate_time_secs": third_compile_generate_time,
|
||||
"fourth_compile_generate_time_secs": fourth_compile_generate_time,
|
||||
},
|
||||
)
|
||||
conn.commit()
|
||||
conn.close()
|
||||
except Exception as e:
|
||||
logger.error(f"Caught exception: {e}")
|
||||
continue_metric_collection.set()
|
||||
if metrics_thread is not None:
|
||||
metrics_thread.join()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
branch, commit_id, commit_msg = parse_arguments()
|
||||
run_benchmark(branch, commit_id, commit_msg, num_tokens_to_generate=20)
|
||||
metrics_recorder.close()
|
||||
|
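The script now delegates all database access to `MetricsRecorder` from `benchmarks_entrypoint`, which is not shown in this compare view. Based only on how it is called above and on the SQL the removed inline code used, its interface plausibly looks like the sketch below; treat it as an assumption about the shape of the API, not the actual implementation in `benchmark/benchmarks_entrypoint.py`.

```python
# Hypothetical sketch of the MetricsRecorder interface the benchmark script relies on.
# Assumes register_adapter(dict, Json) has been called so plain dicts map to jsonb columns.
from logging import Logger


class MetricsRecorder:
    def __init__(self, connection, logger: Logger, branch: str, commit_id: str, commit_msg: str):
        self.conn = connection
        self.logger = logger
        self.branch = branch
        self.commit_id = commit_id
        self.commit_msg = commit_msg

    def initialise_benchmark(self, metadata: dict) -> int:
        """Create a benchmarks row (gpu_name/model_id go into the jsonb metadata) and return its id."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) "
                "VALUES (%s, %s, %s, %s) RETURNING benchmark_id",
                (self.branch, self.commit_id, self.commit_msg, metadata),
            )
            benchmark_id = cur.fetchone()[0]
        self.conn.commit()
        return benchmark_id

    def collect_device_measurements(self, benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes):
        """Store one CPU/GPU utilisation sample, as the background thread does every 10 ms."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) "
                "VALUES (%s, %s, %s, %s, %s)",
                (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes),
            )
        self.conn.commit()

    def collect_model_measurements(self, benchmark_id, measurements: dict):
        """Store the dictionary of latency measurements for a finished run."""
        with self.conn.cursor() as cur:
            cur.execute(
                "INSERT INTO model_measurements (benchmark_id, measurements) VALUES (%s, %s)",
                (benchmark_id, measurements),
            )
        self.conn.commit()

    def close(self):
        self.conn.close()
```

As the diff shows, the recorder is constructed in `run_benchmark` as `MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg)`, shared with the metrics thread, and closed once the run finishes.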
@@ -1,4 +1,4 @@
FROM rocm/dev-ubuntu-22.04:6.0.2
FROM rocm/dev-ubuntu-22.04:6.1
# rocm/pytorch has no version with 2.1.0
LABEL maintainer="Hugging Face"

@@ -11,7 +11,7 @@ RUN apt update && \

RUN python3 -m pip install --no-cache-dir --upgrade pip numpy

RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0
RUN python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.1

RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"

@@ -30,5 +30,5 @@ RUN python3 -m pip uninstall -y tensorflow flax
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

# Remove nvml as it is not compatible with ROCm. apex is not tested on NVIDIA either.
RUN python3 -m pip uninstall py3nvml pynvml apex -y
# Remove nvml and nvidia-ml-py as it is not compatible with ROCm. apex is not tested on NVIDIA either.
RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y

@@ -50,6 +50,9 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
# Add aqlm for quantization testing
RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2

# Add vptq for quantization testing
RUN python3 -m pip install --no-cache-dir vptq

# Add hqq for quantization testing
RUN python3 -m pip install --no-cache-dir hqq

@@ -66,6 +69,10 @@ RUN python3 -m pip install --no-cache-dir optimum-quanto
# Add eetq for quantization testing
RUN python3 -m pip install git+https://github.com/NetEase-FuXi/EETQ.git

# Add flute-kernel and fast_hadamard_transform for quantization testing
RUN python3 -m pip install --no-cache-dir flute-kernel==0.3.0 -i https://flute-ai.github.io/whl/cu118
RUN python3 -m pip install --no-cache-dir fast_hadamard_transform==1.0.4.post1

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
@ -30,26 +30,26 @@
|
||||
- local: conversations
|
||||
title: الدردشة مع المحولات
|
||||
title: البرامج التعليمية
|
||||
# - sections:
|
||||
# - isExpanded: false
|
||||
# sections:
|
||||
- sections:
|
||||
- isExpanded: false
|
||||
sections:
|
||||
# - local: tasks/sequence_classification
|
||||
# title: تصنيف النصوص
|
||||
# - local: tasks/token_classification
|
||||
# title: تصنيف الرموز
|
||||
# - local: tasks/question_answering
|
||||
# title: الإجابة على الأسئلة
|
||||
- local: tasks/question_answering
|
||||
title: الإجابة على الأسئلة
|
||||
# - local: tasks/language_modeling
|
||||
# title: نمذجة اللغة السببية
|
||||
# - local: tasks/masked_language_modeling
|
||||
# title: نمذجة اللغة المقنعة
|
||||
# - local: tasks/translation
|
||||
# title: الترجمة
|
||||
# - local: tasks/summarization
|
||||
# title: التلخيص
|
||||
# - local: tasks/multiple_choice
|
||||
# title: الاختيار المتعدد
|
||||
# title: معالجة اللغات الطبيعية
|
||||
- local: tasks/translation
|
||||
title: الترجمة
|
||||
- local: tasks/summarization
|
||||
title: التلخيص
|
||||
- local: tasks/multiple_choice
|
||||
title: الاختيار المتعدد
|
||||
title: معالجة اللغات الطبيعية
|
||||
# - isExpanded: false
|
||||
# sections:
|
||||
# - local: tasks/audio_classification
|
||||
@ -107,7 +107,7 @@
|
||||
# - local: tasks/prompting
|
||||
# title: دليل إرشادي لمحفزات النماذج اللغوية الكبيرة
|
||||
# title: الإرشاد
|
||||
# title: أدلة المهام
|
||||
title: أدلة المهام
|
||||
- sections:
|
||||
- local: fast_tokenizers
|
||||
title: استخدم مجزئيات النصوص السريعة من 🤗 Tokenizers
|
||||
@ -133,12 +133,18 @@
|
||||
title: المعايير
|
||||
- local: notebooks
|
||||
title: دفاتر الملاحظات مع الأمثلة
|
||||
# - local: community
|
||||
# title: موارد المجتمع
|
||||
- local: community
|
||||
title: موارد المجتمع
|
||||
- local: troubleshooting
|
||||
title: استكشاف الأخطاء وإصلاحها
|
||||
- local: gguf
|
||||
title: التوافق مع ملفات GGUF
|
||||
- local: tiktoken
|
||||
title: التوافق مع ملفات TikToken
|
||||
- local: modular_transformers
|
||||
title: الوحدات النمطية في `transformers`
|
||||
- local: how_to_hack_models
|
||||
title: اختراق النموذج (الكتابة فوق فئة لاستخدامك)
|
||||
title: أدلة المطورين
|
||||
# - sections:
|
||||
# - local: quantization/overview
|
||||
@ -151,6 +157,8 @@
|
||||
# title: AWQ
|
||||
# - local: quantization/aqlm
|
||||
# title: AQLM
|
||||
# - local: quantization/vptq
|
||||
# title: VPTQ
|
||||
# - local: quantization/quanto
|
||||
# title: Quanto
|
||||
# - local: quantization/eetq
|
||||
|
docs/source/ar/community.md  (new file, 66 lines)
@@ -0,0 +1,66 @@
|
||||
# مجتمع المطورين
|
||||
|
||||
هذه الصفحة تجمع الموارد حول 🤗 Transformers التي طورها المجتمع.
|
||||
|
||||
## موارد المجتمع:
|
||||
|
||||
| المصدر | الوصف | المؤلف |
|
||||
|:----------|:-------------|------:|
|
||||
| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | مجموعة من البطاقات التعليمية القائمة على [Transformers Docs Glossary](glossary) والتي تم وضعها في شكل يمكن تعلمه/مراجعته بسهولة باستخدام [Anki](https://apps.ankiweb.net/) وهو تطبيق مفتوح المصدر متعدد المنصات مصمم خصيصًا للاحتفاظ بالمعرفة على المدى الطويل. شاهد هذا [فيديو تمهيدي حول كيفية استخدام البطاقات التعليمية](https://www.youtube.com/watch?v=Dji_7PILrw). | [Darigov Research](https://www.darigovresearch.com/) |
|
||||
|
||||
## دفاتر ملاحظات المجتمع:
|
||||
|
||||
| الدفتر | الوصف | المؤلف | |
|
||||
|:----------|:-------------|:-------------|------:|
|
||||
| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | كيفية توليد كلمات الأغاني على غرار فنانك المفضل من خلال ضبط نموذج GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
|
||||
| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | كيفية تدريب T5 لأي مهمة باستخدام Tensorflow 2. يوضح هذا الدفتر مهمة السؤال والجواب المنفذة في Tensorflow 2 باستخدام SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
|
||||
| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | كيفية تدريب T5 على SQUAD مع Transformers و Nlp | [Suraj Patil](https://github.com/patil-suraj) |[](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
|
||||
| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | كيفية ضبط نموذج T5 للتصنيف والمهام متعددة الخيارات باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
|
||||
| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | كيفية ضبط نموذج DialoGPT على مجموعة بيانات جديدة لروبوتات الدردشة المحادثية المفتوحة | [Nathan Cooper](https://github.com/ncoop57) | [](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
|
||||
| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | كيفية التدريب على تسلسلات طويلة تصل إلى 500,000 رمز باستخدام Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) |
|
||||
| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | كيفية ضبط نموذج BART للتلخيص باستخدام fastai باستخدام blurr | [Wayde Gilliam](https://ohmeow.com/) | [](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) |
|
||||
| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | كيفية توليد تغريدات على غرار حساب Twitter المفضل لديك من خلال ضبط نموذج GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
|
||||
| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | دليل كامل لعرض تكامل W&B مع Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) |
|
||||
| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | كيفية بناء نسخة "طويلة" من النماذج المسبقة التدريب الموجودة | [Iz Beltagy](https://beltagy.net) | [](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
|
||||
| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | كيفية ضبط نموذج Longformer لمهمة QA | [Suraj Patil](https://github.com/patil-suraj) | [](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
|
||||
| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | كيفية تقييم نموذج Longformer على TriviaQA مع `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
|
||||
| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | كيفية ضبط نموذج T5 لاستخراج المشاعر باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) |
|
||||
| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | كيفية ضبط نموذج DistilBert للتصنيف متعدد الفئات باستخدام PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)|
|
||||
|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|كيفية ضبط نموذج BERT للتصنيف متعدد التصنيفات باستخدام PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|
|
||||
|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|كيفية ضبط نموذج T5 للتلخيص في PyTorch وتتبع التجارب باستخدام WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|
|
||||
|[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|كيفية تسريع الضبط الدقيق بعامل 2 باستخدام الضبط الديناميكي/التقسيم|[Michael Benesty](https://github.com/pommedeterresautee) |[](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
|
||||
|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| كيفية تدريب نموذج Reformer مع طبقات الانتباه ثنائية الاتجاه | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
|
||||
|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| كيفية زيادة مفردات نموذج SciBERT المسبق التدريب من AllenAI على مجموعة بيانات CORD وإنشاء خط أنابيب لها. | [Tanmay Thakur](https://github.com/lordtt13) | [](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
|
||||
|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| كيفية ضبط نموذج BlenderBotSmall للتلخيص على مجموعة بيانات مخصصة، باستخدام واجهة برمجة التطبيقات Trainer. | [Tanmay Thakur](https://github.com/lordtt13) | [](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)|
|
||||
|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | كيفية ضبط نموذج Electra للتحليل العاطفي وتفسير التنبؤات باستخدام Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
|
||||
|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | كيفية ضبط نموذج GPT-2 غير الإنجليزي باستخدام فئة Trainer | [Philipp Schmid](https://www.philschmid.de) | [](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
|
||||
|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | كيفية ضبط نموذج DistilBERT لمهمة التصنيف متعدد التصنيفات | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
|
||||
|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | كيفية ضبط نموذج ALBERT أو أي نموذج آخر قائم على BERT لمهمة التصنيف المزدوج للجمل | [Nadir El Manouzi](https://github.com/NadirEM) | [](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
|
||||
|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | كيفية ضبط نموذج Roberta للتحليل العاطفي | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
|
||||
|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | ما مدى دقة الإجابات على الأسئلة التي يولدها نموذجك التحويلي seq2seq؟ | [Pascal Zoleko](https://github.com/zolekode) | [](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
|
||||
|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | كيفية ضبط نموذج DistilBERT للتصنيف النصي في TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
|
||||
|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* مع نقطة تفتيش *google-bert/bert-base-uncased* للتلخيص على CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
|
||||
|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* المشترك مع نقطة تفتيش *FacebookAI/roberta-base* للتلخيص على BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
|
||||
|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | كيفية ضبط نموذج *TapasForQuestionAnswering* مع نقطة تفتيش *tapas-base* على مجموعة بيانات Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
|
||||
|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | كيفية تقييم نموذج *TapasForSequenceClassification* المضبوط مسبقًا مع نقطة تفتيش *tapas-base-finetuned-tabfact* باستخدام مزيج من مكتبتي 🤗 datasets و 🤗 transformers | [Niels Rogge](https://github.com/nielsrogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
|
||||
|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | كيفية ضبط نموذج mBART باستخدام Seq2SeqTrainer للترجمة من الهندية إلى الإنجليزية | [Vasudev Gupta](https://github.com/vasudevgupta7) | [](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
|
||||
|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | كيفية ضبط نموذج *LayoutLMForTokenClassification* على مجموعة بيانات FUNSD لاستخراج المعلومات من المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)|
|
||||
|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | كيفية ضبط نموذج DistilGPT2 وتوليد النص | [Aakash Tripathi](https://github.com/tripathiaakash) | [](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)|
|
||||
|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | كيفية ضبط نموذج LED على pubmed للتلخيص طويل المدى | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
|
||||
|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | كيفية تقييم نموذج LED للتلخيص طويل المدى بشكل فعال | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
|
||||
|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | كيفية ضبط نموذج *LayoutLMForSequenceClassification* على مجموعة بيانات RVL-CDIP لتصنيف المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
|
||||
|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | كيفية فك تشفير تسلسل CTC مع تعديل نموذج اللغة | [Eric Lam](https://github.com/voidful) | [](https://colab.research.google.com/drive/1e_zQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
|
||||
|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | كيفية ضبط نموذج BART للتلخيص بلغتين باستخدام فئة Trainer | [Eliza Szczechla](https://github.com/elsanns) | [](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
|
||||
|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | كيفية تقييم نموذج BigBird للأسئلة والأجوبة على وثائق طويلة على Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
|
||||
| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | كيفية إنشاء تعليقات توضيحية على YouTube من أي فيديو من خلال تفريغ الصوت باستخدام Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) |
|
||||
| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) |
|
||||
| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) |
|
||||
| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | كيفية تقييم نموذج *LukeForEntityClassification* على مجموعة بيانات Open Entity | [Ikuya Yamada](https://github.com/ikuyamada) |[](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) |
|
||||
| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | كيفية تقييم نموذج *LukeForEntityPairClassification* على مجموعة بيانات TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) |
|
||||
| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | كيفية تقييم نموذج *LukeForEntitySpanClassification* على مجموعة بيانات CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) |
|
||||
| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | كيفية تقييم نموذج *BigBirdPegasusForConditionalGeneration* على مجموعة بيانات PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) |
|
||||
| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | كيفية استخدام نموذج Wav2Vec2 المسبق التدريب لتصنيف المشاعر على مجموعة بيانات MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
|
||||
| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | كيفية استخدام نموذج *DetrForObjectDetection* المدرب للكشف عن الأجسام في صورة وتصوير الانتباه | [Niels Rogge](https://github.com/NielsRogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
|
||||
| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | كيفية ضبط نموذج *DetrForObjectDetection* على مجموعة بيانات الكشف عن الأجسام المخصصة | [Niels Rogge](https://github.com/NielsRogge) | [](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
|
||||
| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | كيفية ضبط نموذج *T5* على مهمة التعرف على الكيانات المسماة | [Ogundepo Odunayo](https://github.com/ToluClassics) | [](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) |
|
||||
| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | كيفية استخدام [QLoRA](https://github.com/artidoro/qlora) و [PEFT](https://huggingface.co/docs/peft/en/index) لضبط نموذج LLM بطريقة فعالة من حيث الذاكرة، مع استخدام [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) لإدارة تتبع التجارب | [Yuki Watanabe](https://github.com/B-Step62) | [](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) |
|
docs/source/ar/how_to_hack_models.md  (new file, 163 lines)
@@ -0,0 +1,163 @@
|
||||
# كيفية تعديل أي نموذج من نماذج Transformers
|
||||
|
||||
توفر مكتبة [🤗 Transformers](https://github.com/huggingface/transformers) مجموعة من النماذج المسبقة التدريب والأدوات لمعالجة اللغات الطبيعية، والرؤية، وما إلى ذلك. على الرغم من أن هذه النماذج تغطي مجموعة واسعة من التطبيقات، فقد تواجه حالات استخدام لا تدعمها المكتبة بشكل افتراضي. يُمكن للتخصيص أن يفتح إمكانيات جديدة، مثل إضافة طبقات جديدة، أو تعديل البنية المعمارية، أو تحسين آليات الانتباه. سيُوضح لك هذا الدليل كيفية تعديل نماذج Transformers الموجودة لتلبية احتياجاتك المحددة. الشيء الرائع هو أنك لست بحاجة إلى الخروج من إطار عمل Transformers لإجراء هذه التغييرات. ي يمكنك تعديل النماذج مباشرةً في Transformers والاستفادة من الميزات مثل [واجهة برمجة التطبيقات Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer)، و [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel)، والضبط الدقيق الفعال باستخدام أدوات مثل [PEFT](https://huggingface.co/docs/peft/index).
|
||||
|
||||
سنرشدك في هذا الدليل لكيفية تخصيص نماذج Transformers الموجودة لتلبية متطلباتك، دون فقدان مزايا الإطار. ستتعلم كيفية:
|
||||
|
||||
- تعديل بنية نموذج ما من خلال تغيير آلية الانتباه الخاصة به.
|
||||
- تطبيق تقنيات مثل Low-Rank Adaptation (LoRA) على مكونات نموذج محددة.
|
||||
|
||||
نحن نشجعك على المساهمة باختراقاتك الخاصة ومشاركتها هنا مع المجتمع!
|
||||
|
||||
## مثال: تعديل آلية الانتباه في نموذج Segment Anything (SAM)
|
||||
|
||||
نموذج **Segment Anything (SAM)** هو نموذج رائد في مجال تجزئة الصور. في تنفيذه الافتراضي، يستخدم SAM إسقاطًا مجمعًا للاستعلام والمفتاح والقيمة (`qkv`) في آلية الانتباه الخاصة به. ومع ذلك، قد ترغب في ضبط مكونات محددة فقط من آلية الانتباه، مثل إسقاطات الاستعلام (`q`) والقيمة (`v`)، لتقليل عدد المعلمات القابلة للتدريب والموارد الحسابية المطلوبة.
|
||||
|
||||
### الدافع
|
||||
|
||||
من خلال تقسيم الإسقاط المجمع `qkv` إلى إسقاطات منفصلة `q` و `k` و `v`، يمكنك تطبيق تقنيات مثل **LoRA** (Low-Rank Adaptation) على إسقاطي `q` و `v` فقط. يسمح لك هذا بما يلي:
|
||||
|
||||
- ضبط عدد أقل من المعلمات، مما يقلل من العبء الحسابي.
|
||||
- تحقيق أداء أفضل من خلال التركيز على مكونات محددة.
|
||||
- تجربة استراتيجيات تعديل مختلفة في آلية الانتباه.
|
||||
|
||||
### التنفيذ
|
||||
|
||||
#### **الخطوة 1: إنشاء فئة اهتمام مخصصة**
|
||||
|
||||
بعد ذلك، قم بإنشاء فئة فرعية من فئة `SamVisionAttention` الأصلية وعدلها لتضم إسقاطات `q` و `k` و `v` منفصلة.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers.models.sam.modeling_sam import SamVisionAttention
|
||||
|
||||
class SamVisionAttentionSplit(SamVisionAttention, nn.Module):
|
||||
def __init__(self, config, window_size):
|
||||
super().__init__(config, window_size)
|
||||
del self.qkv
|
||||
# إسقاطات منفصلة q و k و v
|
||||
self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias)
|
||||
self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias)
|
||||
self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias)
|
||||
self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook)
|
||||
|
||||
def split_q_k_v_load_hook(self, state_dict, prefix, *args):
|
||||
keys_to_delete = []
|
||||
for key in list(state_dict.keys()):
|
||||
if "qkv." in key:
|
||||
# تقسيم q و k و v من الإسقاط المجمع
|
||||
q, k, v = state_dict[key].chunk(3, dim=0)
|
||||
# استبدال الإسقاطات الفردية q و k و v
|
||||
state_dict[key.replace("qkv.", "q.")] = q
|
||||
state_dict[key.replace("qkv.", "k.")] = k
|
||||
state_dict[key.replace("qkv.", "v.")] = v
|
||||
# وضع علامة على مفتاح qkv القديم للحذف
|
||||
keys_to_delete.append(key)
|
||||
|
||||
# حذف مفاتيح qkv القديمة
|
||||
for key in keys_to_delete:
|
||||
del state_dict[key]
|
||||
|
||||
def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor:
|
||||
batch_size, height, width, _ = hidden_states.shape
|
||||
qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1)
|
||||
query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes)
|
||||
key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes)
|
||||
value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes)
|
||||
|
||||
attn_weights = (query * self.scale) @ key.transpose(-2, -1)
|
||||
|
||||
if self.use_rel_pos:
|
||||
attn_weights = self.add_decomposed_rel_pos(
|
||||
attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
|
||||
)
|
||||
|
||||
attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
|
||||
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
||||
attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
|
||||
attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1)
|
||||
attn_output = self.proj(attn_output)
|
||||
|
||||
if output_attentions:
|
||||
outputs = (attn_output, attn_weights)
|
||||
else:
|
||||
outputs = (attn_output, None)
|
||||
return outputs
|
||||
```
|
||||
|
||||
**الشرح:**
|
||||
|
||||
- **الإسقاطات المنفصلة:** يتم إزالة الإسقاط المُجمع `qkv`، وإنشاء إسقاطات خطية منفصلة `q` و `k` و `v`.
|
||||
- **دالة استدعاء تحميل الأوزان:** تقوم طريقة `split_q_k_v_load_hook` بتقسيم أوزان `qkv` المسبقة التدريب إلى أوزان `q` و `k` و `v` منفصلة عند تحميل النموذج. يضمن هذا التوافق مع أي نموذج مسبق التدريب.
|
||||
- **التنفيذ الأمامي:** يتم حساب الاستعلامات والمفاتيح والقيم بشكل منفصل، وتستمر آلية الانتباه كالمعتاد.
|
||||
|
||||
#### **الخطوة 2: استبدال فئة الانتباه الأصلية**
|
||||
|
||||
استبدل فئة `SamVisionAttention` الأصلية بفئتك المخصصة بحيث يستخدم النموذج آلية الانتباه المعدلة.
|
||||
|
||||
```python
|
||||
from transformers import SamModel
|
||||
from transformers.models.sam import modeling_sam
|
||||
|
||||
# استبدال فئة الاهتمام في وحدة نمطية modeling_sam
|
||||
modeling_sam.SamVisionAttention = SamVisionAttentionSplit
|
||||
|
||||
# تحميل نموذج SAM المسبق التدريب
|
||||
model = SamModel.from_pretrained("facebook/sam-vit-base")
|
||||
```
|
||||
|
||||
**الشرح:**
|
||||
|
||||
- **استبدال الفئة:** من خلال تعيين فئتك المخصصة إلى `modeling_sam.SamVisionAttention`، فإن أي حالات من فئة `SamVisionAttention` في النموذج ستستخدم النسخة المعدلة. وبالتالي، عند استدعاء `SamModel`، سيتم استخدام `SamVisionAttentionSplit` المحددة حديثًا.
|
||||
- **تحميل النموذج:** يتم تحميل النموذج باستخدام `from_pretrained`، ويتم دمج آلية الانتباه المخصصة.
|
||||
|
||||
#### **الخطوة 3: تطبيق LoRA على إسقاطات محددة**
|
||||
|
||||
مع وجود إسقاطات `q` و `k` و `v` منفصلة، يمكنك الآن تطبيق LoRA على مكونات محددة، مثل إسقاطات `q` و `v`.
|
||||
|
||||
```python
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
config = LoraConfig(
|
||||
r=16,
|
||||
lora_alpha=32,
|
||||
target_modules=["q", "v"], # تطبيق LoRA على إسقاطات q و v
|
||||
lora_dropout=0.1,
|
||||
task_type="mask-generation"
|
||||
)
|
||||
|
||||
# تطبيق LoRA على النموذج
|
||||
model = get_peft_model(model, config)
|
||||
```
|
||||
|
||||
**الشرح:**
|
||||
|
||||
- **تكوين LoRA:** تحدد `LoraConfig` المرتبة `r`، وعامل القياس `lora_alpha`، والوحدات المستهدفة (`"q"` و `"v"`)، ومعدل التخلي، ونوع المهمة.
|
||||
- **تطبيق LoRA:** تقوم دالة `get_peft_model` بتطبيق LoRA على الوحدات المحددة في النموذج.
|
||||
- **تقليل المعلمات:** من خلال التركيز على `q` و `v`، فإنك تقلل عدد المعلمات القابلة للتدريب، مما يؤدي إلى تسريع التدريب وتقليل استخدام الذاكرة.
|
||||
|
||||
#### **الخطوة 4: التحقق من عدد المعلمات القابلة للتدريب**
|
||||
|
||||
من السهل التحقق من عدد المعلمات القابلة للتدريب ومعرفة تأثير تعديلك.
|
||||
|
||||
```python
|
||||
model.print_trainable_parameters()
|
||||
```
|
||||
|
||||
**الناتج المتوقع:**
|
||||
|
||||
```
|
||||
عدد المعلمات القابلة للتدريب: 608,256 || جميع المعلمات: 94,343,728 || نسبة المعلمات القابلة للتدريب: 0.6447
|
||||
عدد المعلمات القابلة للتدريب: 912,384 || جميع المعلمات: 94,647,856 || نسبة المعلمات القابلة للتدريب: 0.9640 # مع k
|
||||
```
|
||||
|
||||
## المساهمة بابداعاتك الخاصة
|
||||
|
||||
يمكن لتعديل النماذج المسبقة التدريب أن يفتح آفاقًا جديدة للبحث والتطبيق. من خلال فهم وتعديل الآليات الداخلية للنماذج مثل SAM، يمكنك تخصيصها لتلبية احتياجاتك المحددة، وتحسين الأداء، وتجربة أفكار جديدة.
|
||||
|
||||
إذا قمت بتطوير تعديﻻتك الخاصة لنماذج Transformers وترغب في مشاركتها، ففكر في المساهمة في هذه الوثيقة.
|
||||
|
||||
- **إنشاء طلب سحب (Pull Request):** شارك تغييراتك وتحسيناتك في التعليمات البرمجية مباشرة في المستودع.
|
||||
- **كتابة التوثيق:** قدم تفسيرات وأمثلة واضحة لتعديلاتك.
|
||||
- **التفاعل مع المجتمع:** ناقش أفكارك واحصل على تعليقات من المطورين والباحثين الآخرين من خلال فتح مشكلة.
|
docs/source/ar/modular_transformers.md  (new file, 184 lines)
@@ -0,0 +1,184 @@
|
||||
# المحولات النمطية
|
||||
|
||||
مكتبة `transformers` هي إطار عمل ذو فلسفة محدد؛ يتم تعريف فلسفتنا في [الدليل المفاهيمي](./philosophy).
|
||||
|
||||
جوهر هذه الفلسفة يتمثل في مبدأ [نموذج واحد، ملف واحد](https://huggingface.co/blog/transformers-design-philosophy)
|
||||
في المكتبة. الجانب السلبي لهذا المكون هو تقييده لوراثة واستيراد مكونات الملفات.
|
||||
|
||||
نتيجة لذلك، تتكرر مكونات النموذج عبر العديد من الملفات. يحتوي `transformers` على عدد كبير من طبقات الانتباه، يقارب عدد النماذج، والكثير منها متطابق. يتسبب هذا في تباعد عمليات التنفيذ المستقلة مع تطبيق الإصلاحات والتغييرات.
|
||||
على أجزاء محددة من التعليمات البرمجية.
|
||||
|
||||
ولمعالجة ذلك، اعتمدنا مفهوم "النسخ" في المكتبة. فبإضافة تعليق يُشير إلى أن التعليمات البرمجية هي نسخة من أخرى، نضمن من خلال أنظمة CI والأوامر المحلية عدم تباعد النسخ. لكن هذه العملية، رغم بساطتها، تُسبب إرهاقاً. كما أنها تزيد العبء على المساهمين، وهو ما نهدف إلى تجاوزه.
|
||||
|
||||
غالباً ما تتطلب مساهمات النماذج إضافة تعليمات برمجية (حوالي 1000 سطر)، ومعالج (حوالي 500 سطر)، واختبارات، ووثائق، إلخ. ونادراً ما تقل مساهمات النماذج عن 3000-5000 سطر من التعليمات البرمجية، معظمها أكواد نمطية. هذا يرفع مستوى المساهمات،
|
||||
|
||||
ونهدف مع المحولات النمطية إلى خفض هذا المستوى إلى حدّ مقبول.
|
||||
|
||||
## ما هو؟
|
||||
|
||||
تقدم المحولات النمطية مفهوم ملف "نمطي" لمجلد نموذج. يقبل هذا الملف النمطي تعليمات برمجية
|
||||
غير مقبولة عادة في ملفات النمذجة/المعالجة، حيث يسمح بالاستيراد من نماذج مجاورة وكذلك
|
||||
الوراثة من الفئات إلى فئات أخرى.
|
||||
|
||||
يعرّف هذا الملف النمطي النماذج والمعالجات وفئة التكوين التي سيتم تعريفها في وحداتهم
|
||||
المتعلقة.
|
||||
|
||||
وأخيرًا، يقدم هذا الميزة أداة `linter` جديدة والتي ستعمل على "تفكيك" الملف النمطي إلى بنية "نموذج واحد، ملف واحد"
|
||||
هيكل الدليل. سيتم إنشاء هذه الملفات تلقائيًا في كل مرة يتم فيها تشغيل البرنامج النصي؛ مما يقلل من المساهمات المطلوبة
|
||||
إلى الملف النمطي، وبالتالي فقط إلى التغييرات بين النموذج المساهم والنماذج الأخرى.
|
||||
|
||||
سيقوم مستخدمو النموذج في النهاية باستيراد واستخدام واجهة الملف الواحد، لذا لا يتوقع حدوث أي تغيير هنا. من خلال القيام بذلك،
|
||||
نأمل في الجمع بين أفضل ما في العالمين: تمكين المساهمات البسيطة مع الالتزام بفلسفتنا.
|
||||
|
||||
لذلك، هذا بديل لعلامات `# Copied from`، ويمكن توقع انتقال النماذج المساهمة سابقًا إلى
|
||||
تنسيق المحولات النمطية الجديد في الأشهر المقبلة.
|
||||
|
||||
### التفاصيل
|
||||
|
||||
تُبسط أداة "linter" الوراثة، مُنشئةً جميع الملفات المفردة من الملف النمطي، مع الحفاظ على شفافيتها أمام مستخدمي Python. حاليًا، تُبسط الأداة مستوىً واحدًا من الوراثة
|
||||
|
||||
على سبيل المثال:
|
||||
- إذا ورثت فئة التكوين من فئة أخرى وأضافت/حذفت معامل، فسيتم إما الإشارة إلى الملف المولد مباشرةً
|
||||
(في حالة الإضافة) أو إزالته تمامًا (في حالة الحذف).
|
||||
- إذا ورثت فئة من فئة أخرى، على سبيل المثال: `class GemmaModel(LlamaModel):`، تُستنتج التبعيات تلقائيًا
|
||||
سيتم استنتاج جميع الوحدات الفرعية تلقائيًا من الفئة الأصلية.
|
||||
- إذا قمت بتعريف وظائف جديدة في الملف `modular` واستخدمتها داخل الفئات، فستستنتج أداة linter ذلك تلقائيًا
|
||||
|
||||
يجب أن تكون قادرًا على كتابة كل شيء (المجزىء اللغوي، ومُعالِج الصور، والنموذج، والتكوين) في الملف `modular`، وسيتم إنشاء الملفات المُقابلة تلقائيًا.
|
||||
|
||||
### التطبيق
|
||||
|
||||
[TODO] نقدم اختبارًا جديدًا، للتأكد من أن المحتوى المولد يتطابق مع ما هو موجود في `modular_xxxx.py`
|
||||
|
||||
### الأمثلة
|
||||
|
||||
هنا مثال سريع باستخدام BERT و RoBERTa. النموذجان مرتبطان ارتباطًا وثيقًا: يختلف تنفيذهما النموذجي في طبقة تضمين.
|
||||
|
||||
بدلاً من إعادة تعريف النموذج بالكامل، إليك كيف يبدو ملف `modular_roberta.py` لفئات النمذجة والتكوين (لأغراض المثال، يتم تجاهل المجزىء اللغوي في هذا الوقت حيث أنه مختلف جدًا).
|
||||
|
||||
```python
|
||||
from torch import nn
|
||||
from ..bert.configuration_bert import BertConfig
|
||||
from ..bert.modeling_bert import (
|
||||
BertModel,
|
||||
BertEmbeddings,
|
||||
BertForMaskedLM
|
||||
)
|
||||
|
||||
# تكوين RoBERTa مطابق لتكوين BERT
|
||||
class RobertaConfig(BertConfig):
|
||||
model_type = 'roberta'
|
||||
|
||||
# نعيد تعريف الإضافات هنا لتسليط الضوء على اختلاف معرف الحشو، ونعيد تعريف الإضافات الموضعية
|
||||
class RobertaEmbeddings(BertEmbeddings):
|
||||
def __init__(self, config):
|
||||
super().__init__(config())
|
||||
|
||||
self.padding_idx = config.pad_token_id
|
||||
self.position_embeddings = nn.Embedding(
|
||||
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
|
||||
)
|
||||
|
||||
# نموذج RoBERTa مطابق لنموذج BERT، باستثناء طبقة الإضافات.
|
||||
# نعيد تعريف الإضافات أعلاه، لذا هنا لا توجد حاجة لعمل إضافي
|
||||
class RobertaModel(BertModel):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.embeddings = RobertaEmbeddings(config)
|
||||
|
||||
|
||||
# الرؤوس الآن تحتاج فقط إلى إعادة تعريف النموذج داخل `RobertaModel` الصحيح
|
||||
class RobertaForMaskedLM(BertForMaskedLM):
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.model = RobertaModel(config)
|
||||
```
|
||||
|
||||
لاحظ أنه إذا لم تستخدم الاعتماد الذي حددته، فستحصل على الخطأ التالي:
|
||||
|
||||
```bash
|
||||
ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used
|
||||
when you define `BertModel`, as it is one of it's direct dependencies. Make sure
|
||||
you use it in the `__init__` function.
|
||||
```
|
||||
|
||||
بالإضافة إلى ذلك، قد تجد قائمة بالأمثلة هنا:
|
||||
|
||||
## ما هو ليس كذلك
|
||||
|
||||
ليس بديلاً لتعليمات برمجة النمذجة (بعد؟)، وإذا لم يكن نموذجك يعتمد على أي شيء آخر موجود من قبل، فيمكنك إضافة ملف `نمذجة` كالعادة.
|
||||
|
||||
|
||||
## الاستخدام المتقدم
|
||||
|
||||
### إزالة السمات والوظائف
|
||||
لإزالة السمات التي لا تستخدم في نموذجك النمطي، والتي لا تريد رؤيتها في النمذجة المفككة:
|
||||
|
||||
```python
|
||||
class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel):
|
||||
def __init__(self, config): | def __init__(self, config):
|
||||
super().__init__(self, eos_token) | super().__init__(config)
|
||||
del self.embed_tokens | self.padding_idx = config.pad_token_id
|
||||
| self.vocab_size = config.vocab_size
|
||||
|
|
||||
| self.layers = nn.ModuleList(
|
||||
| [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
||||
| )
|
||||
| self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
||||
| self.rotary_emb = LlamaRotaryEmbedding(config=config)
|
||||
| self.gradient_checkpointing = False
|
||||
|
|
||||
| # Initialize weights and apply final processing
|
||||
| self.post_init()
|
||||
```
|
||||
إذا قمت بالتحقق من `LlamaModel` الأصلي، فستجد `embed_tokens` الذي تمت إزالته هنا (كما هو متوقع!)
|
||||
|
||||
إزالة وظيفة مشابهة، تحتاج فقط إلى كتابتها مع `raise ValueError("")` لمحاكاة السلوك الذي تريده فعليًا عند إزالة وظيفة أصلية في بايثون.
|
||||
|
||||
```python
|
||||
class GemmaTokenizer(LlamaTokenizer):
|
||||
...
|
||||
|
||||
def get_spm_processor(self):
|
||||
raise AttributeError("Not needed for Gemma")
|
||||
|
||||
def unk_token_length(self):
|
||||
raise AttributeError("Not needed for Gemma")
|
||||
```
|
||||
|
||||
### تعريف وظائف جديدة
|
||||
|
||||
إذا قمت بتعريف وظيفة جديدة في الملف `modular` لاستخدامها داخل فئة، على سبيل المثال
|
||||
|
||||
```python
|
||||
def my_new_function(*args, **kwargs):
|
||||
# Do something here
|
||||
pass
|
||||
|
||||
class GemmaModel(LlamaModel):
|
||||
def forward(*args, **kwargs):
|
||||
# Call the function
|
||||
example = my_new_function(*args, **kwargs)
|
||||
# continue here
|
||||
```
|
||||
|
||||
سيتم نسخ وظيفة `my_new_function` (وبشكل متكرر، أي وظائف أخرى جديدة يتم استدعاؤها في جسمها) تلقائيًا
|
||||
في الملف الذي يتم استخدامه.
|
||||
|
||||
### استدعاء `super()`
|
||||
قمنا مؤخرًا بشحن بعض الميزات التي تسمح لك بالانتقال من:
|
||||
```python
|
||||
class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module):
|
||||
def __init__(self, eos_token="</s>"): | def __init__(self):
|
||||
eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token)
|
||||
PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token)
|
||||
```
|
||||
هذا مفيد عندما لا تريد تفكيك استدعاء `super()`، وتريد التمييز بين أي استدعاء super init تقوم به!
|
||||
|
||||
### التسمية الخاصة
|
||||
ندعم الآن أيضًا حالات خاصة مثل
|
||||
```python
|
||||
class GemmaVisionModel(CLIPModel):
|
||||
pass
|
||||
```
|
||||
حيث اسم فئة `GemmaVision` الخاصة بك ليس هو نفسه `Gemma` النمطي. هذا مفيد للغاية للنماذج المركبة.
|
@ -347,8 +347,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -356,8 +356,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
|
||||
```py
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
docs/source/ar/tasks/multiple_choice.md  (new file, 452 lines)
@@ -0,0 +1,452 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# الاختيار من متعدد (Multiple choice)
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
مهمة الاختيار من متعدد مشابهة لمهمة الإجابة على الأسئلة، ولكن مع توفير عدة إجابات محتملة مع سياق، ويُدرّب النموذج على تحديد الإجابة الصحيحة.
|
||||
|
||||
سيوضح لك هذا الدليل كيفية:
|
||||
|
||||
1. ضبط نموذج [BERT](https://huggingface.co/google-bert/bert-base-uncased) باستخدام الإعداد `regular` لمجموعة بيانات [SWAG](https://huggingface.co/datasets/swag) لاختيار الإجابة الأفضل من بين الخيارات المتعددة المتاحة مع السياق.
|
||||
2. استخدام النموذج المضبوط للاستدلال.
|
||||
|
||||
قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate
|
||||
```
|
||||
|
||||
نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## تحميل مجموعة بيانات SWAG
|
||||
|
||||
ابدأ بتحميل تهيئة `regular` لمجموعة بيانات SWAG من مكتبة 🤗 Datasets:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> swag = load_dataset("swag", "regular")
|
||||
```
|
||||
|
||||
ثم ألق نظرة على مثال:
|
||||
|
||||
```py
|
||||
>>> swag["train"][0]
|
||||
{'ending0': 'passes by walking down the street playing their instruments.',
|
||||
'ending1': 'has heard approaching them.',
|
||||
'ending2': "arrives and they're outside dancing and asleep.",
|
||||
'ending3': 'turns the lead singer watches the performance.',
|
||||
'fold-ind': '3416',
|
||||
'gold-source': 'gold',
|
||||
'label': 0,
|
||||
'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
|
||||
'sent2': 'A drum line',
|
||||
'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
|
||||
'video-id': 'anetv_jkn6uvmqwh4'}
|
||||
```
|
||||
|
||||
على الرغم من أن الحقول تبدو كثيرة، إلا أنها في الواقع بسيطة جداً:
|
||||
|
||||
- `sent1` و `sent2`: يعرض هذان الحقلان بداية الجملة، وبدمجهما معًا، نحصل على حقل `startphrase`.
|
||||
- `ending`: يقترح نهاية محتملة للجملة، واحدة منها فقط هي الصحيحة.
|
||||
- `label`: يحدد نهاية الجملة الصحيحة (انظر المثال المختصر بعد هذه القائمة).
|
||||
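فيما يلي مثال توضيحي مختصر فقط (بافتراض أنك حمّلت مجموعة البيانات كما في الأعلى) يوضح كيفية استخدام قيمة `label` لاستخراج النهاية الصحيحة من المثال الأول:

```py
>>> example = swag["train"][0]
>>> # اسم حقل النهاية الصحيحة هو "ending" متبوعًا بقيمة label
>>> example[f"ending{example['label']}"]
'passes by walking down the street playing their instruments.'
```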
|
||||
## المعالجة المسبقة (Preprocess)
|
||||
|
||||
الخطوة التالية هي استدعاء مُجزئ BERT لمعالجة بدايات الجمل والنهايات الأربع المحتملة:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
|
||||
```
|
||||
|
||||
تحتاج دالة المعالجة المسبقة التي تريد إنشاءها إلى:
|
||||
|
||||
1. إنشاء أربع نسخ من حقل `sent1` ودمج كل منها مع `sent2` لإعادة إنشاء كيفية بدء الجملة.
|
||||
2. دمج `sent2` مع كل من نهايات الجمل الأربع المحتملة.
|
||||
3. تجميع هاتين القائمتين لتتمكن من تجزئتهما، ثم إعادة ترتيبهما بعد ذلك بحيث يكون لكل مثال حقول `input_ids` و `attention_mask` و `labels` مقابلة.
|
||||
|
||||
|
||||
```py
|
||||
>>> ending_names = ["ending0", "ending1", "ending2", "ending3"]
|
||||
|
||||
>>> def preprocess_function(examples):
|
||||
... first_sentences = [[context] * 4 for context in examples["sent1"]]
|
||||
... question_headers = examples["sent2"]
|
||||
... second_sentences = [
|
||||
... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
|
||||
... ]
|
||||
|
||||
... first_sentences = sum(first_sentences, [])
|
||||
... second_sentences = sum(second_sentences, [])
|
||||
|
||||
... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
|
||||
... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
|
||||
```
|
||||
|
||||
لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
|
||||
|
||||
```py
|
||||
tokenized_swag = swag.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
لا يحتوي 🤗 Transformers على مجمع بيانات للاختيار من متعدد، لذلك ستحتاج إلى تكييف [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة. من الأكفأ إضافة حشو (padding) ديناميكي للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
|
||||
|
||||
يقوم `DataCollatorForMultipleChoice` بتسطيح جميع مدخلات النموذج، ويطبق الحشو عليها، ثم يعيد النتائج إلى شكلها الأصلي:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import torch
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="pt",
|
||||
... )
|
||||
|
||||
... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
|
||||
... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
|
||||
... return batch
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
```py
|
||||
>>> from dataclasses import dataclass
|
||||
>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
|
||||
>>> from typing import Optional, Union
|
||||
>>> import tensorflow as tf
|
||||
|
||||
>>> @dataclass
|
||||
... class DataCollatorForMultipleChoice:
|
||||
... """
|
||||
... Data collator that will dynamically pad the inputs for multiple choice received.
|
||||
... """
|
||||
|
||||
... tokenizer: PreTrainedTokenizerBase
|
||||
... padding: Union[bool, str, PaddingStrategy] = True
|
||||
... max_length: Optional[int] = None
|
||||
... pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
... def __call__(self, features):
|
||||
... label_name = "label" if "label" in features[0].keys() else "labels"
|
||||
... labels = [feature.pop(label_name) for feature in features]
|
||||
... batch_size = len(features)
|
||||
... num_choices = len(features[0]["input_ids"])
|
||||
... flattened_features = [
|
||||
... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
|
||||
... ]
|
||||
... flattened_features = sum(flattened_features, [])
|
||||
|
||||
... batch = self.tokenizer.pad(
|
||||
... flattened_features,
|
||||
... padding=self.padding,
|
||||
... max_length=self.max_length,
|
||||
... pad_to_multiple_of=self.pad_to_multiple_of,
|
||||
... return_tensors="tf",
|
||||
... )
|
||||
|
||||
... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
|
||||
... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
|
||||
... return batch
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## التقييم (Evaluate)
|
||||
|
||||
يُفضل غالبًا تضمين مقياس أثناء التدريب لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (انظر إلى [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل المقياس وحسابه):
|
||||
|
||||
```py
|
||||
>>> import evaluate
|
||||
|
||||
>>> accuracy = evaluate.load("accuracy")
|
||||
```
|
||||
|
||||
ثم أنشئ دالة لتمرير التنبؤات والتسميات إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة:
|
||||
|
||||
```py
|
||||
>>> import numpy as np
|
||||
|
||||
>>> def compute_metrics(eval_pred):
|
||||
... predictions, labels = eval_pred
|
||||
... predictions = np.argmax(predictions, axis=1)
|
||||
... return accuracy.compute(predictions=predictions, references=labels)
|
||||
```
|
||||
|
||||
دالتك `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد تدريبك.
|
||||
|
||||
## التدريب (Train)
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], فراجع الدرس الأساسي [هنا](../training#train-with-pytorch-trainer)!
|
||||
|
||||
</Tip>
|
||||
|
||||
أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل BERT باستخدام [`AutoModelForMultipleChoice`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
|
||||
|
||||
>>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
|
||||
```
|
||||
|
||||
في هذه المرحلة، تبقى ثلاث خطوات فقط:
|
||||
|
||||
1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم الدقة وحفظ نقطة فحص التدريب.
|
||||
2. مرر معلمات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج ومُجمِّع البيانات والمعالج ودالة تجميع البيانات ودالة `compute_metrics`.
|
||||
3. استدعي [`~Trainer.train`] لضبط نموذجك.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
... output_dir="my_awesome_swag_model",
|
||||
... eval_strategy="epoch",
|
||||
... save_strategy="epoch",
|
||||
... load_best_model_at_end=True,
|
||||
... learning_rate=5e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... weight_decay=0.01,
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=tokenized_swag["train"],
|
||||
... eval_dataset=tokenized_swag["validation"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فراجع الدرس الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
|
||||
|
||||
</Tip>
|
||||
لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب:
|
||||
|
||||
```py
|
||||
>>> from transformers import create_optimizer
|
||||
|
||||
>>> batch_size = 16
|
||||
>>> num_train_epochs = 2
|
||||
>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs
|
||||
>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
|
||||
```
|
||||
|
||||
ثم يمكنك تحميل BERT باستخدام [`TFAutoModelForMultipleChoice`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForMultipleChoice
|
||||
|
||||
>>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
|
||||
```
|
||||
|
||||
حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
|
||||
|
||||
```py
|
||||
>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
|
||||
>>> tf_train_set = model.prepare_tf_dataset(
|
||||
... tokenized_swag["train"],
|
||||
... shuffle=True,
|
||||
... batch_size=batch_size,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
|
||||
>>> tf_validation_set = model.prepare_tf_dataset(
|
||||
... tokenized_swag["validation"],
|
||||
... shuffle=False,
|
||||
... batch_size=batch_size,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
```
|
||||
|
||||
قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة مناسبة للمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك:
|
||||
|
||||
```py
|
||||
>>> model.compile(optimizer=optimizer) # لا توجد وسيطة خسارة!
|
||||
```
|
||||
|
||||
الخطوتان الأخيرتان قبل بدء التدريب هما: حساب دقة التنبؤات، وتوفير طريقة لرفع النموذج إلى Hub. ويمكن تحقيق ذلك باستخدام [استدعاءات Keras](../main_classes/keras_callbacks)
|
||||
|
||||
مرر دالتك `compute_metrics` إلى [`~transformers.KerasMetricCallback`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
|
||||
```
|
||||
|
||||
حدد مكان دفع نموذجك ومعالجك في [`~transformers.PushToHubCallback`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import PushToHubCallback
|
||||
|
||||
>>> push_to_hub_callback = PushToHubCallback(
|
||||
... output_dir="my_awesome_model",
|
||||
... tokenizer=tokenizer,
|
||||
... )
|
||||
```
|
||||
|
||||
ثم قم بتضمين الاستدعاءات معًا:
|
||||
|
||||
```py
|
||||
>>> callbacks = [metric_callback, push_to_hub_callback]
|
||||
```
|
||||
|
||||
أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب والاستدعاءات لضبط النموذج:
|
||||
|
||||
```py
|
||||
>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)
|
||||
```
|
||||
|
||||
بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
<Tip>
|
||||
|
||||
للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للاختيار من متعدد، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)
|
||||
أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb) المقابل.
|
||||
|
||||
</Tip>
|
||||
|
||||
## الاستدلال (Inference)
|
||||
|
||||
رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
|
||||
|
||||
قم بإنشاء نص واقتراح إجابتين محتملتين:
|
||||
|
||||
```py
|
||||
>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
|
||||
>>> candidate1 = "The law does not apply to croissants and brioche."
|
||||
>>> candidate2 = "The law applies to baguettes."
|
||||
```
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
قم بتجزئة كل زوج من المطالبة والإجابة المرشحة وأعد تنسورات PyTorch. يجب عليك أيضًا إنشاء بعض قيم `labels`:
|
||||
|
||||
```py
|
||||
>>> import torch
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
|
||||
>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
|
||||
>>> labels = torch.tensor(0).unsqueeze(0)
|
||||
```
|
||||
|
||||
مرر مدخلاتك والعلامات إلى النموذج وأرجع `logits`:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForMultipleChoice
|
||||
|
||||
>>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
|
||||
>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
|
||||
>>> logits = outputs.logits
|
||||
```
|
||||
|
||||
استخرج الفئة ذات الاحتمالية الأكبر:
|
||||
|
||||
```py
|
||||
>>> predicted_class = logits.argmax().item()
|
||||
>>> predicted_class
|
||||
0
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
قم بتحليل كل مطالبة وزوج إجابة مرشح وأعد موترات TensorFlow:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
|
||||
>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
|
||||
```
|
||||
|
||||
مرر مدخلاتك إلى النموذج وأعد القيم logits:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForMultipleChoice
|
||||
|
||||
>>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
|
||||
>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
|
||||
>>> outputs = model(inputs)
|
||||
>>> logits = outputs.logits
|
||||
```
|
||||
|
||||
استخرج الفئة ذات الاحتمالية الأكبر:
|
||||
|
||||
```py
|
||||
>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
|
||||
>>> predicted_class
|
||||
0
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
432
docs/source/ar/tasks/question_answering.md
Normal file
@ -0,0 +1,432 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# الإجابة على الأسئلة (Question answering)
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Youtube id="ajPx5LwJD-I"/>
|
||||
|
||||
تُقدّم مهام الإجابة على الأسئلة إجابةً بناءً على سؤال. إذا سبق لك أن سألت مساعدًا افتراضيًا مثل Alexa أو Siri أو Google عن حالة الطقس، فأنت قد استخدمت نموذج للإجابة على الأسئلة من قبل. هناك نوعان شائعان لمهام الإجابة على الأسئلة:
|
||||
|
||||
- الاستخراجية: استخراج الإجابة من السياق المحدد.
|
||||
- التلخيصية: إنشاء إجابة من السياق تجيب على السؤال بشكل صحيح.
|
||||
|
||||
سيوضح لك هذا الدليل كيفية:
|
||||
|
||||
1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [SQuAD](https://huggingface.co/datasets/squad) للإجابة على الأسئلة الاستخراجية.
|
||||
2. استخدام النموذج المضبوط للاستدلال.
|
||||
|
||||
<Tip>
|
||||
|
||||
لمشاهدة جميع الهياكل والنسخ المتوافقة مع هذه المهمة، نوصي بالرجوع إلى [صفحة المهمة](https://huggingface.co/tasks/question-answering)
|
||||
|
||||
</Tip>
|
||||
|
||||
قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate
|
||||
```
|
||||
|
||||
نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## تحميل مجموعة بيانات SQuAD
|
||||
|
||||
ابدأ بتحميل جزء أصغر من مجموعة بيانات SQuAD من مكتبة 🤗 Datasets. سيتيح لك ذلك فرصة للتجربة والتحقق من عمل كل شيء بشكل صحيح قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> squad = load_dataset("squad", split="train[:5000]")
|
||||
```
|
||||
|
||||
قسّم جزء `train` من مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]:
|
||||
|
||||
```py
|
||||
>>> squad = squad.train_test_split(test_size=0.2)
|
||||
```
|
||||
|
||||
ثم ألق نظرة على مثال:
|
||||
|
||||
```py
|
||||
>>> squad["train"][0]
|
||||
{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
|
||||
'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
|
||||
'id': '5733be284776f41900661182',
|
||||
'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
|
||||
'title': 'University_of_Notre_Dame'
|
||||
}
|
||||
```
|
||||
|
||||
هناك العديد من الحقول المهمة هنا:
|
||||
|
||||
- `answers`: موقع بداية الإجابة داخل `context` ونص الإجابة (انظر المثال المختصر بعد هذه القائمة).
|
||||
- `context`: معلومات أساسية يحتاج النموذج إلى استخراج الإجابة منها.
|
||||
- `question`: السؤال الذي يجب على النموذج الإجابة عليه.
|
||||
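كمثال توضيحي مختصر فقط (بافتراض أن `answer_start` يشير إلى فهرس الحرف الذي تبدأ عنده الإجابة داخل `context`، كما هو الحال في SQuAD)، يمكنك التحقق من أن نص الإجابة يقع فعلاً عند هذا الموضع:

```py
>>> example = squad["train"][0]
>>> start = example["answers"]["answer_start"][0]
>>> answer_text = example["answers"]["text"][0]
>>> # يجب أن يطابق المقطع المُستخرج من السياق نص الإجابة
>>> example["context"][start : start + len(answer_text)] == answer_text
True
```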
|
||||
## المعالجة المسبقة (Preprocess)
|
||||
|
||||
<Youtube id="qgaM0weJHpA"/>
|
||||
|
||||
الخطوة التالية هي تحميل المحلل اللغوي DistilBERT لمعالجة حقلي `question` و `context`:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
```
|
||||
|
||||
هناك بعض خطوات المعالجة المسبقة الخاصة بمهام الإجابة على الأسئلة التي يجب أن تكون على دراية بها:
|
||||
|
||||
1. قد تحتوي بعض الأمثلة في مجموعة البيانات على `context` طويلًا يتجاوز الحد الأقصى لطول مدخل النموذج. للتعامل مع النصوص الأطول، يتم اقتطاع `context` فقط عن طريق تعيين `truncation="only_second"`.
|
||||
2. بعد ذلك، يتم تحديد مواضع بداية ونهاية الإجابة في `context` الأصلي عن طريق تعيين
|
||||
`return_offsets_mapping=True`.
|
||||
3. باستخدام التعيين، يمكن الآن تحديد رموز بداية ونهاية الإجابة. استخدم طريقة [`~tokenizers.Encoding.sequence_ids`]
|
||||
لتحديد أجزاء الإزاحة التي تتوافق مع `question` و `context`.
|
||||
|
||||
فيما يلي كيفية إنشاء دالة لقص وتعيين رموز البداية والنهاية لـ `answer` إلى `context`:
|
||||
|
||||
```py
|
||||
>>> def preprocess_function(examples):
|
||||
... questions = [q.strip() for q in examples["question"]]
|
||||
... inputs = tokenizer(
|
||||
... questions,
|
||||
... examples["context"],
|
||||
... max_length=384,
|
||||
... truncation="only_second",
|
||||
... return_offsets_mapping=True,
|
||||
... padding="max_length",
|
||||
... )
|
||||
|
||||
... offset_mapping = inputs.pop("offset_mapping")
|
||||
... answers = examples["answers"]
|
||||
... start_positions = []
|
||||
... end_positions = []
|
||||
|
||||
... for i, offset in enumerate(offset_mapping):
|
||||
... answer = answers[i]
|
||||
... start_char = answer["answer_start"][0]
|
||||
... end_char = answer["answer_start"][0] + len(answer["text"][0])
|
||||
... sequence_ids = inputs.sequence_ids(i)
|
||||
|
||||
... # Find the start and end of the context
|
||||
... idx = 0
|
||||
... while sequence_ids[idx] != 1:
|
||||
... idx += 1
|
||||
... context_start = idx
|
||||
... while sequence_ids[idx] == 1:
|
||||
... idx += 1
|
||||
... context_end = idx - 1
|
||||
|
||||
... # If the answer is not fully inside the context, label it (0, 0)
|
||||
... if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
|
||||
... start_positions.append(0)
|
||||
... end_positions.append(0)
|
||||
... else:
|
||||
... # Otherwise it's the start and end token positions
|
||||
... idx = context_start
|
||||
... while idx <= context_end and offset[idx][0] <= start_char:
|
||||
... idx += 1
|
||||
... start_positions.append(idx - 1)
|
||||
|
||||
... idx = context_end
|
||||
... while idx >= context_start and offset[idx][1] >= end_char:
|
||||
... idx -= 1
|
||||
... end_positions.append(idx + 1)
|
||||
|
||||
... inputs["start_positions"] = start_positions
|
||||
... inputs["end_positions"] = end_positions
|
||||
... return inputs
|
||||
```
|
||||
|
||||
لتطبيق المعالجة المسبقة على كامل مجموعة البيانات، استخدم [`~datasets.Dataset.map`] من مكتبة 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات دفعة واحدة. قم بإزالة أي أعمدة لا تحتاجها:
|
||||
|
||||
```py
|
||||
>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
|
||||
```
|
||||
|
||||
الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DefaultDataCollator`]. بخلاف مجمّعات البيانات الأخرى في 🤗 Transformers، لا يطبق [`DefaultDataCollator`] أي معالجة مسبقة إضافية مثل الحشو.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
```py
|
||||
>>> from transformers import DefaultDataCollator
|
||||
|
||||
>>> data_collator = DefaultDataCollator(return_tensors="tf")
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## التدريب (Train)
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], ألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
|
||||
|
||||
</Tip>
|
||||
|
||||
أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل DistilBERT باستخدام [`AutoModelForQuestionAnswering`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
|
||||
|
||||
>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
```
|
||||
|
||||
في هذه المرحلة، تبقى ثلاث خطوات فقط:
|
||||
|
||||
1. حدد المعاملات الفائقة للتدريب في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك).
|
||||
2. مرر معاملات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، والمُحلّل النصي، ومُجمّع البيانات.
|
||||
3. استدعِ [`~Trainer.train`] لضبط النموذج.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
... output_dir="my_awesome_qa_model",
|
||||
... eval_strategy="epoch",
|
||||
... learning_rate=2e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... per_device_eval_batch_size=16,
|
||||
... num_train_epochs=3,
|
||||
... weight_decay=0.01,
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Trainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=tokenized_squad["train"],
|
||||
... eval_dataset=tokenized_squad["test"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=data_collator,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
بمجرد اكتمال التدريب، شارك نموذجك في Hub باستخدام الدالة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
|
||||
|
||||
</Tip>
|
||||
لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن، وجدول معدل التعلم، وبعض المعاملات الفائقة للتدريب:
|
||||
|
||||
```py
|
||||
>>> from transformers import create_optimizer
|
||||
|
||||
>>> batch_size = 16
|
||||
>>> num_epochs = 2
|
||||
>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
|
||||
>>> optimizer, schedule = create_optimizer(
|
||||
... init_lr=2e-5,
|
||||
... num_warmup_steps=0,
|
||||
... num_train_steps=total_train_steps,
|
||||
... )
|
||||
```
|
||||
|
||||
ثم يمكنك تحميل DistilBERT باستخدام [`TFAutoModelForQuestionAnswering`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForQuestionAnswering
|
||||
|
||||
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
|
||||
```
|
||||
|
||||
حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
|
||||
|
||||
```py
|
||||
>>> tf_train_set = model.prepare_tf_dataset(
|
||||
... tokenized_squad["train"],
|
||||
... shuffle=True,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
|
||||
>>> tf_validation_set = model.prepare_tf_dataset(
|
||||
... tokenized_squad["test"],
|
||||
... shuffle=False,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
```
|
||||
|
||||
قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
|
||||
|
||||
```py
|
||||
>>> import tensorflow as tf
|
||||
|
||||
>>> model.compile(optimizer=optimizer)
|
||||
```
|
||||
|
||||
آخر شيء يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالجك المعجمي في [`~transformers.PushToHubCallback`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import PushToHubCallback
|
||||
|
||||
>>> callback = PushToHubCallback(
|
||||
... output_dir="my_awesome_qa_model",
|
||||
... tokenizer=tokenizer,
|
||||
... )
|
||||
```
|
||||
|
||||
أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة، وعدد الحقب، والاستدعاء الخاص بك لضبط النموذج:
|
||||
|
||||
```py
|
||||
>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])
|
||||
```
|
||||
بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
|
||||
<Tip>
|
||||
|
||||
للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للإجابة على الأسئلة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) المقابل
|
||||
أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
|
||||
|
||||
</Tip>
|
||||
|
||||
## التقييم (Evaluate)
|
||||
|
||||
يتطلب تقييم الإجابة على الأسئلة قدرًا كبيرًا من المعالجة اللاحقة. لتوفير وقتك، يتخطى هذا الدليل خطوة التقييم. ومع ذلك، لا يزال [`Trainer`] يحسب خسارة التقييم أثناء التدريب، لذا لن تكون في جهل تام بأداء نموذجك.
|
||||
|
||||
إذا كان لديك المزيد من الوقت وتهتم بكيفية تقييم نموذجك للإجابة على الأسئلة، فألق نظرة على فصل [الإجابة على الأسئلة](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) من دورة 🤗 Hugging Face!
|
||||
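إذا أردت فكرة سريعة فقط عن الشكل العام للتقييم، فهذا مخطط توضيحي (وليس بديلاً عن المعالجة اللاحقة الكاملة) لحساب مقياسي exact match و F1 باستخدام مقياس `squad` من 🤗 Evaluate، مع تنبؤ افتراضي واحد مأخوذ من المثال السابق:

```py
>>> import evaluate

>>> squad_metric = evaluate.load("squad")
>>> predictions = [{"id": "5733be284776f41900661182", "prediction_text": "Saint Bernadette Soubirous"}]
>>> references = [
...     {"id": "5733be284776f41900661182", "answers": {"answer_start": [515], "text": ["Saint Bernadette Soubirous"]}}
... ]
>>> squad_metric.compute(predictions=predictions, references=references)
{'exact_match': 100.0, 'f1': 100.0}
```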
|
||||
## الاستدلال (Inference)
|
||||
|
||||
رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
|
||||
|
||||
حدد سؤالًا وسياقًا ليقوم النموذج بالتنبؤ بالإجابة عليه:
|
||||
|
||||
```py
|
||||
>>> question = "How many programming languages does BLOOM support?"
|
||||
>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
|
||||
```
|
||||
|
||||
أبسط طريقة لتجربة نموذجك المُدرَّب للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء كائن لـ `pipeline` للإجابة على الأسئلة باستخدام نموذجك، ومرِّر النص إليه:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
|
||||
>>> question_answerer(question=question, context=context)
|
||||
{'score': 0.2058267742395401,
|
||||
'start': 10,
|
||||
'end': 95,
|
||||
 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}
|
||||
```
|
||||
|
||||
يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
قسّم النص وأرجع تنسورات PyTorch:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
|
||||
>>> inputs = tokenizer(question, context, return_tensors="pt")
|
||||
```
|
||||
|
||||
مرر مدخلاتك إلى النموذج وأرجع `logits`:
|
||||
|
||||
```py
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModelForQuestionAnswering
|
||||
|
||||
>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
|
||||
>>> with torch.no_grad():
|
||||
... outputs = model(**inputs)
|
||||
```
|
||||
|
||||
احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
|
||||
|
||||
```py
|
||||
>>> answer_start_index = outputs.start_logits.argmax()
|
||||
>>> answer_end_index = outputs.end_logits.argmax()
|
||||
```
|
||||
|
||||
استخلاص الإجابة من الرموز المتوقعة:
|
||||
|
||||
```py
|
||||
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
|
||||
>>> tokenizer.decode(predict_answer_tokens)
|
||||
'176 billion parameters and can generate text in 46 languages natural languages and 13'
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
قسّم النص وأرجع موترات TensorFlow:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
|
||||
>>> inputs = tokenizer(question, context, return_tensors="tf")
|
||||
```
|
||||
|
||||
مرر مدخلاتك إلى النموذج وأعد `logits`:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForQuestionAnswering
|
||||
|
||||
>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
|
||||
>>> outputs = model(**inputs)
|
||||
```
|
||||
|
||||
احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
|
||||
|
||||
```py
|
||||
>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
|
||||
>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
|
||||
```
|
||||
|
||||
استخلاص الإجابة من الرموز المتوقعة:
|
||||
|
||||
```py
|
||||
>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
|
||||
>>> tokenizer.decode(predict_answer_tokens)
|
||||
'176 billion parameters and can generate text in 46 languages natural languages and 13'
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
397
docs/source/ar/tasks/summarization.md
Normal file
File diff suppressed because one or more lines are too long
407
docs/source/ar/tasks/translation.md
Normal file
@ -0,0 +1,407 @@
|
||||
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# الترجمة (Translation)
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
<Youtube id="1JvfrvZgi6c"/>
|
||||
|
||||
الترجمة هي عملية تحويل سلسلة نصية من لغة إلى أخرى. وهي إحدى المهام التي يمكن صياغتها كمسألة تسلسل إلى تسلسل، وهو إطار عمل قوي لإنتاج مخرجات من مدخلات، مثل الترجمة أو التلخيص. تُستخدم أنظمة الترجمة عادةً للترجمة بين نصوص لغات مختلفة، ويمكن استخدامها أيضًا لترجمة الكلام أو لمهام تجمع بين النصوص والكلام، مثل تحويل النص إلى كلام أو تحويل الكلام إلى نص.
|
||||
|
||||
سيوضح لك هذا الدليل كيفية:
|
||||
|
||||
1. ضبط دقيق لنموذج [T5](https://huggingface.co/google-t5/t5-small) على المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) لترجمة النص الإنجليزي إلى الفرنسية.
|
||||
2. استخدام النموذج المضبوط بدقة للاستدلال.
|
||||
|
||||
<Tip>
|
||||
|
||||
لمشاهدة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/translation).
|
||||
|
||||
</Tip>
|
||||
|
||||
قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
|
||||
|
||||
```bash
|
||||
pip install transformers datasets evaluate sacrebleu
|
||||
```
|
||||
|
||||
نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند الطلب، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
|
||||
|
||||
```py
|
||||
>>> from huggingface_hub import notebook_login
|
||||
|
||||
>>> notebook_login()
|
||||
```
|
||||
|
||||
## تحميل مجموعة بيانات OPUS Books
|
||||
|
||||
ابدأ بتحميل المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) من مكتبة 🤗 Datasets:
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset
|
||||
|
||||
>>> books = load_dataset("opus_books", "en-fr")
|
||||
```
|
||||
|
||||
قسّم مجموعة البيانات إلى مجموعة تدريب ومجموعة اختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]:
|
||||
|
||||
```py
|
||||
>>> books = books["train"].train_test_split(test_size=0.2)
|
||||
```
|
||||
|
||||
ثم ألقِ نظرة على مثال:
|
||||
|
||||
```py
|
||||
>>> books["train"][0]
|
||||
{'id': '90560',
|
||||
'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.',
|
||||
'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}}
|
||||
```
|
||||
|
||||
`translation`: ترجمة إنجليزية وفرنسية للنص.
|
||||
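على سبيل التوضيح فقط، يمكنك الوصول إلى كل لغة على حدة عبر المفتاحين `en` و `fr` داخل حقل `translation`:

```py
>>> books["train"][0]["translation"]["en"]
'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.'
```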
|
||||
## المعالجة المسبقة (Preprocess)
|
||||
|
||||
<Youtube id="XAR8jnZZuUs"/>
|
||||
|
||||
الخطوة التالية هي تحميل مُجزئ T5 لمعالجة أزواج اللغة الإنجليزية-الفرنسية:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> checkpoint = "google-t5/t5-small"
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
يجب أن تقوم دالة المعالجة المسبقة التي تُريد إنشاءها بما يلي:
|
||||
|
||||
1. إضافة بادئة إلى المُدخل بمُوجه حتى يعرف T5 أن هذه مهمة ترجمة. تتطلب بعض النماذج القادرة على أداء مهام متعددة توجيهًا لمهام مُحددة.
|
||||
2. تعيين اللغة الهدف (الفرنسية) في معامل `text_target` لضمان معالجة المُجزئ للنص بشكل صحيح. إذا لم تُعيّن `text_target`، فسيُعالج المُجزئ النص على أنه إنجليزي.
|
||||
3. اقتطاع التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي يحدده معامل `max_length`.
|
||||
|
||||
```py
|
||||
>>> source_lang = "en"
|
||||
>>> target_lang = "fr"
|
||||
>>> prefix = "translate English to French: "
|
||||
|
||||
>>> def preprocess_function(examples):
|
||||
... inputs = [prefix + example[source_lang] for example in examples["translation"]]
|
||||
... targets = [example[target_lang] for example in examples["translation"]]
|
||||
... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
|
||||
... return model_inputs
|
||||
```
|
||||
|
||||
لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] من 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
|
||||
|
||||
```py
|
||||
>>> tokenized_books = books.map(preprocess_function, batched=True)
|
||||
```
|
||||
|
||||
الآن أنشئ دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
```py
|
||||
>>> from transformers import DataCollatorForSeq2Seq
|
||||
|
||||
>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
|
||||
```py
|
||||
>>> from transformers import DataCollatorForSeq2Seq
|
||||
|
||||
>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
## التقييم (Evaluate)
|
||||
|
||||
غالباً ما يكون تضمين مقياس أثناء التدريب مفيداً لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، حمّل مقياس [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس):
|
||||
|
||||
```py
|
||||
>>> import evaluate
|
||||
|
||||
>>> metric = evaluate.load("sacrebleu")
|
||||
```
|
||||
|
||||
ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك إلى [`~evaluate.EvaluationModule.compute`] لحساب درجة SacreBLEU:
|
||||
|
||||
```py
|
||||
>>> import numpy as np
|
||||
|
||||
>>> def postprocess_text(preds, labels):
|
||||
... preds = [pred.strip() for pred in preds]
|
||||
... labels = [[label.strip()] for label in labels]
|
||||
|
||||
... return preds, labels
|
||||
|
||||
>>> def compute_metrics(eval_preds):
|
||||
... preds, labels = eval_preds
|
||||
... if isinstance(preds, tuple):
|
||||
... preds = preds[0]
|
||||
... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
|
||||
|
||||
... labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
|
||||
... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
|
||||
|
||||
... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
|
||||
|
||||
... result = metric.compute(predictions=decoded_preds, references=decoded_labels)
|
||||
... result = {"bleu": result["score"]}
|
||||
|
||||
... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
|
||||
... result["gen_len"] = np.mean(prediction_lens)
|
||||
... result = {k: round(v, 4) for k, v in result.items()}
|
||||
... return result
|
||||
```
|
||||
|
||||
دالة `compute_metrics` الخاصة بك جاهزة الآن، وسوف تعود إليها عند إعداد التدريب.
|
||||
|
||||
## التدريب (Train)
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط دقيق نموذج باستخدام [`Trainer`], فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
|
||||
|
||||
</Tip>
|
||||
|
||||
أنت جاهز لبدء تدريب نموذجك الآن! حمّل T5 باستخدام [`AutoModelForSeq2SeqLM`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
|
||||
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
في هذه المرحلة، تبقى ثلاث خطوات فقط:
|
||||
|
||||
1. حدد مُعاملات للتدريب في [`Seq2SeqTrainingArguments`]. المُعامل الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ النموذج الخاص بك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس SacreBLEU وحفظ نقطة تدقيق التدريب.
|
||||
2. مرر مُعاملات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمعالج اللغوي وجامع البيانات ووظيفة `compute_metrics`.
|
||||
3. نفّذ [`~Trainer.train`] لضبط نموذجك.
|
||||
|
||||
```py
|
||||
>>> training_args = Seq2SeqTrainingArguments(
|
||||
... output_dir="my_awesome_opus_books_model",
|
||||
... eval_strategy="epoch",
|
||||
... learning_rate=2e-5,
|
||||
... per_device_train_batch_size=16,
|
||||
... per_device_eval_batch_size=16,
|
||||
... weight_decay=0.01,
|
||||
... save_total_limit=3,
|
||||
... num_train_epochs=2,
|
||||
... predict_with_generate=True,
|
||||
... fp16=True, #change to bf16=True for XPU
|
||||
... push_to_hub=True,
|
||||
... )
|
||||
|
||||
>>> trainer = Seq2SeqTrainer(
|
||||
... model=model,
|
||||
... args=training_args,
|
||||
... train_dataset=tokenized_books["train"],
|
||||
... eval_dataset=tokenized_books["test"],
|
||||
... processing_class=tokenizer,
|
||||
... data_collator=data_collator,
|
||||
... compute_metrics=compute_metrics,
|
||||
... )
|
||||
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
<Tip>
|
||||
|
||||
إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
|
||||
|
||||
</Tip>
|
||||
لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل تعلم وبعض المعلمات الفائقة للتدريب:
|
||||
|
||||
```py
|
||||
>>> from transformers import AdamWeightDecay
|
||||
|
||||
>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
|
||||
```
|
||||
|
||||
ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]:
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForSeq2SeqLM
|
||||
|
||||
>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
|
||||
```
|
||||
|
||||
حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
|
||||
|
||||
```py
|
||||
>>> tf_train_set = model.prepare_tf_dataset(
|
||||
... tokenized_books["train"],
|
||||
... shuffle=True,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
|
||||
>>> tf_test_set = model.prepare_tf_dataset(
|
||||
... tokenized_books["test"],
|
||||
... shuffle=False,
|
||||
... batch_size=16,
|
||||
... collate_fn=data_collator,
|
||||
... )
|
||||
```
|
||||
|
||||
قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك:
|
||||
|
||||
```py
|
||||
>>> import tensorflow as tf
|
||||
|
||||
>>> model.compile(optimizer=optimizer) # No loss argument!
|
||||
```
|
||||
|
||||
آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب مقياس SacreBLEU من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks).
|
||||
|
||||
مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
|
||||
```
|
||||
|
||||
حدد مكان دفع نموذجك ومعالجك اللغوي في [`~transformers.PushToHubCallback`]:
|
||||
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import PushToHubCallback
|
||||
|
||||
>>> push_to_hub_callback = PushToHubCallback(
|
||||
... output_dir="my_awesome_opus_books_model",
|
||||
... tokenizer=tokenizer,
|
||||
... )
|
||||
```
|
||||
|
||||
ثم اجمع استدعاءاتك معًا:
|
||||
|
||||
```py
|
||||
>>> callbacks = [metric_callback, push_to_hub_callback]
|
||||
```
|
||||
|
||||
أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج:
|
||||
|
||||
```py
|
||||
>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
|
||||
```
|
||||
|
||||
بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
||||
<Tip>
|
||||
|
||||
للحصول على مثال أكثر تعمقًا لكيفية ضبط نموذج للترجمة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) المقابل
|
||||
أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
|
||||
|
||||
</Tip>
|
||||
|
||||
## الاستدلال (Inference)
|
||||
|
||||
رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
|
||||
|
||||
أحضر بعض النصوص التي ترغب في ترجمتها إلى لغة أخرى. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مدخلاتك اعتمادًا على المهمة التي تعمل عليها. للترجمة من الإنجليزية إلى الفرنسية، يجب عليك إضافة بادئة إلى مدخلاتك كما هو موضح أدناه:
|
||||
|
||||
```py
|
||||
>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
|
||||
```
|
||||
|
||||
أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء مثيل لـ `pipeline` للترجمة باستخدام نموذجك، ومرر النص الخاص بك إليه:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
# تغيير `xx` إلى لغة الإدخال و `yy` إلى لغة المخرجات المطلوبة.
|
||||
# أمثلة: "en" للغة الإنجليزية، "fr" للغة الفرنسية، "de" للغة الألمانية، "es" للغة الإسبانية، "zh" للغة الصينية، إلخ؛ translation_en_to_fr تترجم من الإنجليزية إلى الفرنسية
|
||||
# يمكنك عرض جميع قوائم اللغات هنا - https://huggingface.co/languages
|
||||
>>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model")
|
||||
>>> translator(text)
|
||||
[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
|
||||
```
|
||||
|
||||
يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات PyTorch:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
|
||||
>>> inputs = tokenizer(text, return_tensors="pt").input_ids
|
||||
```
|
||||
|
||||
استخدم الدالة [`~generation.GenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation).
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForSeq2SeqLM
|
||||
|
||||
>>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
|
||||
>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
|
||||
```
|
||||
|
||||
فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
|
||||
|
||||
```py
|
||||
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
'Les lignées partagent des ressources avec des bactéries enfixant l'azote.'
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات TensorFlow:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
|
||||
>>> inputs = tokenizer(text, return_tensors="tf").input_ids
|
||||
```
|
||||
|
||||
استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation).
|
||||
|
||||
```py
|
||||
>>> from transformers import TFAutoModelForSeq2SeqLM
|
||||
|
||||
>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
|
||||
>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
|
||||
```
|
||||
|
||||
فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
|
||||
|
||||
```py
|
||||
>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
|
||||
'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.'
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
41
docs/source/ar/tiktoken.md
Normal file
@ -0,0 +1,41 @@
|
||||
# Tiktoken والتفاعل مع Transformers
|
||||
|
||||
يتم دمج دعم ملفات نموذج tiktoken بسلاسة في 🤗 transformers عند تحميل النماذج باستخدام
|
||||
`from_pretrained` مع ملف `tokenizer.model` tiktoken على Hub، والذي يتم تحويله تلقائيًا إلى [المحلل اللغوي السريع](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast).
|
||||
|
||||
### النماذج المعروفة التي تم إصدارها مع `tiktoken.model`:
|
||||
- gpt2
|
||||
- llama3
|
||||
|
||||
## مثال على الاستخدام
|
||||
|
||||
من أجل تحميل ملفات `tiktoken` في `transformers`، تأكد من أن ملف `tokenizer.model` هو ملف tiktoken وسيتم تحميله تلقائيًا عند التحميل `from_pretrained`. إليك كيفية تحميل مجزىء لغوي ونموذج، والذي
|
||||
يمكن تحميله من نفس الملف بالضبط:
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original")
|
||||
```
|
||||
## إنشاء مجزىء لغوي tiktoken
|
||||
|
||||
لا يحتوي ملف `tokenizer.model` على أي معلومات حول الرموز أو الأنماط الإضافية. إذا كانت هذه الأمور مهمة، قم بتحويل المحلل اللغوي إلى `tokenizer.json`، وهو التنسيق المناسب لـ [`PreTrainedTokenizerFast`].
|
||||
|
||||
قم بتوليد ملف `tokenizer.model` باستخدام [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) ثم قم بتحويله إلى `tokenizer.json` باستخدام [`convert_tiktoken_to_fast`].
|
||||
|
||||
```py
|
||||
|
||||
from transformers.integrations.tiktoken import convert_tiktoken_to_fast
|
||||
from tiktoken import get_encoding
|
||||
|
||||
# يمكنك تحميل ترميزك المخصص أو الترميز الذي توفره OpenAI
|
||||
encoding = get_encoding("gpt2")
|
||||
convert_tiktoken_to_fast(encoding, "config/save/dir")
|
||||
```
|
||||
|
||||
يتم حفظ ملف `tokenizer.json` الناتج في الدليل المحدد ويمكن تحميله باستخدام [`PreTrainedTokenizerFast`].
|
||||
|
||||
```py
|
||||
from transformers import PreTrainedTokenizerFast

tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
|
||||
```
|
@ -109,7 +109,7 @@ label: NEGATIVE, with score: 0.5309
|
||||
Die [`pipeline`] kann auch über einen ganzen Datensatz iterieren. Starten wir mit der Installation der [🤗 Datasets](https://huggingface.co/docs/datasets/) Bibliothek:
|
||||
|
||||
```bash
|
||||
pip install datasets
|
||||
pip install datasets
|
||||
```
|
||||
|
||||
Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell welches wir nutzen möchten.
|
||||
@ -191,7 +191,7 @@ Wenn Sie kein Modell für Ihren Anwendungsfall finden können, müssen Sie ein v
|
||||
|
||||
<Youtube id="AhChOFRegn4"/>
|
||||
|
||||
Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen.
|
||||
Unter der Haube arbeiten die Klassen [`AutoModelForSequenceClassification`] und [`AutoTokenizer`] zusammen, um die [`pipeline`] zu betreiben. Eine [`AutoClass`](./model_doc/auto) ist eine Abkürzung, die automatisch die Architektur eines trainierten Modells aus dessen Namen oder Pfad abruft. Sie müssen nur die passende `AutoClass` für Ihre Aufgabe und den zugehörigen Tokenizer mit [`AutoTokenizer`] auswählen.
|
||||
|
||||
Kehren wir zu unserem Beispiel zurück und sehen wir uns an, wie Sie die `AutoClass` verwenden können, um die Ergebnisse der [`pipeline`] zu replizieren.
|
||||
|
||||
@ -281,7 +281,7 @@ Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Model
|
||||
```
|
||||
|
||||
Das Modell gibt die endgültigen Aktivierungen in dem Attribut "logits" aus. Wenden Sie die Softmax-Funktion auf die "logits" an, um die Wahrscheinlichkeiten zu erhalten:
|
||||
|
||||
|
||||
```py
|
||||
>>> from torch import nn
|
||||
|
||||
@ -308,7 +308,7 @@ In der [Aufgabenzusammenfassung](./task_summary) steht, welche [AutoModel]-Klass
|
||||
</Tip>
|
||||
|
||||
Jetzt können Sie Ihren vorverarbeiteten Stapel von Eingaben direkt an das Modell übergeben, indem Sie die Wörterbuchschlüssel direkt an die Tensoren übergeben:
|
||||
|
||||
|
||||
```py
|
||||
>>> tf_outputs = tf_model(tf_batch)
|
||||
```
|
||||
@ -383,8 +383,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -392,8 +392,8 @@ Ein besonders cooles 🤗 Transformers-Feature ist die Möglichkeit, ein Modell
|
||||
```py
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
@ -167,10 +167,14 @@
|
||||
title: AWQ
|
||||
- local: quantization/aqlm
|
||||
title: AQLM
|
||||
- local: quantization/vptq
|
||||
title: VPTQ
|
||||
- local: quantization/quanto
|
||||
title: Quanto
|
||||
- local: quantization/eetq
|
||||
title: EETQ
|
||||
- local: quantization/higgs
|
||||
title: HIGGS
|
||||
- local: quantization/hqq
|
||||
title: HQQ
|
||||
- local: quantization/fbgemm_fp8
|
||||
@ -322,6 +326,8 @@
|
||||
sections:
|
||||
- local: model_doc/albert
|
||||
title: ALBERT
|
||||
- local: model_doc/bamba
|
||||
title: Bamba
|
||||
- local: model_doc/bart
|
||||
title: BART
|
||||
- local: model_doc/barthez
|
||||
@ -362,6 +368,8 @@
|
||||
title: CodeLlama
|
||||
- local: model_doc/cohere
|
||||
title: Cohere
|
||||
- local: model_doc/cohere2
|
||||
title: Cohere2
|
||||
- local: model_doc/convbert
|
||||
title: ConvBERT
|
||||
- local: model_doc/cpm
|
||||
@ -378,6 +386,8 @@
|
||||
title: DeBERTa-v2
|
||||
- local: model_doc/dialogpt
|
||||
title: DialoGPT
|
||||
- local: model_doc/diffllama
|
||||
title: DiffLlama
|
||||
- local: model_doc/distilbert
|
||||
title: DistilBERT
|
||||
- local: model_doc/dpr
|
||||
@ -394,6 +404,8 @@
|
||||
title: ESM
|
||||
- local: model_doc/falcon
|
||||
title: Falcon
|
||||
- local: model_doc/falcon3
|
||||
title: Falcon3
|
||||
- local: model_doc/falcon_mamba
|
||||
title: FalconMamba
|
||||
- local: model_doc/fastspeech2_conformer
|
||||
@ -492,6 +504,10 @@
|
||||
title: mLUKE
|
||||
- local: model_doc/mobilebert
|
||||
title: MobileBERT
|
||||
- local: model_doc/modernbert
|
||||
title: ModernBert
|
||||
- local: model_doc/moonshine
|
||||
title: moonshine
|
||||
- local: model_doc/mpnet
|
||||
title: MPNet
|
||||
- local: model_doc/mpt
|
||||
@ -643,6 +659,8 @@
|
||||
title: DiNAT
|
||||
- local: model_doc/dinov2
|
||||
title: DINOV2
|
||||
- local: model_doc/dinov2_with_registers
|
||||
title: DINOv2 with Registers
|
||||
- local: model_doc/dit
|
||||
title: DiT
|
||||
- local: model_doc/dpt
|
||||
@ -705,6 +723,10 @@
|
||||
title: Swin2SR
|
||||
- local: model_doc/table-transformer
|
||||
title: Table Transformer
|
||||
- local: model_doc/textnet
|
||||
title: TextNet
|
||||
- local: model_doc/timm_wrapper
|
||||
title: Timm Wrapper
|
||||
- local: model_doc/upernet
|
||||
title: UperNet
|
||||
- local: model_doc/van
|
||||
@ -721,6 +743,8 @@
|
||||
title: ViTMatte
|
||||
- local: model_doc/vit_msn
|
||||
title: ViTMSN
|
||||
- local: model_doc/vitpose
|
||||
title: ViTPose
|
||||
- local: model_doc/yolos
|
||||
title: YOLOS
|
||||
- local: model_doc/zoedepth
|
||||
@ -738,8 +762,6 @@
|
||||
title: dac
|
||||
- local: model_doc/encodec
|
||||
title: EnCodec
|
||||
- local: model_doc/hiera
|
||||
title: Hiera
|
||||
- local: model_doc/hubert
|
||||
title: Hubert
|
||||
- local: model_doc/mctct
|
||||
@ -810,6 +832,8 @@
|
||||
title: ALIGN
|
||||
- local: model_doc/altclip
|
||||
title: AltCLIP
|
||||
- local: model_doc/aria
|
||||
title: Aria
|
||||
- local: model_doc/blip
|
||||
title: BLIP
|
||||
- local: model_doc/blip-2
|
||||
@ -828,12 +852,16 @@
|
||||
title: CLIPSeg
|
||||
- local: model_doc/clvp
|
||||
title: CLVP
|
||||
- local: model_doc/colpali
|
||||
title: ColPali
|
||||
- local: model_doc/data2vec
|
||||
title: Data2Vec
|
||||
- local: model_doc/deplot
|
||||
title: DePlot
|
||||
- local: model_doc/donut
|
||||
title: Donut
|
||||
- local: model_doc/emu3
|
||||
title: Emu3
|
||||
- local: model_doc/flava
|
||||
title: FLAVA
|
||||
- local: model_doc/git
|
||||
|
@ -184,7 +184,7 @@ class PairClassificationPipeline(Pipeline):
|
||||
```
|
||||
|
||||
The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. If we have saved this in
|
||||
a file named `pair_classification.py`, we can then import it and register it like this:
|
||||
a file named `pair_classification.py`, we can then import it and register it like this.
|
||||
|
||||
```py
|
||||
from pair_classification import PairClassificationPipeline
|
||||
@ -199,6 +199,22 @@ PIPELINE_REGISTRY.register_pipeline(
|
||||
)
|
||||
```
|
||||
|
||||
The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a model's `config.json` file.
|
||||
|
||||
```json
|
||||
"custom_pipelines": {
|
||||
"pair-classification": {
|
||||
"impl": "pair_classification.PairClassificationPipeline",
|
||||
"pt": [
|
||||
"AutoModelForSequenceClassification"
|
||||
],
|
||||
"tf": [
|
||||
"TFAutoModelForSequenceClassification"
|
||||
],
|
||||
}
|
||||
},
|
||||
```
|
||||
|
||||
Once this is done, we can use it with a pretrained model. For instance `sgugger/finetuned-bert-mrpc` has been
|
||||
fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not.
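
A minimal sketch of what calling the registered task could look like — this assumes the registration code above has already been run in the same session, and that the custom pipeline forwards a `second_text` keyword argument as in the `PairClassificationPipeline` class shown earlier:

```py
from transformers import pipeline

# the task name must match the one passed to PIPELINE_REGISTRY.register_pipeline
classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")

# `second_text` is the keyword the custom preprocess step is assumed to accept
print(classifier("I hate you", second_text="I really dislike you"))
```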
|
||||
|
||||
|
@ -683,7 +683,7 @@ one is a little simplified from the actual one!
|
||||
|
||||
```
|
||||
{%- for message in messages %}
|
||||
{{- '<|' + message['role'] + |>\n' }}
|
||||
{{- '<|' + message['role'] + '|>\n' }}
|
||||
{{- message['content'] + eos_token }}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
@ -1116,4 +1116,4 @@ name to be included in the tool response, then rendering it can be as simple as:
|
||||
```
|
||||
|
||||
Again, remember that the actual formatting and special tokens are model-specific - you should take a lot of care
|
||||
to ensure that tokens, whitespace and everything else exactly match the format your model was trained with!
|
||||
to ensure that tokens, whitespace and everything else exactly match the format your model was trained with!
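
One easy way to verify this is to render the template on a small conversation and inspect the exact string the model will see. A minimal sketch (the checkpoint name is illustrative; any chat model with a template works the same way):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

chat = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
]

# tokenize=False returns the raw formatted string so you can check tokens and whitespace
print(tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True))
```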
|
||||
|
@ -586,6 +586,20 @@ You can choose the communication data type by setting the `communication_data_ty
|
||||
}
|
||||
```
|
||||
|
||||
### Universal Checkpointing
|
||||
|
||||
[Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) is an efficient and flexible feature for saving and loading model checkpoints. It enables seamless model training continuation and fine-tuning across different model architectures, parallelism techniques, and training configurations.
|
||||
|
||||
Resume training with a universal checkpoint by setting [load_universal](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to `true` in the config file.
|
||||
|
||||
```yaml
|
||||
{
|
||||
"checkpoint": {
|
||||
"load_universal": true
|
||||
}
|
||||
}
|
||||
```
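
Once the config contains that flag, resuming through [`Trainer`] looks like any other resume — a minimal sketch, assuming the checkpoint has already been converted to the universal format with DeepSpeed's conversion script and that `model` and `train_dataset` were created earlier in your script:

```python
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="output",
    deepspeed="ds_config.json",  # the config above, with "load_universal": true
)

# model and train_dataset are assumed to be defined earlier
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# point resume_from_checkpoint at the converted universal checkpoint directory
trainer.train(resume_from_checkpoint="output/checkpoint-500")
```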
|
||||
|
||||
## Deployment
|
||||
|
||||
DeepSpeed can be deployed by different launchers such as [torchrun](https://pytorch.org/docs/stable/elastic/run.html), the `deepspeed` launcher, or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch). To deploy, add `--deepspeed ds_config.json` to the [`Trainer`] command line. It’s recommended to use DeepSpeed’s [`add_config_arguments`](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any necessary command line arguments to your code.
|
||||
|
@ -58,7 +58,7 @@ Otherwise, you can choose a size-based wrapping policy where FSDP is applied to
|
||||
|
||||
### Checkpointing
|
||||
|
||||
Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`]` method.
|
||||
Intermediate checkpoints should be saved with `fsdp_state_dict_type: SHARDED_STATE_DICT` because saving the full state dict with CPU offloading on rank 0 takes a lot of time and often results in `NCCL Timeout` errors due to indefinite hanging during broadcasting. You can resume training with the sharded state dicts with the [`~accelerate.Accelerator.load_state`] method.
|
||||
|
||||
```py
|
||||
# directory containing checkpoints
|
||||
|
@ -96,6 +96,12 @@ distribution over the entire vocabulary with various strategy-specific adjustmen
|
||||
the decoding strategies that support multiple sequence candidates, e.g. variations of beam search and sampling. Decoding
|
||||
strategies like greedy search and contrastive search return a single output sequence.
|
||||
|
||||
It is also possible to extend `generate()` with external libraries or handcrafted code. The `logits_processor` argument
|
||||
allows you to pass custom [`LogitsProcessor`] instances, allowing you to manipulate the next token probability
|
||||
distributions. Likewise, the `stopping_criteria` argument lets you set custom [`StoppingCriteria`] to stop text generation.
|
||||
The [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo) library contains examples of external
|
||||
`generate()`-compatible extensions.
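
As a rough illustration of the handcrafted route, here is a minimal custom processor that bans a single token id at every decoding step (the checkpoint and the banned id are arbitrary; real processors usually implement more useful logic):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, LogitsProcessor, LogitsProcessorList

class BanTokenProcessor(LogitsProcessor):
    """Sets the score of one token id to -inf so it can never be sampled."""

    def __init__(self, banned_token_id: int):
        self.banned_token_id = banned_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        scores[:, self.banned_token_id] = -float("inf")
        return scores

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = model.generate(
    **inputs,
    max_new_tokens=10,
    logits_processor=LogitsProcessorList([BanTokenProcessor(tokenizer.eos_token_id)]),
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```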
|
||||
|
||||
## Save a custom decoding strategy with your model
|
||||
|
||||
If you would like to share your fine-tuned model with a specific generation configuration, you can:
|
||||
@ -435,6 +441,28 @@ To enable assisted decoding, set the `assistant_model` argument with a model.
|
||||
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
If you're using a `pipeline` object, all you need to do is to pass the assistant checkpoint under `assistant_model`
|
||||
|
||||
```python
|
||||
>>> from transformers import pipeline
|
||||
>>> import torch
|
||||
|
||||
>>> pipe = pipeline(
|
||||
... "text-generation",
|
||||
... model="meta-llama/Llama-3.1-8B",
|
||||
... assistant_model="meta-llama/Llama-3.2-1B", # This extra line is all that's needed, also works with UAD
|
||||
... torch_dtype=torch.bfloat16
|
||||
>>> )
|
||||
>>> pipe_output = pipe("Once upon a time, ", max_new_tokens=50, do_sample=False)
|
||||
>>> pipe_output[0]["generated_text"]
|
||||
'Once upon a time, 3D printing was a niche technology that was only'
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
When using assisted decoding with sampling methods, you can use the `temperature` argument to control the randomness,
|
||||
just like in multinomial sampling. However, in assisted decoding, reducing the temperature may help improve the latency.
|
||||
|
||||
|
@ -88,6 +88,7 @@ For now the supported model architectures are the architectures that have been v
|
||||
- T5
|
||||
- Mamba
|
||||
- Nemotron
|
||||
- Gemma2
|
||||
|
||||
## Example usage
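
As a quick sketch of what loading a GGUF checkpoint looks like (the repository and file names below are assumptions — substitute any GGUF file for one of the supported architectures):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"   # assumed repo hosting GGUF files
gguf_file = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"    # assumed quantized file inside that repo

# the GGUF weights are dequantized on the fly into a regular PyTorch model
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=gguf_file)
```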
|
||||
|
||||
|
@ -62,8 +62,11 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ |
|
||||
| [ALIGN](model_doc/align) | ✅ | ❌ | ❌ |
|
||||
| [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ |
|
||||
| [Aria](model_doc/aria) | ✅ | ❌ | ❌ |
|
||||
| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ |
|
||||
| [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ |
|
||||
| [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ |
|
||||
| [Bamba](model_doc/bamba) | ✅ | ❌ | ❌ |
|
||||
| [Bark](model_doc/bark) | ✅ | ❌ | ❌ |
|
||||
| [BART](model_doc/bart) | ✅ | ✅ | ✅ |
|
||||
| [BARThez](model_doc/barthez) | ✅ | ✅ | ✅ |
|
||||
@ -97,6 +100,8 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ |
|
||||
| [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ✅ |
|
||||
| [Cohere](model_doc/cohere) | ✅ | ❌ | ❌ |
|
||||
| [Cohere2](model_doc/cohere2) | ✅ | ❌ | ❌ |
|
||||
| [ColPali](model_doc/colpali) | ✅ | ❌ | ❌ |
|
||||
| [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ |
|
||||
| [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ |
|
||||
| [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ |
|
||||
@ -120,8 +125,10 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [DETA](model_doc/deta) | ✅ | ❌ | ❌ |
|
||||
| [DETR](model_doc/detr) | ✅ | ❌ | ❌ |
|
||||
| [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ |
|
||||
| [DiffLlama](model_doc/diffllama) | ✅ | ❌ | ❌ |
|
||||
| [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ |
|
||||
| [DINOv2](model_doc/dinov2) | ✅ | ❌ | ✅ |
|
||||
| [DINOv2 with Registers](model_doc/dinov2_with_registers) | ✅ | ❌ | ❌ |
|
||||
| [DistilBERT](model_doc/distilbert) | ✅ | ✅ | ✅ |
|
||||
| [DiT](model_doc/dit) | ✅ | ❌ | ✅ |
|
||||
| [DonutSwin](model_doc/donut) | ✅ | ❌ | ❌ |
|
||||
@ -130,6 +137,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ |
|
||||
| [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ |
|
||||
| [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ |
|
||||
| [Emu3](model_doc/emu3) | ✅ | ❌ | ❌ |
|
||||
| [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ |
|
||||
| [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ |
|
||||
| [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ |
|
||||
@ -137,6 +145,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [ESM](model_doc/esm) | ✅ | ✅ | ❌ |
|
||||
| [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ |
|
||||
| [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ |
|
||||
| [Falcon3](model_doc/falcon3) | ✅ | ❌ | ✅ |
|
||||
| [FalconMamba](model_doc/falcon_mamba) | ✅ | ❌ | ❌ |
|
||||
| [FastSpeech2Conformer](model_doc/fastspeech2_conformer) | ✅ | ❌ | ❌ |
|
||||
| [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ |
|
||||
@ -172,6 +181,7 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ |
|
||||
| [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ |
|
||||
| [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ |
|
||||
| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ |
|
||||
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ |
|
||||
| [Informer](model_doc/informer) | ✅ | ❌ | ❌ |
|
||||
| [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ |
|
||||
@ -225,6 +235,8 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ |
|
||||
| [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ |
|
||||
| [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ |
|
||||
| [ModernBERT](model_doc/modernbert) | ✅ | ❌ | ❌ |
|
||||
| [Moonshine](model_doc/moonshine) | ✅ | ❌ | ❌ |
|
||||
| [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ |
|
||||
| [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ |
|
||||
| [MPT](model_doc/mpt) | ✅ | ❌ | ❌ |
|
||||
@ -316,8 +328,10 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ |
|
||||
| [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ |
|
||||
| [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ |
|
||||
| [TextNet](model_doc/textnet) | ✅ | ❌ | ❌ |
|
||||
| [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ |
|
||||
| [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ |
|
||||
| [TimmWrapperModel](model_doc/timm_wrapper) | ✅ | ❌ | ❌ |
|
||||
| [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ |
|
||||
| [Transformer-XL](model_doc/transfo-xl) | ✅ | ✅ | ❌ |
|
||||
| [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ |
|
||||
@ -344,6 +358,8 @@ Flax), PyTorch, and/or TensorFlow.
|
||||
| [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ |
|
||||
| [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ |
|
||||
| [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ |
|
||||
| [VitPose](model_doc/vitpose) | ✅ | ❌ | ❌ |
|
||||
| [VitPoseBackbone](model_doc/vitpose_backbone) | ✅ | ❌ | ❌ |
|
||||
| [VITS](model_doc/vits) | ✅ | ❌ | ❌ |
|
||||
| [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ |
|
||||
| [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ |
|
||||
|
@ -352,6 +352,8 @@ A [`Constraint`] can be used to force the generation to include specific tokens
|
||||
|
||||
[[autodoc]] TextIteratorStreamer
|
||||
|
||||
[[autodoc]] AsyncTextIteratorStreamer
|
||||
|
||||
## Caches
|
||||
|
||||
[[autodoc]] Cache
|
||||
|
@ -156,9 +156,11 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
|
||||
There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method:
|
||||
1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.
|
||||
2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
|
||||
3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.
|
||||
3. Use `SDPBackend.MATH` in the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.
|
||||
|
||||
```py
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
|
||||
batch_size, seq_length = inputs["input_ids"].shape
|
||||
with torch.no_grad():
|
||||
past_key_values = StaticCache(
|
||||
@ -179,7 +181,7 @@ with torch.no_grad():
|
||||
decode_one_tokens = torch.compile(decode_one_tokens, mode="reduce-overhead", fullgraph=True)
|
||||
cache_position = torch.tensor([seq_length + 1], device=torch_device)
|
||||
for _ in range(1, NUM_TOKENS_TO_GENERATE):
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=False, enable_mem_efficient=False, enable_math=True):
|
||||
with sdpa_kernel(SDPBackend.MATH):
|
||||
next_token = decode_one_tokens(model, next_token.clone(), None, cache_position, past_key_values)
|
||||
generated_ids[:, cache_position] = next_token.int()
|
||||
cache_position += 1
|
||||
@ -453,10 +455,11 @@ Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and
|
||||
> [!TIP]
|
||||
> SDPA supports FlashAttention-2 as long as you have the latest PyTorch version installed.
|
||||
|
||||
Use the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to explicitly enable or disable any of the three attention algorithms. For example, set `enable_flash=True` to enable FlashAttention.
|
||||
Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
@ -464,7 +467,7 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
|
||||
outputs = model.generate(**inputs)
|
||||
```
|
||||
|
||||
@ -473,7 +476,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable
|
||||
Quantization reduces the size of the LLM weights by storing them in a lower precision. This translates to lower memory usage and makes loading LLMs for inference more accessible if you're constrained by your GPUs memory. If you aren't limited by your GPU, you don't necessarily need to quantize your model because it can incur a small latency cost (except for AWQ and fused AWQ modules) due to the extra step required to quantize and dequantize the weights.
|
||||
|
||||
> [!TIP]
|
||||
> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes.
|
||||
> There are many quantization libraries (see the [Quantization](./quantization) guide for more details) available, such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Feel free to try them out and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post which compares AutoGPTQ and bitsandbytes.
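
For instance, here is a minimal sketch of loading a model in 4-bit with bitsandbytes (the checkpoint name is just an example; any causal LM works):

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # compute in bf16 while weights stay in 4-bit
)

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",
    quantization_config=quantization_config,
    device_map="auto",
)
```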
|
||||
|
||||
Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating how much memory it costs to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1).
|
||||
|
||||
|
@ -265,8 +265,9 @@ While the autoregressive generation process is relatively straightforward, makin
|
||||
|
||||
### Related libraries
|
||||
|
||||
1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices.
|
||||
1. [`optimum`](https://github.com/huggingface/optimum), an extension of 🤗 Transformers that optimizes for specific hardware devices;
|
||||
2. [`outlines`](https://github.com/outlines-dev/outlines), a library where you can constrain text generation (e.g. to generate JSON files);
|
||||
3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation. (e.g. JSON, SQL, Python)
|
||||
3. [`SynCode`](https://github.com/uiuc-focal-lab/syncode), a library for context-free grammar guided generation (e.g. JSON, SQL, Python);
|
||||
4. [`text-generation-inference`](https://github.com/huggingface/text-generation-inference), a production-ready server for LLMs;
|
||||
5. [`text-generation-webui`](https://github.com/oobabooga/text-generation-webui), a UI for text generation;
|
||||
6. [`logits-processor-zoo`](https://github.com/NVIDIA/logits-processor-zoo), containing additional options to control text generation with 🤗 Transformers. See our related [blog post](https://huggingface.co/blog/logits-processor-zoo).
|
||||
|
@ -27,6 +27,7 @@ from transformers import AutoImageProcessor
|
||||
|
||||
processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)
|
||||
```
|
||||
Note that `use_fast` will be set to `True` by default in a future release.
|
||||
|
||||
When using a fast image processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise.
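
A minimal sketch, reusing the DETR fast processor from above and assuming a CUDA device is available (the dummy tensors stand in for real images):

```python
import torch
from transformers import AutoImageProcessor

processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50", use_fast=True)

# a dummy batch of channels-first uint8 images; replace with your own images or tensors
images = [torch.randint(0, 256, (3, 224, 224), dtype=torch.uint8) for _ in range(4)]

# run the preprocessing on the GPU
images_processed = processor(images, return_tensors="pt", device="cuda")
```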
|
||||
|
||||
@ -42,21 +43,17 @@ images_processed = processor(images, return_tensors="pt", device="cuda")
|
||||
Here are some speed comparisons between the base and fast image processors for the `DETR` and `RT-DETR` models, and how they impact overall inference time:
|
||||
|
||||
<div class="flex">
|
||||
<div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_padded.png" />
|
||||
</div>
|
||||
<div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_batched_compiled.png" />
|
||||
</div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_padded.png" />
|
||||
</div>
|
||||
<div class="flex">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_detr_fast_batched_compiled.png" />
|
||||
</div>
|
||||
|
||||
<div class="flex">
|
||||
<div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_single.png" />
|
||||
</div>
|
||||
<div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_batched.png" />
|
||||
</div>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_single.png" />
|
||||
</div>
|
||||
<div class="flex">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/benchmark_results_full_pipeline_rt_detr_fast_batched.png" />
|
||||
</div>
|
||||
|
||||
These benchmarks were run on an [AWS EC2 g5.2xlarge instance](https://aws.amazon.com/ec2/instance-types/g5/), utilizing an NVIDIA A10G Tensor Core GPU.
|
||||
|
@ -34,6 +34,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] AqlmConfig
|
||||
|
||||
## VptqConfig
|
||||
|
||||
[[autodoc]] VptqConfig
|
||||
|
||||
## AwqConfig
|
||||
|
||||
[[autodoc]] AwqConfig
|
||||
@ -53,6 +57,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
|
||||
|
||||
[[autodoc]] quantizers.base.HfQuantizer
|
||||
|
||||
## HiggsConfig
|
||||
|
||||
[[autodoc]] HiggsConfig
|
||||
|
||||
## HqqConfig
|
||||
|
||||
[[autodoc]] HqqConfig
|
||||
|
docs/source/en/model_doc/aria.md (new file, 106 lines)
@ -0,0 +1,106 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Aria
|
||||
|
||||
## Overview
|
||||
|
||||
The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team.
|
||||
|
||||
Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.*
|
||||
|
||||
This model was contributed by [m-ric](https://huggingface.co/m-ric).
|
||||
The original code can be found [here](https://github.com/rhymes-ai/Aria).
|
||||
|
||||
## Usage tips
|
||||
|
||||
Here's how to use the model for vision tasks:
|
||||
```python
|
||||
import requests
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from transformers import AriaProcessor, AriaForConditionalGeneration
|
||||
|
||||
model_id_or_path = "rhymes-ai/Aria"
|
||||
|
||||
model = AriaForConditionalGeneration.from_pretrained(
|
||||
model_id_or_path, device_map="auto"
|
||||
)
|
||||
|
||||
processor = AriaProcessor.from_pretrained(model_id_or_path)
|
||||
|
||||
image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image"},
|
||||
{"text": "what is the image?", "type": "text"},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
text = processor.apply_chat_template(messages, add_generation_prompt=True)
|
||||
inputs = processor(text=text, images=image, return_tensors="pt")
|
||||
inputs.to(model.device)
|
||||
|
||||
output = model.generate(
|
||||
**inputs,
|
||||
max_new_tokens=15,
|
||||
stop_strings=["<|im_end|>"],
|
||||
tokenizer=processor.tokenizer,
|
||||
do_sample=True,
|
||||
temperature=0.9,
|
||||
)
|
||||
output_ids = output[0][inputs["input_ids"].shape[1]:]
|
||||
response = processor.decode(output_ids, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
|
||||
## AriaImageProcessor
|
||||
|
||||
[[autodoc]] AriaImageProcessor
|
||||
|
||||
## AriaProcessor
|
||||
|
||||
[[autodoc]] AriaProcessor
|
||||
|
||||
## AriaTextConfig
|
||||
|
||||
[[autodoc]] AriaTextConfig
|
||||
|
||||
## AriaConfig
|
||||
|
||||
[[autodoc]] AriaConfig
|
||||
|
||||
## AriaTextModel
|
||||
|
||||
[[autodoc]] AriaTextModel
|
||||
|
||||
## AriaTextForCausalLM
|
||||
|
||||
[[autodoc]] AriaTextForCausalLM
|
||||
|
||||
## AriaForConditionalGeneration
|
||||
|
||||
[[autodoc]] AriaForConditionalGeneration
|
||||
- forward
|
docs/source/en/model_doc/bamba.md (new file, 64 lines)
@ -0,0 +1,64 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Bamba
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
Bamba-9B is a decoder-only language model based on the [Mamba-2](https://github.com/state-spaces/mamba) architecture and is designed to handle a wide range of text generation tasks. It is trained from scratch using a two-stage training approach. In the first stage, the model is trained on 2 trillion tokens from the Dolma v1.7 dataset. In the second stage, it undergoes additional training on 200 billion tokens, leveraging a carefully curated blend of high-quality data to further refine its performance and enhance output quality.
|
||||
|
||||
Check out all Bamba-9B model checkpoints [here](https://github.com/foundation-model-stack/bamba).
|
||||
|
||||
## BambaConfig
|
||||
|
||||
| Model | Params | # Layers | Hidden Dim. | Attention Heads | GQA | KV Heads | Context Length | Tied Embeddings |
|
||||
|-------------------|--------------|----------|-------------|-----------------|-----|----------|----------------|------------------|
|
||||
| Bamba | 9B (9.78B) | 32 | 4096 | 32 | Yes | 8 | 4096 | True |
|
||||
|
||||
[[autodoc]] BambaConfig
|
||||
|
||||
<!---
|
||||
## Usage Tips
|
||||
|
||||
Tips:
|
||||
|
||||
- The architecture is based on Mamba-2 models.
|
||||
|
||||
## BambaModel
|
||||
|
||||
[[autodoc]] BambaModel
|
||||
- forward
|
||||
-->
|
||||
|
||||
## BambaForCausalLM
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("ibm-fms/Bamba-9B")
|
||||
tokenizer = AutoTokenizer.from_pretrained("ibm-fms/Bamba-9B")
|
||||
|
||||
message = ["Mamba is a snake with following properties "]
|
||||
inputs = tokenizer(message, return_tensors='pt', return_token_type_ids=False)
|
||||
response = model.generate(**inputs, max_new_tokens=64)
|
||||
print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
|
||||
```
|
||||
|
||||
[[autodoc]] BambaForCausalLM
|
||||
- forward
|
||||
|
||||
This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
|
@ -71,6 +71,43 @@ alt="drawing" width="600"/>
|
||||
|
||||
<small> BEiT pre-training. Taken from the <a href="https://arxiv.org/abs/2106.08254">original paper.</a> </small>
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
|
||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
|
||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
|
||||
page for more information.
|
||||
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
```
|
||||
from transformers import BeitForImageClassification
|
||||
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224", attn_implementation="sdpa", torch_dtype=torch.float16)
|
||||
...
|
||||
```
|
||||
|
||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
|
||||
|
||||
On a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04) with `float16` and
|
||||
`microsoft/beit-base-patch16-224` model, we saw the following improvements during training and inference:
|
||||
|
||||
#### Training
|
||||
|
||||
| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) |
|
||||
|--------------------|------------|--------------|---------|----------------------------|---------------------------|-------------|----------------------|--------------------|----------------|
|
||||
| 50 | 2 | (1048, 640) | True | 0.984 | 0.746 | 31.975 | 6738.915 | 4319.886 | 55.998 |
|
||||
|
||||
#### Inference
|
||||
|
||||
| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved (%) |
|
||||
|-------------------:|-----------------:|:--------------|--------------------:|----------------:|:-------------|-------------------:|---------------:|----------------------:|
|
||||
| 1 | 0.012 | ±0.3% | 3.76657e+08 | 0.011 | ±0.5% | 3.75739e+08 | 1.05 | 0.244 |
|
||||
| 4 | 0.013 | ±0.1% | 4.03147e+08 | 0.011 | ±0.2% | 3.90554e+08 | 1.178 | 3.225 |
|
||||
| 16 | 0.045 | ±0.1% | 4.96697e+08 | 0.035 | ±0.1% | 4.51232e+08 | 1.304 | 10.076 |
|
||||
| 32 | 0.088 | ±0.1% | 6.24417e+08 | 0.066 | ±0.1% | 5.33488e+08 | 1.325 | 17.044 |
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT.
|
||||
|
docs/source/en/model_doc/cohere2.md (new file, 51 lines)
@ -0,0 +1,51 @@
|
||||
# Cohere
|
||||
|
||||
## Overview
|
||||
[C4AI Command R7B](https://cohere.com/blog/command-r7b) is an open-weights research release of a 7-billion-parameter model developed by Cohere and Cohere For AI. It has advanced capabilities optimized for various use cases, including reasoning, summarization, question answering, and code. The model is trained to perform sophisticated tasks including Retrieval Augmented Generation (RAG) and tool use. The model also has powerful agentic capabilities that can use and combine multiple tools over multiple steps to accomplish more difficult tasks. It obtains top performance on enterprise-relevant code use cases. C4AI Command R7B is a multilingual model trained on 23 languages.
|
||||
|
||||
The model features three layers with sliding window attention (window size 4096) and RoPE for efficient local context modeling and relative positional encoding. A fourth layer uses global attention without positional embeddings, enabling unrestricted token interactions across the entire sequence.
|
||||
|
||||
The model has been trained on 23 languages: English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Arabic, Chinese, Russian, Polish, Turkish, Vietnamese, Dutch, Czech, Indonesian, Ukrainian, Romanian, Greek, Hindi, Hebrew, and Persian.
|
||||
|
||||
## Usage tips
|
||||
The model and tokenizer can be loaded via:
|
||||
|
||||
```python
|
||||
# pip install transformers
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
model_id = "CohereForAI/c4ai-command-r7b-12-2024"
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
|
||||
# Format message with the command-r chat template
|
||||
messages = [{"role": "user", "content": "Hello, how are you?"}]
|
||||
input_ids = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
|
||||
|
||||
gen_tokens = model.generate(
|
||||
input_ids,
|
||||
max_new_tokens=100,
|
||||
do_sample=True,
|
||||
temperature=0.3,
|
||||
)
|
||||
|
||||
gen_text = tokenizer.decode(gen_tokens[0])
|
||||
print(gen_text)
|
||||
```
|
||||
|
||||
## Cohere2Config
|
||||
|
||||
[[autodoc]] Cohere2Config
|
||||
|
||||
## Cohere2Model
|
||||
|
||||
[[autodoc]] Cohere2Model
|
||||
- forward
|
||||
|
||||
|
||||
## Cohere2ForCausalLM
|
||||
|
||||
[[autodoc]] Cohere2ForCausalLM
|
||||
- forward
|
||||
|
||||
|
docs/source/en/model_doc/colpali.md (new file, 90 lines)
@ -0,0 +1,90 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# ColPali
|
||||
|
||||
## Overview
|
||||
|
||||
The *ColPali* model was proposed in [ColPali: Efficient Document Retrieval with Vision Language Models](https://doi.org/10.48550/arXiv.2407.01449) by **Manuel Faysse***, **Hugues Sibille***, **Tony Wu***, Bilel Omrani, Gautier Viaud, Céline Hudelot, Pierre Colombo (* denotes equal contribution). Work led by ILLUIN Technology.
|
||||
|
||||
In our proposed *ColPali* approach, we leverage VLMs to construct efficient multi-vector embeddings directly from document images (“screenshots”) for document retrieval. We train the model to maximize the similarity between these document embeddings and the corresponding query embeddings, using the late interaction method introduced in ColBERT.
|
||||
|
||||
Using *ColPali* removes the need for potentially complex and brittle layout recognition and OCR pipelines with a single model that can take into account both the textual and visual content (layout, charts, etc.) of a document.
|
||||
|
||||
## Resources
|
||||
|
||||
- The *ColPali* arXiv paper can be found [here](https://doi.org/10.48550/arXiv.2407.01449). 📄
|
||||
- The official blog post detailing ColPali can be found [here](https://huggingface.co/blog/manu/colpali). 📝
|
||||
- The original model implementation code for the ColPali model and for the `colpali-engine` package can be found [here](https://github.com/illuin-tech/colpali). 🌎
|
||||
- Cookbooks for learning to use the transformers-native version of *ColPali*, fine-tuning, and similarity maps generation can be found [here](https://github.com/tonywu71/colpali-cookbooks). 📚
|
||||
|
||||
This model was contributed by [@tonywu71](https://huggingface.co/tonywu71) and [@yonigozlan](https://huggingface.co/yonigozlan).
|
||||
|
||||
## Usage
|
||||
|
||||
This example demonstrates how to use *ColPali* to embed both queries and images, calculate their similarity scores, and identify the most relevant matches. For a specific query, you can retrieve the top-k most similar images by selecting the ones with the highest similarity scores.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from transformers import ColPaliForRetrieval, ColPaliProcessor
|
||||
|
||||
model_name = "vidore/colpali-v1.2-hf"
|
||||
|
||||
model = ColPaliForRetrieval.from_pretrained(
|
||||
model_name,
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda:0", # or "mps" if on Apple Silicon
|
||||
).eval()
|
||||
|
||||
processor = ColPaliProcessor.from_pretrained(model_name)
|
||||
|
||||
# Your inputs (replace dummy images with screenshots of your documents)
|
||||
images = [
|
||||
Image.new("RGB", (32, 32), color="white"),
|
||||
Image.new("RGB", (16, 16), color="black"),
|
||||
]
|
||||
queries = [
|
||||
"What is the organizational structure for our R&D department?",
|
||||
"Can you provide a breakdown of last year’s financial performance?",
|
||||
]
|
||||
|
||||
# Process the inputs
|
||||
batch_images = processor(images=images).to(model.device)
|
||||
batch_queries = processor(text=queries).to(model.device)
|
||||
|
||||
# Forward pass
|
||||
with torch.no_grad():
|
||||
image_embeddings = model(**batch_images).embeddings
|
||||
query_embeddings = model(**batch_queries).embeddings
|
||||
|
||||
# Score the queries against the images
|
||||
scores = processor.score_retrieval(query_embeddings, image_embeddings)
|
||||
```
|
||||
|
||||
## ColPaliConfig
|
||||
|
||||
[[autodoc]] ColPaliConfig
|
||||
|
||||
## ColPaliProcessor
|
||||
|
||||
[[autodoc]] ColPaliProcessor
|
||||
|
||||
## ColPaliForRetrieval
|
||||
|
||||
[[autodoc]] ColPaliForRetrieval
|
||||
- forward
|
@ -48,6 +48,46 @@ The original code for vision can be found [here](https://github.com/facebookrese
|
||||
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
|
||||
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
|
||||
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
|
||||
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
|
||||
page for more information.
|
||||
|
||||
SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
|
||||
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
|
||||
|
||||
The SDPA implementation is currently available for the Data2VecAudio and Data2VecVision models.
|
||||
|
||||
```
|
||||
from transformers import Data2VecVisionForImageClassification
|
||||
model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base", attn_implementation="sdpa", torch_dtype=torch.float16)
|
||||
...
|
||||
```
|
||||
|
||||
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
|
||||
|
||||
For the Data2VecVision model, on a local benchmark (NVIDIA GeForce RTX 2060-8GB, PyTorch 2.5.1, OS Ubuntu 20.04)
|
||||
with `float16` and `facebook/data2vec-vision-base` model, we saw the following improvements during training and
|
||||
inference:
|
||||
|
||||
#### Training
|
||||
|
||||
| num_training_steps | batch_size | image_size | is_cuda | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | SDPA peak mem (MB) | Mem saving (%) |
|
||||
|--------------------|------------|--------------|---------|----------------------------|---------------------------|-------------|----------------------|--------------------|----------------|
|
||||
| 50 | 2 | (1048, 640) | True | 0.996 | 0.754 | 32.147 | 6722.198 | 4264.653 | 57.626 |
|
||||
|
||||
#### Inference
|
||||
|
||||
| Image batch size | Eager (s/iter) | Eager CI, % | Eager memory (MB) | SDPA (s/iter) | SDPA CI, % | SDPA memory (MB) | SDPA speedup | SDPA memory saved |
|
||||
|-------------------:|-----------------:|:--------------|--------------------:|----------------:|:-------------|-------------------:|---------------:|--------------------:|
|
||||
| 1 | 0.011 | ±0.3% | 3.76143e+08 | 0.01 | ±0.3% | 3.74397e+08 | 1.101 | 0.466 |
|
||||
| 4 | 0.014 | ±0.1% | 4.02756e+08 | 0.012 | ±0.2% | 3.91373e+08 | 1.219 | 2.909 |
|
||||
| 16 | 0.046 | ±0.3% | 4.96482e+08 | 0.035 | ±0.2% | 4.51017e+08 | 1.314 | 10.081 |
|
||||
| 32 | 0.088 | ±0.1% | 6.23903e+08 | 0.067 | ±0.1% | 5.32974e+08 | 1.33 | 17.061 |
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Data2Vec.
|
||||
|
docs/source/en/model_doc/diffllama.md (new file, 59 lines)
@ -0,0 +1,59 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# DiffLlama
|
||||
|
||||
## Overview
|
||||
|
||||
The DiffLlama model was proposed in [Differential Transformer](https://arxiv.org/abs/2410.05258) by Kazuma Matsumoto and .
|
||||
This model combines the Llama architecture with the Differential Transformer's attention mechanism.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Transformer tends to overallocate attention to irrelevant context. In this work, we introduce Diff Transformer, which amplifies attention to the relevant context while canceling noise. Specifically, the differential attention mechanism calculates attention scores as the difference between two separate softmax attention maps. The subtraction cancels noise, promoting the emergence of sparse attention patterns. Experimental results on language modeling show that Diff Transformer outperforms Transformer in various settings of scaling up model size and training tokens. More intriguingly, it offers notable advantages in practical applications, such as long-context modeling, key information retrieval, hallucination mitigation, in-context learning, and reduction of activation outliers. By being less distracted by irrelevant context, Diff Transformer can mitigate hallucination in question answering and text summarization. For in-context learning, Diff Transformer not only enhances accuracy but is also more robust to order permutation, which was considered as a chronic robustness issue. The results position Diff Transformer as a highly effective and promising architecture to advance large language models.*
|
||||
|
||||
### Usage tips
|
||||
The hyperparameters of this model are the same as those of the Llama model.
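
A minimal loading sketch (the checkpoint path is hypothetical — substitute a real DiffLlama checkpoint or a configuration you instantiated yourself):

```python
from transformers import AutoTokenizer, DiffLlamaForCausalLM

# hypothetical checkpoint path, used only for illustration
checkpoint = "path/to/diffllama-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = DiffLlamaForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Differential attention helps the model", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```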
|
||||
|
||||
|
||||
## DiffLlamaConfig
|
||||
|
||||
[[autodoc]] DiffLlamaConfig
|
||||
|
||||
## DiffLlamaModel
|
||||
|
||||
[[autodoc]] DiffLlamaModel
|
||||
- forward
|
||||
|
||||
## DiffLlamaForCausalLM
|
||||
|
||||
[[autodoc]] DiffLlamaForCausalLM
|
||||
- forward
|
||||
|
||||
## DiffLlamaForSequenceClassification
|
||||
|
||||
[[autodoc]] DiffLlamaForSequenceClassification
|
||||
- forward
|
||||
|
||||
## DiffLlamaForQuestionAnswering
|
||||
|
||||
[[autodoc]] DiffLlamaForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## DiffLlamaForTokenClassification
|
||||
|
||||
[[autodoc]] DiffLlamaForTokenClassification
|
||||
- forward
|
docs/source/en/model_doc/dinov2_with_registers.md (new file, 54 lines)
@ -0,0 +1,54 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DINOv2 with Registers
|
||||
|
||||
## Overview
|
||||
|
||||
The DINOv2 with Registers model was proposed in [Vision Transformers Need Registers](https://arxiv.org/abs/2309.16588) by Timothée Darcet, Maxime Oquab, Julien Mairal, Piotr Bojanowski.
|
||||
|
||||
The [Vision Transformer](vit) (ViT) is a transformer encoder model (BERT-like) originally introduced to do supervised image classification on ImageNet.
|
||||
|
||||
Next, people figured out ways to make ViT work really well on self-supervised image feature extraction (i.e. learning meaningful features, also called embeddings) on images without requiring any labels. Some example papers here include [DINOv2](dinov2) and [MAE](vit_mae).
|
||||
|
||||
The authors of DINOv2 noticed that ViTs have artifacts in attention maps. It’s due to the model using some image patches as “registers”. The authors propose a fix: just add some new tokens (called "register" tokens), which you only use during pre-training (and throw away afterwards). This results in:
|
||||
- no artifacts
|
||||
- interpretable attention maps
|
||||
- and improved performances.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Transformers have recently emerged as a powerful tool for learning visual representations. In this paper, we identify and characterize artifacts in feature maps of both supervised and self-supervised ViT networks. The artifacts correspond to high-norm tokens appearing during inference primarily in low-informative background areas of images, that are repurposed for internal computations. We propose a simple yet effective solution based on providing additional tokens to the input sequence of the Vision Transformer to fill that role. We show that this solution fixes that problem entirely for both supervised and self-supervised models, sets a new state of the art for self-supervised visual models on dense visual prediction tasks, enables object discovery methods with larger models, and most importantly leads to smoother feature maps and attention maps for downstream visual processing.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dinov2_with_registers_visualization.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> Visualization of attention maps of various models trained with vs. without registers. Taken from the <a href="https://arxiv.org/abs/2309.16588">original paper</a>. </small>
|
||||
|
||||
Tips:
|
||||
|
||||
- Usage of DINOv2 with Registers is identical to DINOv2 without registers; you'll just get better performance (see the sketch below).
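
A minimal feature-extraction sketch (the checkpoint name is an assumption — substitute any converted DINOv2-with-registers checkpoint):

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, Dinov2WithRegistersModel

checkpoint = "facebook/dinov2-with-registers-base"  # assumed checkpoint name
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = Dinov2WithRegistersModel.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# pooled [CLS] features, usable exactly like plain DINOv2 features
print(outputs.pooler_output.shape)
```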
|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr).
|
||||
The original code can be found [here](https://github.com/facebookresearch/dinov2).
|
||||
|
||||
|
||||
## Dinov2WithRegistersConfig
|
||||
|
||||
[[autodoc]] Dinov2WithRegistersConfig
|
||||
|
||||
## Dinov2WithRegistersModel
|
||||
|
||||
[[autodoc]] Dinov2WithRegistersModel
|
||||
- forward
|
||||
|
||||
## Dinov2WithRegistersForImageClassification
|
||||
|
||||
[[autodoc]] Dinov2WithRegistersForImageClassification
|
||||
- forward
|
docs/source/en/model_doc/emu3.md (new file, 179 lines)
@ -0,0 +1,179 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Emu3
|
||||
|
||||
## Overview
|
||||
|
||||
The Emu3 model was proposed in [Emu3: Next-Token Prediction is All You Need](https://arxiv.org/abs/2409.18869) by Xinlong Wang, Xiaosong Zhang, Zhengxiong Luo, Quan Sun, Yufeng Cui, Jinsheng Wang, Fan Zhang, Yueze Wang, Zhen Li, Qiying Yu, Yingli Zhao, Yulong Ao, Xuebin Min, Tao Li, Boya Wu, Bo Zhao, Bowen Zhang, Liangdong Wang, Guang Liu, Zheqi He, Xi Yang, Jingjing Liu, Yonghua Lin, Tiejun Huang, Zhongyuan Wang.
|
||||
|
||||
Emu3 is a multimodal LLM that uses vector quantization to tokenize images into discrete tokens. Discretized image tokens are later fused with text token ids for image and text generation. The model can additionally generate images by predicting image token ids.
|
||||
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*While next-token prediction is considered a promising path towards artificial general intelligence, it has struggled to excel in multimodal tasks, which are still dominated by diffusion models (e.g., Stable Diffusion) and compositional approaches (e.g., CLIP combined with LLMs). In this paper, we introduce Emu3, a new suite of state-of-the-art multimodal models trained solely with next-token prediction. By tokenizing images, text, and videos into a discrete space, we train a single transformer from scratch on a mixture of multimodal sequences. Emu3 outperforms several well-established task-specific models in both generation and perception tasks, surpassing flagship models such as SDXL and LLaVA-1.6, while eliminating the need for diffusion or compositional architectures. Emu3 is also capable of generating high-fidelity video via predicting the next token in a video sequence. We simplify complex multimodal model designs by converging on a singular focus: tokens, unlocking great potential for scaling both during training and inference. Our results demonstrate that next-token prediction is a promising path towards building general multimodal intelligence beyond language. We open-source key techniques and models to support further research in this direction.*
|
||||
|
||||
Tips:

- We advise users to set `processor.tokenizer.padding_side = "left"` before batched generation, as it leads to more accurate results.

- Note that the model has been trained with a specific prompt format for chatting. Use `processor.apply_chat_template(my_conversation_dict)` to correctly format your prompts (see the sketch below).

- Emu3 has two different checkpoints, one for image generation and one for text generation; make sure to load the correct checkpoint for your task. To generate an image, it is advised to use `prefix_allowed_tokens_fn` so that the generated tokens are sampled only from valid image tokens. See the usage examples below.

> [!TIP]
> The Emu3 implementation in Transformers uses a special image token to indicate where to merge image embeddings. The special image token isn't new and reuses one of the reserved tokens: `<|extra_0|>`. You have to add `<image>` to your prompt in the place where the image should be embedded for correct generation.
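For reference, here is a minimal sketch of building a chat-formatted prompt with `apply_chat_template`. The conversation below follows the standard Transformers chat format for multimodal processors; the exact keys accepted should be verified against the checkpoint's chat template.

```python
# Minimal sketch (assumes `processor` is an Emu3Processor loaded as in the examples below).
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What do you see in this image?"},
        ],
    }
]
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
```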
|
||||
|
||||
|
||||
This model was contributed by [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
|
||||
The original code can be found [here](https://github.com/baaivision/Emu3).
|
||||
|
||||
|
||||
## Usage example
|
||||
|
||||
### Text generation inference
|
||||
|
||||
Here's how to load the model and perform inference in half-precision (`torch.bfloat16`) to generate textual output from text or text and image inputs:
|
||||
|
||||
```python
from transformers import Emu3Processor, Emu3ForConditionalGeneration
import torch
from PIL import Image
import requests

processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Chat-hf")
model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Chat-hf", torch_dtype=torch.bfloat16, device_map="cuda")

# prepare image and text prompt
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
prompt = "What do you see in this image?<image>"

inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=50)
print(processor.decode(output[0], skip_special_tokens=True))
```
|
||||
|
||||
### Image generation inference
|
||||
|
||||
Emu3 can also generate images from textual input. Here is how you can do it:
|
||||
|
||||
```python
import torch
from transformers import Emu3Processor, Emu3ForConditionalGeneration

processor = Emu3Processor.from_pretrained("Emu3-community/Emu3-Gen-hf")
model = Emu3ForConditionalGeneration.from_pretrained("Emu3-community/Emu3-Gen-hf", torch_dtype="bfloat16", device_map="auto", attn_implementation="flash_attention_2")

inputs = processor(
    text=["a portrait of young girl. masterpiece, film grained, best quality.", "a dog running under the rain"],
    padding=True,
    return_tensors="pt",
    return_for_image_generation=True,
)
inputs = inputs.to(device="cuda:0", dtype=torch.bfloat16)

neg_prompt = "lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry."
neg_inputs = processor(text=[neg_prompt] * 2, return_tensors="pt").to(device="cuda:0")

image_sizes = inputs.pop("image_sizes")
HEIGHT, WIDTH = image_sizes[0]
VISUAL_TOKENS = model.vocabulary_mapping.image_tokens

def prefix_allowed_tokens_fn(batch_id, input_ids):
    height, width = HEIGHT, WIDTH
    visual_tokens = VISUAL_TOKENS
    image_wrapper_token_id = torch.tensor([processor.tokenizer.image_wrapper_token_id], device=model.device)
    eoi_token_id = torch.tensor([processor.tokenizer.eoi_token_id], device=model.device)
    eos_token_id = torch.tensor([processor.tokenizer.eos_token_id], device=model.device)
    pad_token_id = torch.tensor([processor.tokenizer.pad_token_id], device=model.device)
    eof_token_id = torch.tensor([processor.tokenizer.eof_token_id], device=model.device)
    eol_token_id = processor.tokenizer.encode("<|extra_200|>", return_tensors="pt")[0]

    position = torch.nonzero(input_ids == image_wrapper_token_id, as_tuple=True)[0][0]
    offset = input_ids.shape[0] - position
    if offset % (width + 1) == 0:
        return (eol_token_id, )
    elif offset == (width + 1) * height + 1:
        return (eof_token_id, )
    elif offset == (width + 1) * height + 2:
        return (eoi_token_id, )
    elif offset == (width + 1) * height + 3:
        return (eos_token_id, )
    elif offset > (width + 1) * height + 3:
        return (pad_token_id, )
    else:
        return visual_tokens


out = model.generate(
    **inputs,
    max_new_tokens=50_000,  # make sure to have enough tokens for one image
    prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
    return_dict_in_generate=True,
    negative_prompt_ids=neg_inputs.input_ids,  # indicate for Classifier-Free Guidance
    negative_prompt_attention_mask=neg_inputs.attention_mask,
)

image = model.decode_image_tokens(out.sequences[:, inputs.input_ids.shape[1]: ], height=HEIGHT, width=WIDTH)
images = processor.postprocess(list(image.float()), return_tensors="PIL.Image.Image")  # internally we convert to np but it's not supported in bf16 precision
for i, image in enumerate(images['pixel_values']):
    image.save(f"result{i}.png")
```
|
||||
|
||||
|
||||
## Emu3Config
|
||||
|
||||
[[autodoc]] Emu3Config
|
||||
|
||||
## Emu3VQVAEConfig
|
||||
|
||||
[[autodoc]] Emu3VQVAEConfig
|
||||
|
||||
## Emu3TextConfig
|
||||
|
||||
[[autodoc]] Emu3TextConfig
|
||||
|
||||
## Emu3Processor
|
||||
|
||||
[[autodoc]] Emu3Processor
|
||||
|
||||
## Emu3ImageProcessor
|
||||
|
||||
[[autodoc]] Emu3ImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Emu3VQVAE
|
||||
|
||||
[[autodoc]] Emu3VQVAE
|
||||
- forward
|
||||
|
||||
## Emu3TextModel
|
||||
|
||||
[[autodoc]] Emu3TextModel
|
||||
- forward
|
||||
|
||||
## Emu3ForCausalLM
|
||||
|
||||
[[autodoc]] Emu3ForCausalLM
|
||||
- forward
|
||||
|
||||
## Emu3ForConditionalGeneration
|
||||
|
||||
[[autodoc]] Emu3ForConditionalGeneration
|
||||
- forward
|
docs/source/en/model_doc/falcon3.md (new file, 29 lines)
@@ -0,0 +1,29 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Falcon3
|
||||
|
||||
## Overview
|
||||
|
||||
Falcon3 represents a natural evolution from previous releases, with a focus on expanding the models' science, math, and code capabilities. This iteration includes five base models: Falcon3-1B-Base, Falcon3-3B-Base, Falcon3-Mamba-7B-Base, Falcon3-7B-Base, and Falcon3-10B-Base. In developing these models, we incorporated several key innovations aimed at improving the models' performance while reducing training costs:

- One pre-training: We conducted a single large-scale pretraining run on the 7B model, using 2048 H100 GPUs and leveraging 14 trillion tokens featuring web, code, STEM, and curated high-quality and multilingual data.
- Depth up-scaling for improved reasoning: Building on recent studies on the effects of model depth, we upscaled the 7B model to a 10B-parameter model by duplicating the redundant layers and continuing pre-training with 2TT (2 trillion tokens) of high-quality data. This yielded Falcon3-10B-Base, which achieves state-of-the-art zero-shot and few-shot performance for models under 13B parameters.
- Knowledge distillation for better tiny models: To provide compact and efficient alternatives, we developed Falcon3-1B-Base and Falcon3-3B-Base by leveraging pruning and knowledge distillation techniques on less than 100GT (100 billion tokens) of curated high-quality data, thereby redefining pre-training efficiency.
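As a quick start, here is a minimal sketch of loading one of the Falcon3 checkpoints for text generation with the auto classes. The checkpoint name `tiiuae/Falcon3-7B-Instruct` is an assumption and should be checked against the collection linked in the resources below.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed checkpoint name; see the Falcon3 collection on the Hub for the released identifiers.
model_id = "tiiuae/Falcon3-7B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("Explain the Pythagorean theorem in one sentence.", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```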
|
||||
|
||||
## Resources
|
||||
- [Blog post](https://huggingface.co/blog/falcon3)
|
||||
- [Models on Huggingface](https://huggingface.co/collections/tiiuae/falcon3-67605ae03578be86e4e87026)
|
@ -141,7 +141,7 @@ Do note that when training Idefics2 on multi-turn conversations between a user a
|
||||
|
||||
## Model optimizations: Flash Attention
|
||||
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
||||
|
@ -51,6 +51,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts)
|
||||
|
||||
[[autodoc]] Idefics3Config
|
||||
|
||||
## Idefics3VisionConfig
|
||||
|
||||
[[autodoc]] Idefics3VisionConfig
|
||||
|
||||
## Idefics3VisionTransformer
|
||||
|
||||
[[autodoc]] Idefics3VisionTransformer
|
||||
|
||||
## Idefics3Model
|
||||
|
||||
|
@ -18,13 +18,18 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
## Overview
|
||||
|
||||
The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/pdf/2301.08243.pdf) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas.
|
||||
The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas.
|
||||
I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image-based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample target blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transformers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/ijepa_architecture.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> I-JEPA architecture. Taken from the <a href="https://arxiv.org/abs/2301.08243">original paper.</a> </small>
|
||||
|
||||
This model was contributed by [jmtzt](https://huggingface.co/jmtzt).
|
||||
The original code can be found [here](https://github.com/facebookresearch/ijepa).
|
||||
|
||||
@ -45,7 +50,7 @@ url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
|
||||
image_1 = Image.open(requests.get(url_1, stream=True).raw)
|
||||
image_2 = Image.open(requests.get(url_2, stream=True).raw)
|
||||
|
||||
model_id = "jmtzt/ijepa_vith14_1k"
|
||||
model_id = "facebook/ijepa_vith14_1k"
|
||||
processor = AutoProcessor.from_pretrained(model_id)
|
||||
model = AutoModel.from_pretrained(model_id)
|
||||
|
||||
@ -63,6 +68,15 @@ similarity = cosine_similarity(embed_1, embed_2)
|
||||
print(similarity)
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA.
|
||||
|
||||
<PipelineTag pipeline="image-classification"/>
|
||||
|
||||
- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
|
||||
- See also: [Image classification task guide](../tasks/image_classification)
|
||||
|
||||
## IJepaConfig
|
||||
|
||||
[[autodoc]] IJepaConfig
|
||||
@ -75,4 +89,4 @@ print(similarity)
|
||||
## IJepaForImageClassification
|
||||
|
||||
[[autodoc]] IJepaForImageClassification
|
||||
- forward
|
||||
- forward
|
@ -131,7 +131,7 @@ prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=T
|
||||
prompts = [prompt_1, prompt_2]
|
||||
|
||||
# We can simply feed images in the order they have to be used in the text prompt
|
||||
inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16)
|
||||
inputs = processor(images=[image_stop, image_cats], text=prompts, padding=True, return_tensors="pt").to(model.device, torch.float16)
|
||||
|
||||
# Generate
|
||||
generate_ids = model.generate(**inputs, max_new_tokens=30)
|
||||
|
@ -240,7 +240,7 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-N
|
||||
|
||||
### Flash-Attention 2 to speed-up generation
|
||||
|
||||
Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2:
|
||||
|
||||
|
@ -91,7 +91,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t
|
||||
|
||||
## Speeding up Mistral by using Flash Attention
|
||||
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
||||
|
@ -93,7 +93,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t
|
||||
|
||||
## Speeding up Mixtral by using Flash Attention
|
||||
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
||||
|
docs/source/en/model_doc/modernbert.md (new file, 95 lines)
@@ -0,0 +1,95 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# ModernBERT
|
||||
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<a href="https://huggingface.co/models?filter=modernbert">
|
||||
<img alt="Models" src="https://img.shields.io/badge/All_model_pages-modernbert-blueviolet">
|
||||
</a>
|
||||
<a href="https://arxiv.org/abs/2412.13663">
|
||||
<img alt="Paper page" src="https://img.shields.io/badge/Paper%20page-2412.13663-green">
|
||||
</a>
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
The ModernBERT model was proposed in [Smarter, Better, Faster, Longer: A Modern Bidirectional Encoder for Fast, Memory Efficient, and Long Context Finetuning and Inference](https://arxiv.org/abs/2412.13663) by Benjamin Warner, Antoine Chaffin, Benjamin Clavié, Orion Weller, Oskar Hallström, Said Taghadouini, Alexis Gallagher, Raja Biswas, Faisal Ladhak, Tom Aarsen, Nathan Cooper, Griffin Adams, Jeremy Howard and Iacopo Poli.
|
||||
|
||||
It is a refresh of the traditional encoder architecture, as used in previous models such as [BERT](https://huggingface.co/docs/transformers/en/model_doc/bert) and [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta).
|
||||
|
||||
It builds on BERT and implements many modern architectural improvements which have been developed since its original release, such as:
- [Rotary Positional Embeddings](https://huggingface.co/blog/designing-positional-encoding) to support sequences of up to 8192 tokens.
- [Unpadding](https://arxiv.org/abs/2208.08124) to ensure no compute is wasted on padding tokens, speeding up processing time for batches with mixed-length sequences.
- [GeGLU](https://arxiv.org/abs/2002.05202) layers replacing the original MLP layers, shown to improve performance.
- [Alternating Attention](https://arxiv.org/abs/2004.05150v2), where most attention layers employ a sliding window of 128 tokens, with global attention used only every 3 layers.
- [Flash Attention](https://github.com/Dao-AILab/flash-attention) to speed up processing.
- A design following the recommendations of [The Case for Co-Designing Model Architectures with Hardware](https://arxiv.org/abs/2401.14489), ensuring maximum efficiency across inference GPUs.
- Modern training data scales (2 trillion tokens) and mixtures (including code and math data).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Encoder-only transformer models such as BERT offer a great performance-size tradeoff for retrieval and classification tasks with respect to larger decoder-only models. Despite being the workhorse of numerous production pipelines, there have been limited Pareto improvements to BERT since its release. In this paper, we introduce ModernBERT, bringing modern model optimizations to encoder-only models and representing a major Pareto improvement over older encoders. Trained on 2 trillion tokens with a native 8192 sequence length, ModernBERT models exhibit state-of-the-art results on a large pool of evaluations encompassing diverse classification tasks and both single and multi-vector retrieval on different domains (including code). In addition to strong downstream performance, ModernBERT is also the most speed and memory efficient encoder and is designed for inference on common GPUs.*
|
||||
|
||||
The original code can be found [here](https://github.com/answerdotai/modernbert).
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ModernBert.
|
||||
|
||||
<PipelineTag pipeline="text-classification"/>
|
||||
|
||||
- A notebook on how to [finetune for General Language Understanding Evaluation (GLUE) with Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb), also available as a Google Colab [notebook](https://colab.research.google.com/github/AnswerDotAI/ModernBERT/blob/main/examples/finetune_modernbert_on_glue.ipynb). 🌎
|
||||
|
||||
<PipelineTag pipeline="sentence-similarity"/>
|
||||
|
||||
- A script on how to [finetune for text similarity or information retrieval with Sentence Transformers](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_st.py). 🌎
|
||||
- A script on how to [finetune for information retrieval with PyLate](https://github.com/AnswerDotAI/ModernBERT/blob/main/examples/train_pylate.py). 🌎
|
||||
|
||||
<PipelineTag pipeline="fill-mask"/>
|
||||
|
||||
- [Masked language modeling task guide](../tasks/masked_language_modeling)
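For a quick check of the masked-language-modeling head, here is a minimal sketch using the `fill-mask` pipeline. The checkpoint name `answerdotai/ModernBERT-base` is assumed and should be confirmed on the Hub.

```python
from transformers import pipeline

# Assumed checkpoint name; confirm the exact identifier on the Hugging Face Hub.
fill_mask = pipeline("fill-mask", model="answerdotai/ModernBERT-base")

for prediction in fill_mask("The capital of France is [MASK]."):
    print(f"{prediction['token_str']!r}: {prediction['score']:.3f}")
```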
|
||||
|
||||
|
||||
## ModernBertConfig
|
||||
|
||||
[[autodoc]] ModernBertConfig
|
||||
|
||||
<frameworkcontent>
|
||||
<pt>
|
||||
|
||||
## ModernBertModel
|
||||
|
||||
[[autodoc]] ModernBertModel
|
||||
- forward
|
||||
|
||||
## ModernBertForMaskedLM
|
||||
|
||||
[[autodoc]] ModernBertForMaskedLM
|
||||
- forward
|
||||
|
||||
## ModernBertForSequenceClassification
|
||||
|
||||
[[autodoc]] ModernBertForSequenceClassification
|
||||
- forward
|
||||
|
||||
## ModernBertForTokenClassification
|
||||
|
||||
[[autodoc]] ModernBertForTokenClassification
|
||||
- forward
|
||||
|
||||
</pt>
|
||||
</frameworkcontent>
|
docs/source/en/model_doc/moonshine.md (new file, 56 lines)
@@ -0,0 +1,56 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Moonshine
|
||||
|
||||
## Overview
|
||||
|
||||
The Moonshine model was proposed in [Moonshine: Speech Recognition for Live Transcription and Voice Commands](https://arxiv.org/abs/2410.15608) by Nat Jeffries, Evan King, Manjunath Kudlur, Guy Nicholson, James Wang, Pete Warden.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*This paper introduces Moonshine, a family of speech recognition models optimized for live transcription and voice command processing. Moonshine is based on an encoder-decoder transformer architecture and employs Rotary Position Embedding (RoPE) instead of traditional absolute position embeddings. The model is trained on speech segments of various lengths, but without using zero-padding, leading to greater efficiency for the encoder during inference time. When benchmarked against OpenAI's Whisper tiny-en, Moonshine Tiny demonstrates a 5x reduction in compute requirements for transcribing a 10-second speech segment while incurring no increase in word error rates across standard evaluation datasets. These results highlight Moonshine's potential for real-time and resource-constrained applications.*
|
||||
|
||||
Tips:
|
||||
|
||||
- Moonshine improves upon Whisper's architecture:
|
||||
1. It uses SwiGLU activation instead of GELU in the decoder layers
|
||||
2. Most importantly, it replaces absolute position embeddings with Rotary Position Embeddings (RoPE). This allows Moonshine to handle audio inputs of any length, unlike Whisper which is restricted to fixed 30-second windows.
|
||||
|
||||
This model was contributed by [Eustache Le Bihan (eustlb)](https://huggingface.co/eustlb).
|
||||
The original code can be found [here](https://github.com/usefulsensors/moonshine).
|
||||
|
||||
## Resources
|
||||
|
||||
- [Automatic speech recognition task guide](../tasks/asr)
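As a starting point, here is a minimal transcription sketch. The checkpoint name `UsefulSensors/moonshine-tiny` is an assumption (check the organization's page on the Hub for released checkpoints), and the processor call follows the usual Transformers ASR pattern of passing the raw waveform with its sampling rate.

```python
from datasets import load_dataset
from transformers import AutoProcessor, MoonshineForConditionalGeneration

# Assumed checkpoint name; confirm the exact identifier on the Hub.
model_id = "UsefulSensors/moonshine-tiny"
processor = AutoProcessor.from_pretrained(model_id)
model = MoonshineForConditionalGeneration.from_pretrained(model_id)

# Load a short speech sample from a small test dataset.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]

inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=100)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```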
|
||||
|
||||
## MoonshineConfig
|
||||
|
||||
[[autodoc]] MoonshineConfig
|
||||
|
||||
## MoonshineModel
|
||||
|
||||
[[autodoc]] MoonshineModel
|
||||
- forward
|
||||
- _mask_input_features
|
||||
|
||||
## MoonshineForConditionalGeneration
|
||||
|
||||
[[autodoc]] MoonshineForConditionalGeneration
|
||||
- forward
|
||||
- generate
|
||||
|
@ -266,7 +266,6 @@ Tips:
|
||||
## MusicgenMelodyFeatureExtractor
|
||||
|
||||
[[autodoc]] MusicgenMelodyFeatureExtractor
|
||||
- _extract_stem_indices
|
||||
|
||||
## MusicgenMelodyConfig
|
||||
|
||||
|
@ -34,6 +34,37 @@ The abstract from the paper is the following:
|
||||
|
||||
`Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
|
||||
|
||||
### Inference
|
||||
|
||||
```python
|
||||
from io import BytesIO
|
||||
from urllib.request import urlopen
|
||||
import librosa
|
||||
from transformers import AutoProcessor, Qwen2AudioForConditionalGeneration
|
||||
|
||||
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True, device_map="auto")
|
||||
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B", trust_remote_code=True)
|
||||
|
||||
prompt = "<|audio_bos|><|AUDIO|><|audio_eos|>Generate the caption in English:"
|
||||
url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-Audio/glass-breaking-151256.mp3"
|
||||
audio, sr = librosa.load(BytesIO(urlopen(url).read()), sr=processor.feature_extractor.sampling_rate)
|
||||
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
|
||||
|
||||
generate_ids = model.generate(**inputs, max_length=256)
|
||||
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
|
||||
|
||||
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
|
||||
# We can also omit the audio_bos and audio_eos tokens
|
||||
prompt = "<|AUDIO|>Generate the caption in English:"
|
||||
inputs = processor(text=prompt, audios=audio, return_tensors="pt").to(model.device)
|
||||
|
||||
generate_ids = model.generate(**inputs, max_length=256)
|
||||
generate_ids = generate_ids[:, inputs.input_ids.size(1):]
|
||||
|
||||
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
```
|
||||
|
||||
In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for inference, supporting both voice chat and audio analysis modes. Note that we have used the ChatML format for dialog; in this demo we show how to leverage `apply_chat_template` for this purpose.
|
||||
|
||||
### Voice Chat Inference
|
||||
|
docs/source/en/model_doc/textnet.md (new file, 55 lines)
@@ -0,0 +1,55 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# TextNet
|
||||
|
||||
## Overview
|
||||
|
||||
The TextNet model was proposed in [FAST: Faster Arbitrarily-Shaped Text Detector with Minimalist Kernel Representation](https://arxiv.org/abs/2111.02394) by Zhe Chen, Jiahao Wang, Wenhai Wang, Guo Chen, Enze Xie, Ping Luo, Tong Lu. TextNet is a vision backbone useful for text detection tasks. It is the result of neural architecture search (NAS) over backbones, with text detection used as the reward task, so that it provides powerful features for text detection.
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/fast_architecture.png"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> TextNet backbone as part of FAST. Taken from the <a href="https://arxiv.org/abs/2111.02394">original paper.</a> </small>
|
||||
|
||||
This model was contributed by [Raghavan](https://huggingface.co/Raghavan), [jadechoghari](https://huggingface.co/jadechoghari) and [nielsr](https://huggingface.co/nielsr).
|
||||
|
||||
## Usage tips
|
||||
|
||||
TextNet is mainly used as a backbone network for the architecture search of text detection. Each stage of the backbone network is composed of a stride-2 convolution and searchable blocks.
Specifically, we present a layer-level candidate set, defined as {conv3×3, conv1×3, conv3×1, identity}. As the 1×3 and 3×1 convolutions have asymmetric kernels and oriented structure priors, they may help to capture the features of extreme aspect-ratio and rotated text lines.

TextNet is the backbone for FAST, but it can also be used for efficient text/image classification. We add a `TextNetForImageClassification` class, as it allows people to train an image classifier on top of the pre-trained TextNet weights (see the sketch below).
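Here is a minimal sketch of running `TextNetForImageClassification`. The checkpoint name `czczup/textnet-base` is an assumption (check the Hub for the released identifiers), and since the released weights cover the backbone, the classification head is newly initialized, so the model should be fine-tuned before its predictions are meaningful.

```python
import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, TextNetForImageClassification

# Assumed checkpoint name; confirm the exact identifier on the Hub.
checkpoint = "czczup/textnet-base"
processor = AutoImageProcessor.from_pretrained(checkpoint)
# num_labels controls the size of the (newly initialized) classification head.
model = TextNetForImageClassification.from_pretrained(checkpoint, num_labels=2)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (batch_size, num_labels)
```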
|
||||
|
||||
## TextNetConfig
|
||||
|
||||
[[autodoc]] TextNetConfig
|
||||
|
||||
## TextNetImageProcessor
|
||||
|
||||
[[autodoc]] TextNetImageProcessor
|
||||
- preprocess
|
||||
|
||||
## TextNetModel
|
||||
|
||||
[[autodoc]] TextNetModel
|
||||
- forward
|
||||
|
||||
## TextNetForImageClassification
|
||||
|
||||
[[autodoc]] TextNetForImageClassification
|
||||
- forward
|
||||
|
docs/source/en/model_doc/timm_wrapper.md (new file, 67 lines)
@@ -0,0 +1,67 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# TimmWrapper
|
||||
|
||||
## Overview
|
||||
|
||||
A helper class that enables timm models to be loaded and used with the Transformers library and its autoclasses.
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from PIL import Image
|
||||
>>> from urllib.request import urlopen
|
||||
>>> from transformers import AutoModelForImageClassification, AutoImageProcessor
|
||||
|
||||
>>> # Load image
|
||||
>>> image = Image.open(urlopen(
|
||||
... 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
|
||||
... ))
|
||||
|
||||
>>> # Load model and image processor
|
||||
>>> checkpoint = "timm/resnet50.a1_in1k"
|
||||
>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint)
|
||||
>>> model = AutoModelForImageClassification.from_pretrained(checkpoint).eval()
|
||||
|
||||
>>> # Preprocess image
|
||||
>>> inputs = image_processor(image)
|
||||
|
||||
>>> # Forward pass
|
||||
>>> with torch.no_grad():
|
||||
... logits = model(**inputs).logits
|
||||
|
||||
>>> # Get top 5 predictions
|
||||
>>> top5_probabilities, top5_class_indices = torch.topk(logits.softmax(dim=1) * 100, k=5)
|
||||
```
|
||||
|
||||
## TimmWrapperConfig
|
||||
|
||||
[[autodoc]] TimmWrapperConfig
|
||||
|
||||
## TimmWrapperImageProcessor
|
||||
|
||||
[[autodoc]] TimmWrapperImageProcessor
|
||||
- preprocess
|
||||
|
||||
## TimmWrapperModel
|
||||
|
||||
[[autodoc]] TimmWrapperModel
|
||||
- forward
|
||||
|
||||
## TimmWrapperForImageClassification
|
||||
|
||||
[[autodoc]] TimmWrapperForImageClassification
|
||||
- forward
|
@ -174,7 +174,7 @@ model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-L
|
||||
|
||||
### Flash-Attention 2 to speed-up generation
|
||||
|
||||
Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model.
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2:
|
||||
|
||||
|
docs/source/en/model_doc/vitpose.md (new file, 254 lines)
@@ -0,0 +1,254 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# VitPose
|
||||
|
||||
## Overview
|
||||
|
||||
The VitPose model was proposed in [ViTPose: Simple Vision Transformer Baselines for Human Pose Estimation](https://arxiv.org/abs/2204.12484) by Yufei Xu, Jing Zhang, Qiming Zhang, Dacheng Tao. VitPose employs a standard, non-hierarchical [Vision Transformer](https://arxiv.org/pdf/2010.11929v2) as backbone for the task of keypoint estimation. A simple decoder head is added on top to predict the heatmaps from a given image. Despite its simplicity, the model gets state-of-the-art results on the challenging MS COCO Keypoint Detection benchmark.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
*Although no specific domain knowledge is considered in the design, plain vision transformers have shown excellent performance in visual recognition tasks. However, little effort has been made to reveal the potential of such simple structures for pose estimation tasks. In this paper, we show the surprisingly good capabilities of plain vision transformers for pose estimation from various aspects, namely simplicity in model structure, scalability in model size, flexibility in training paradigm, and transferability of knowledge between models, through a simple baseline model called ViTPose. Specifically, ViTPose employs plain and non-hierarchical vision transformers as backbones to extract features for a given person instance and a lightweight decoder for pose estimation. It can be scaled up from 100M to 1B parameters by taking the advantages of the scalable model capacity and high parallelism of transformers, setting a new Pareto front between throughput and performance. Besides, ViTPose is very flexible regarding the attention type, input resolution, pre-training and finetuning strategy, as well as dealing with multiple pose tasks. We also empirically demonstrate that the knowledge of large ViTPose models can be easily transferred to small ones via a simple knowledge token. Experimental results show that our basic ViTPose model outperforms representative methods on the challenging MS COCO Keypoint Detection benchmark, while the largest model sets a new state-of-the-art.*
|
||||
|
||||

|
||||
|
||||
This model was contributed by [nielsr](https://huggingface.co/nielsr) and [sangbumchoi](https://github.com/SangbumChoi).
|
||||
The original code can be found [here](https://github.com/ViTAE-Transformer/ViTPose).
|
||||
|
||||
## Usage Tips
|
||||
|
||||
ViTPose is a so-called top-down keypoint detection model. This means that one first uses an object detector, like [RT-DETR](rt_detr.md), to detect people (or other instances) in an image. Next, ViTPose takes the cropped images as input and predicts the keypoints.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import requests
|
||||
import numpy as np
|
||||
|
||||
from PIL import Image
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
RTDetrForObjectDetection,
|
||||
VitPoseForPoseEstimation,
|
||||
)
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
|
||||
url = "http://images.cocodataset.org/val2017/000000000139.jpg"
|
||||
image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Stage 1. Detect humans on the image
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
# You can choose detector by your choice
|
||||
person_image_processor = AutoProcessor.from_pretrained("PekingU/rtdetr_r50vd_coco_o365")
|
||||
person_model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd_coco_o365", device_map=device)
|
||||
|
||||
inputs = person_image_processor(images=image, return_tensors="pt").to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = person_model(**inputs)
|
||||
|
||||
results = person_image_processor.post_process_object_detection(
|
||||
outputs, target_sizes=torch.tensor([(image.height, image.width)]), threshold=0.3
|
||||
)
|
||||
result = results[0] # take first image results
|
||||
|
||||
# Human label refers 0 index in COCO dataset
|
||||
person_boxes = result["boxes"][result["labels"] == 0]
|
||||
person_boxes = person_boxes.cpu().numpy()
|
||||
|
||||
# Convert boxes from VOC (x1, y1, x2, y2) to COCO (x1, y1, w, h) format
|
||||
person_boxes[:, 2] = person_boxes[:, 2] - person_boxes[:, 0]
|
||||
person_boxes[:, 3] = person_boxes[:, 3] - person_boxes[:, 1]
|
||||
|
||||
# ------------------------------------------------------------------------
|
||||
# Stage 2. Detect keypoints for each person found
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
image_processor = AutoProcessor.from_pretrained("usyd-community/vitpose-base-simple")
|
||||
model = VitPoseForPoseEstimation.from_pretrained("usyd-community/vitpose-base-simple", device_map=device)
|
||||
|
||||
inputs = image_processor(image, boxes=[person_boxes], return_tensors="pt").to(device)
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
|
||||
pose_results = image_processor.post_process_pose_estimation(outputs, boxes=[person_boxes])
|
||||
image_pose_result = pose_results[0] # results for first image
|
||||
```
|
||||
|
||||
|
||||
### Visualization for supervision user
|
||||
```py
|
||||
import supervision as sv
|
||||
|
||||
xy = torch.stack([pose_result['keypoints'] for pose_result in image_pose_result]).cpu().numpy()
|
||||
scores = torch.stack([pose_result['scores'] for pose_result in image_pose_result]).cpu().numpy()
|
||||
|
||||
key_points = sv.KeyPoints(
|
||||
xy=xy, confidence=scores
|
||||
)
|
||||
|
||||
edge_annotator = sv.EdgeAnnotator(
|
||||
color=sv.Color.GREEN,
|
||||
thickness=1
|
||||
)
|
||||
vertex_annotator = sv.VertexAnnotator(
|
||||
color=sv.Color.RED,
|
||||
radius=2
|
||||
)
|
||||
annotated_frame = edge_annotator.annotate(
|
||||
scene=image.copy(),
|
||||
key_points=key_points
|
||||
)
|
||||
annotated_frame = vertex_annotator.annotate(
|
||||
scene=annotated_frame,
|
||||
key_points=key_points
|
||||
)
|
||||
```
|
||||
|
||||
### Visualization for advanced user
|
||||
```py
|
||||
import math
|
||||
import cv2
|
||||
|
||||
def draw_points(image, keypoints, scores, pose_keypoint_color, keypoint_score_threshold, radius, show_keypoint_weight):
|
||||
if pose_keypoint_color is not None:
|
||||
assert len(pose_keypoint_color) == len(keypoints)
|
||||
for kid, (kpt, kpt_score) in enumerate(zip(keypoints, scores)):
|
||||
x_coord, y_coord = int(kpt[0]), int(kpt[1])
|
||||
if kpt_score > keypoint_score_threshold:
|
||||
color = tuple(int(c) for c in pose_keypoint_color[kid])
|
||||
if show_keypoint_weight:
|
||||
cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
|
||||
transparency = max(0, min(1, kpt_score))
|
||||
cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
|
||||
else:
|
||||
cv2.circle(image, (int(x_coord), int(y_coord)), radius, color, -1)
|
||||
|
||||
def draw_links(image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold, thickness, show_keypoint_weight, stick_width = 2):
|
||||
height, width, _ = image.shape
|
||||
if keypoint_edges is not None and link_colors is not None:
|
||||
assert len(link_colors) == len(keypoint_edges)
|
||||
for sk_id, sk in enumerate(keypoint_edges):
|
||||
x1, y1, score1 = (int(keypoints[sk[0], 0]), int(keypoints[sk[0], 1]), scores[sk[0]])
|
||||
x2, y2, score2 = (int(keypoints[sk[1], 0]), int(keypoints[sk[1], 1]), scores[sk[1]])
|
||||
if (
|
||||
x1 > 0
|
||||
and x1 < width
|
||||
and y1 > 0
|
||||
and y1 < height
|
||||
and x2 > 0
|
||||
and x2 < width
|
||||
and y2 > 0
|
||||
and y2 < height
|
||||
and score1 > keypoint_score_threshold
|
||||
and score2 > keypoint_score_threshold
|
||||
):
|
||||
color = tuple(int(c) for c in link_colors[sk_id])
|
||||
if show_keypoint_weight:
|
||||
X = (x1, x2)
|
||||
Y = (y1, y2)
|
||||
mean_x = np.mean(X)
|
||||
mean_y = np.mean(Y)
|
||||
length = ((Y[0] - Y[1]) ** 2 + (X[0] - X[1]) ** 2) ** 0.5
|
||||
angle = math.degrees(math.atan2(Y[0] - Y[1], X[0] - X[1]))
|
||||
polygon = cv2.ellipse2Poly(
|
||||
(int(mean_x), int(mean_y)), (int(length / 2), int(stick_width)), int(angle), 0, 360, 1
|
||||
)
|
||||
cv2.fillConvexPoly(image, polygon, color)
|
||||
transparency = max(0, min(1, 0.5 * (keypoints[sk[0], 2] + keypoints[sk[1], 2])))
|
||||
cv2.addWeighted(image, transparency, image, 1 - transparency, 0, dst=image)
|
||||
else:
|
||||
cv2.line(image, (x1, y1), (x2, y2), color, thickness=thickness)
|
||||
|
||||
|
||||
# Note: keypoint_edges and color palette are dataset-specific
|
||||
keypoint_edges = model.config.edges
|
||||
|
||||
palette = np.array(
|
||||
[
|
||||
[255, 128, 0],
|
||||
[255, 153, 51],
|
||||
[255, 178, 102],
|
||||
[230, 230, 0],
|
||||
[255, 153, 255],
|
||||
[153, 204, 255],
|
||||
[255, 102, 255],
|
||||
[255, 51, 255],
|
||||
[102, 178, 255],
|
||||
[51, 153, 255],
|
||||
[255, 153, 153],
|
||||
[255, 102, 102],
|
||||
[255, 51, 51],
|
||||
[153, 255, 153],
|
||||
[102, 255, 102],
|
||||
[51, 255, 51],
|
||||
[0, 255, 0],
|
||||
[0, 0, 255],
|
||||
[255, 0, 0],
|
||||
[255, 255, 255],
|
||||
]
|
||||
)
|
||||
|
||||
link_colors = palette[[0, 0, 0, 0, 7, 7, 7, 9, 9, 9, 9, 9, 16, 16, 16, 16, 16, 16, 16]]
|
||||
keypoint_colors = palette[[16, 16, 16, 16, 16, 9, 9, 9, 9, 9, 9, 0, 0, 0, 0, 0, 0]]
|
||||
|
||||
numpy_image = np.array(image)
|
||||
|
||||
for pose_result in image_pose_result:
|
||||
scores = np.array(pose_result["scores"])
|
||||
keypoints = np.array(pose_result["keypoints"])
|
||||
|
||||
# draw each point on image
|
||||
draw_points(numpy_image, keypoints, scores, keypoint_colors, keypoint_score_threshold=0.3, radius=4, show_keypoint_weight=False)
|
||||
|
||||
# draw links
|
||||
draw_links(numpy_image, keypoints, scores, keypoint_edges, link_colors, keypoint_score_threshold=0.3, thickness=1, show_keypoint_weight=False)
|
||||
|
||||
pose_image = Image.fromarray(numpy_image)
|
||||
pose_image
|
||||
```
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vitpose-coco.jpg" alt="drawing" width="600"/>
|
||||
|
||||
### MoE backbone
|
||||
|
||||
To enable the MoE (Mixture of Experts) function in the backbone, the user has to provide an appropriate configuration, such as `num_experts`, and pass the input value `dataset_index` to the backbone model. It is not enabled with the default parameters. Below is a code snippet showing how to use the MoE function.
|
||||
|
||||
```py
|
||||
>>> from transformers import VitPoseBackboneConfig, VitPoseBackbone
|
||||
>>> import torch
|
||||
|
||||
>>> config = VitPoseBackboneConfig(num_experts=3, out_indices=[-1])
|
||||
>>> model = VitPoseBackbone(config)
|
||||
|
||||
>>> pixel_values = torch.randn(3, 3, 256, 192)
|
||||
>>> dataset_index = torch.tensor([1, 2, 3])
|
||||
>>> outputs = model(pixel_values, dataset_index)
|
||||
```
|
||||
|
||||
## VitPoseImageProcessor
|
||||
|
||||
[[autodoc]] VitPoseImageProcessor
|
||||
- preprocess
|
||||
|
||||
## VitPoseConfig
|
||||
|
||||
[[autodoc]] VitPoseConfig
|
||||
|
||||
## VitPoseForPoseEstimation
|
||||
|
||||
[[autodoc]] VitPoseForPoseEstimation
|
||||
- forward
|
@ -22,6 +22,9 @@ etc. Model contribution PRs rarely add less than 3-5k lines of code, with much o
|
||||
This raises the bar for contributions, and with Modular Transformers, we're aiming to lower the bar to a much more
|
||||
acceptable point.
|
||||
|
||||
If you plan to add a model to `transformers` make sure you read [How to add a model to 🤗 Transformers?](https://huggingface.co/docs/transformers/add_new_model).
|
||||
For any kind of contributions, see [CONTRIBUTING.md](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md).
|
||||
|
||||
## What is it?
|
||||
|
||||
Modular Transformers introduces the concept of a "modular" file to a model folder. This modular file accepts code
|
||||
@ -43,6 +46,12 @@ be moved to the new Modular Transformers format in the coming months.
|
||||
|
||||
### Details
|
||||
|
||||
To generate a single file from the modular file, run the following command.
|
||||
|
||||
```bash
|
||||
python utils/modular_model_converter.py --files-to-parse src/transformers/models/<your_model>/modular_<your_model>.py
|
||||
```
|
||||
|
||||
The "linter", which unravels the inheritance and creates all single-files from the modular file, will flatten the
|
||||
inheritance while trying to be invisible to Python users. At this time, the linter flattens a **single** level of
|
||||
inheritance.
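To make the flattening concrete, here is a hypothetical minimal modular file; the model name and the imported base class are illustrative only, not an existing model in the library.

```python
# modular_mymodel.py -- hypothetical example; "mymodel" and the Llama base class are illustrative.
from transformers.models.llama.modeling_llama import LlamaModel


class MyModelModel(LlamaModel):
    # The linter copies the full LlamaModel implementation into the generated
    # modeling_mymodel.py, flattening this single level of inheritance.
    pass
```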
|
||||
@ -59,7 +68,11 @@ file, and the corresponding files will be created for you.
|
||||
|
||||
### Enforcement
|
||||
|
||||
[TODO] We are introducing a new test, that makes sure the generated content matches what is present in the `modular_xxxx.py`
|
||||
Run the command below to ensure the generated content matches `modular_<your_model>.py`
|
||||
|
||||
```bash
|
||||
python utils/check_modular_conversion.py --files src/transformers/models/<your_model>/modular_<your_model>.py
|
||||
```
|
||||
|
||||
### Examples
|
||||
|
||||
@ -194,4 +207,4 @@ We now also support special cases like
|
||||
class GemmaVisionModel(CLIPModel):
|
||||
pass
|
||||
```
|
||||
where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models.
|
||||
where the name of your class `GemmaVision` is not the same as the modular `Gemma`. This is super useful for composite models.
|
||||
|
@ -64,5 +64,5 @@ You can benefit from considerable speedups for inference, especially for inputs
|
||||
For a single forward pass on [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel) with a sequence length of 512 and various batch sizes, the expected speedup is as follows:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct, seqlen = 512, python, w_ compile.png">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
|
||||
</div>
|
||||
|
@ -37,14 +37,19 @@ FlashAttention-2 is experimental and may change considerably in future versions.
|
||||
2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them
|
||||
|
||||
FlashAttention-2 is currently supported for the following architectures:
|
||||
* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration)
|
||||
* [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel)
|
||||
* [Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel)
|
||||
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
|
||||
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
|
||||
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
|
||||
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
|
||||
* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model)
|
||||
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel)
|
||||
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
|
||||
* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel)
|
||||
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
|
||||
* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3)
|
||||
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
|
||||
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
|
||||
* [GPT2](https://huggingface.co/docs/transformers/model_doc/gpt2)
|
||||
@ -64,6 +69,7 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
|
||||
* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
|
||||
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
|
||||
* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel)
|
||||
* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
|
||||
* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
|
||||
* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
|
||||
@ -71,6 +77,7 @@ FlashAttention-2 is currently supported for the following architectures:
|
||||
* [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
|
||||
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
|
||||
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
|
||||
* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert)
|
||||
* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
|
||||
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
|
||||
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
|
||||
@ -216,8 +223,11 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o
|
||||
|
||||
For now, Transformers supports SDPA inference and training for the following architectures:
|
||||
* [Albert](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertModel)
|
||||
* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration)
|
||||
* [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel)
|
||||
* [Bamba](https://huggingface.co/docs/transformers/model_doc/bamba#transformers.BambaModel)
|
||||
* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
|
||||
* [Beit](https://huggingface.co/docs/transformers/model_doc/beit#transformers.BeitModel)
|
||||
* [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel)
|
||||
* [BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt#transformers.BioGptModel)
|
||||
* [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel)
|
||||
@ -225,13 +235,18 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
|
||||
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel)
|
||||
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
|
||||
* [Cohere2](https://huggingface.co/docs/transformers/model_doc/cohere2#transformers.Cohere2Model)
|
||||
* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
|
||||
* [data2vec_vision](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecVisionModel)
|
||||
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
|
||||
* [DeiT](https://huggingface.co/docs/transformers/model_doc/deit#transformers.DeiTModel)
|
||||
* [DiffLlama](https://huggingface.co/docs/transformers/model_doc/diffllama#transformers.DiffLlamaModel)
|
||||
* [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
|
||||
* [Dinov2_with_registers](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
|
||||
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
|
||||
* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
|
||||
* [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel)
|
||||
* [Emu3](https://huggingface.co/docs/transformers/model_doc/emu3)
|
||||
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
|
||||
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
|
||||
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
|
||||
@ -253,10 +268,12 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
|
||||
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
|
||||
* [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model)
|
||||
* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel)
|
||||
* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
|
||||
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
|
||||
* [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration)
|
||||
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
|
||||
* [ModernBert](https://huggingface.co/docs/transformers/model_doc/modernbert#transformers.ModernBert)
|
||||
* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
|
||||
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
|
||||
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
|
||||
@ -270,8 +287,8 @@ For now, Transformers supports SDPA inference and training for the following arc
|
||||
* [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
|
||||
* [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel)
|
||||
* [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
|
||||
* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
|
||||
* [mBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
|
||||
* [Moonshine](https://huggingface.co/docs/transformers/model_doc/moonshine#transformers.MoonshineModel)
|
||||
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
|
||||
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
|
||||
* [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel)
|
||||
@ -321,10 +338,11 @@ In that case, you should see a warning message and we will fall back to the (slo
|
||||
|
||||
</Tip>
|
||||
|
||||
By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
|
||||
By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with [`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) as a context manager:
|
||||
|
||||
```diff
|
||||
import torch
|
||||
+ from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
@ -333,7 +351,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=to
|
||||
input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
|
||||
outputs = model.generate(**inputs)
|
||||
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
@ -451,7 +469,7 @@ generated_ids = model.generate(**inputs)
|
||||
outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
|
||||
To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
|
||||
|
||||
```py
|
||||
max_memory_mapping = {0: "1GB", 1: "2GB"}
|
||||
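# a hedged sketch of how the mapping is typically passed to from_pretrained
# (assumes an 8-bit bitsandbytes load; the model name simply reuses the one from the examples above)
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_8bit = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
    max_memory=max_memory_mapping,
)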
@ -507,6 +525,7 @@ It is often possible to combine several of the optimization techniques described
|
||||
|
||||
```py
|
||||
import torch
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
|
||||
|
||||
# load model in 4-bit
|
||||
@ -525,7 +544,7 @@ input_text = "Hello my dog is cute and"
|
||||
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
|
||||
|
||||
# enable FlashAttention
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
|
||||
outputs = model.generate(**inputs)
|
||||
|
||||
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
||||
|
66
docs/source/en/quantization/higgs.md
Normal file
@ -0,0 +1,66 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# HIGGS
|
||||
|
||||
HIGGS is a 0-shot quantization algorithm that combines Hadamard preprocessing with MSE-Optimal quantization grids to achieve lower quantization error and SOTA performance. You can find more information in the paper [arxiv.org/abs/2411.17525](https://arxiv.org/abs/2411.17525).
|
||||
|
||||
Runtime support for HIGGS is implemented through [FLUTE](https://arxiv.org/abs/2407.10960), and its [library](https://github.com/HanGuo97/flute).
|
||||
|
||||
## Quantization Example
|
||||
|
||||
```python
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer, HiggsConfig
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"google/gemma-2-9b-it",
|
||||
quantization_config=HiggsConfig(bits=4),
|
||||
device_map="auto",
|
||||
)
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b-it")
|
||||
|
||||
tokenizer.decode(model.generate(
|
||||
**tokenizer("Hi,", return_tensors="pt").to(model.device),
|
||||
temperature=0.5,
|
||||
top_p=0.80,
|
||||
)[0])
|
||||
```
|
||||
|
||||
## Pre-quantized models
|
||||
|
||||
Some pre-quantized models can be found in the [official collection](https://huggingface.co/collections/ISTA-DASLab/higgs-675308e432fd56b7f6dab94e) on Hugging Face Hub.
|
||||
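As a minimal, hedged sketch, loading one of those checkpoints works like any other `from_pretrained` call (the repository name below is a placeholder; pick an actual checkpoint from the collection):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# placeholder repo id: substitute a real HIGGS checkpoint from the collection above
model = AutoModelForCausalLM.from_pretrained(
    "ISTA-DASLab/your-higgs-quantized-model",
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("ISTA-DASLab/your-higgs-quantized-model")
```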
|
||||
## Current Limitations
|
||||
|
||||
**Architectures**
|
||||
|
||||
Currently, FLUTE, and HIGGS by extension, **only support Llama 3.1 and 3.0 models with 8B, 70B and 405B parameters, as well as Gemma-2 9B and 27B**. We're working on supporting more diverse models, as well as arbitrary models, by modifying the FLUTE compilation procedure.
|
||||
|
||||
**torch.compile**
|
||||
|
||||
HIGGS is fully compatible with `torch.compile`. When `model.forward` is compiled, as described [here](../perf_torch_compile.md), these are the speedups it provides on an RTX 4090 for `Llama-3.1-8B-Instruct` (forward passes/sec):
|
||||
|
||||
| Batch Size | BF16 (With `torch.compile`) | HIGGS 4bit (No `torch.compile`) | HIGGS 4bit (With `torch.compile`) |
|
||||
|------------|-----------------------------|----------------------------------|-----------------------------------|
|
||||
| 1 | 59 | 41 | 124 |
|
||||
| 4 | 57 | 42 | 123 |
|
||||
| 16 | 56 | 41 | 120 |
|
||||
|
||||
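For reference, a minimal sketch of enabling compilation for the model quantized above (plain `torch.compile` usage; no HIGGS-specific arguments are assumed):

```python
import torch

# compile only the forward pass; model.generate() then reuses the compiled graph
model.forward = torch.compile(model.forward)
```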
|
||||
**Quantized training**
|
||||
|
||||
Currently, HIGGS doesn't support quantized training (and backward passes in general). We're working on adding support for it.
|
@ -54,10 +54,12 @@ Use the table below to help you decide which quantization method to use.
|
||||
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
|
||||
| GGUF / GGML (llama.cpp) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1 - 8 | 🔴 | [See GGUF section](../gguf) | [See GGUF section](../gguf) | https://github.com/ggerganov/llama.cpp |
|
||||
| [GPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2 - 3 - 4 - 8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
|
||||
| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2 - 4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
|
||||
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
|
||||
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2 / 4 / 8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
|
||||
| [FBGEMM_FP8](./fbgemm_fp8.md) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
|
||||
| [torchao](./torchao.md) | 🟢 | | 🟢 | 🔴 | partial support (int4 weight only) | 🔴 | | 4 / 8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
|
||||
| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1 - 8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
|
||||
|
||||
<Tip>
|
||||
|
||||
@ -71,4 +73,4 @@ We value your feedback to help identify bugs before the full release! Check out
|
||||
|
||||
\** bitsandbytes is seeking contributors to help develop and lead the Apple Silicon backend. Interested? Contact them directly via their repo. Stipends may be available through sponsorships.
|
||||
|
||||
</Tip>
|
||||
</Tip>
|
111
docs/source/en/quantization/vptq.md
Normal file
@ -0,0 +1,111 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# VPTQ
|
||||
|
||||
> [!TIP]
|
||||
> Try VPTQ on [Hugging Face](https://huggingface.co/spaces/microsoft/VPTQ)!
|
||||
> Try VPTQ on [Google Colab](https://colab.research.google.com/github/microsoft/VPTQ/blob/main/notebooks/vptq_example.ipynb)!
|
||||
> Learn more about VPTQ on [arXiv](https://arxiv.org/pdf/2409.17066)!
|
||||
|
||||
Vector Post-Training Quantization ([VPTQ](https://github.com/microsoft/VPTQ)) is a novel Post-Training Quantization method that leverages Vector Quantization to achieve high accuracy on LLMs at an extremely low bit-width (<2-bit). VPTQ can compress a 70B, or even a 405B, model to 1-2 bits without retraining while maintaining high accuracy.
|
||||
|
||||
- Better accuracy at 1-2 bits (405B @ <2-bit, 70B @ 2-bit)
|
||||
- Lightweight quantization algorithm: it only costs ~17 hours to quantize the 405B Llama-3.1
|
||||
- Agile quantized inference: low decoding overhead, high throughput, and low time-to-first-token (TTFT)
|
||||
|
||||
Inference support for VPTQ is released in the `vptq` library. Make sure to install it to run the models:
|
||||
```bash
|
||||
pip install vptq
|
||||
```
|
||||
|
||||
The library provides efficient kernels for NVIDIA/AMD GPU inference.
|
||||
|
||||
To run VPTQ models, simply load a model that has been quantized with VPTQ:
|
||||
|
||||
## Inference example
|
||||
**Run Llama 3.1 70B on an RTX 4090 (24GB @ ~2 bits) in real time**
|
||||

|
||||
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
quantized_model = AutoModelForCausalLM.from_pretrained(
|
||||
"VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft",
|
||||
torch_dtype="auto",
|
||||
device_map="auto"
|
||||
)
|
||||
tokenizer = AutoTokenizer.from_pretrained("VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft")
|
||||
input_ids = tokenizer("hello, it's me", return_tensors="pt").to("cuda")
|
||||
out = quantized_model.generate(**input_ids, max_new_tokens=32, do_sample=False)
|
||||
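# decode and print the completion (added for illustration)
print(tokenizer.decode(out[0], skip_special_tokens=True))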
```
|
||||
|
||||
## Quantize your own model
|
||||
The VPTQ algorithm is early-released at [VPTQ](https://github.com/microsoft/VPTQ/tree/algorithm);
|
||||
check out the [tutorial](https://github.com/microsoft/VPTQ/blob/algorithm/algorithm.md).
|
||||
|
||||
## Early Results from Tech Report
|
||||
VPTQ achieves better accuracy and higher throughput with lower quantization overhead across models of different sizes. The following experimental results are for reference only; VPTQ can achieve better outcomes under reasonable parameters, especially in terms of model accuracy and inference speed.
|
||||
|
||||
|
||||
| Model | bitwidth | W2↓ | C4↓ | AvgQA↑ | tok/s↑ | mem(GB) | cost/h↓ |
|
||||
| ----------- | -------- | ---- | ---- | ------ | ------ | ------- | ------- |
|
||||
| LLaMA-2 7B | 2.02 | 6.13 | 8.07 | 58.2 | 39.9 | 2.28 | 2 |
|
||||
| | 2.26 | 5.95 | 7.87 | 59.4 | 35.7 | 2.48 | 3.1 |
|
||||
| LLaMA-2 13B | 2.02 | 5.32 | 7.15 | 62.4 | 26.9 | 4.03 | 3.2 |
|
||||
| | 2.18 | 5.28 | 7.04 | 63.1 | 18.5 | 4.31 | 3.6 |
|
||||
| LLaMA-2 70B | 2.07 | 3.93 | 5.72 | 68.6 | 9.7 | 19.54 | 19 |
|
||||
| | 2.11 | 3.92 | 5.71 | 68.7 | 9.7 | 20.01 | 19 |
|
||||
|
||||
|
||||
|
||||
## More Models in [VPTQ-community](https://huggingface.co/VPTQ-community)
|
||||
|
||||
⚠️ The repository only provides the model quantization algorithm.
|
||||
|
||||
⚠️ The open-source community VPTQ-community provides models based on the technical report and quantization algorithm.
|
||||
|
||||
|
||||
|
||||
**Quick Estimation of Model Bitwidth (Excluding Codebook Overhead)**:
|
||||
|
||||
- **Model Naming Convention**: The model's name includes the **vector length** $v$, **codebook (lookup table) size**, and **residual codebook size**. For example, "Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft" is the "Meta-Llama-3.1-70B-Instruct" model quantized with:
|
||||
- **Vector Length**: 8
|
||||
- **Number of Centroids**: 65536 (2^16)
|
||||
- **Number of Residual Centroids**: 256 (2^8)
|
||||
- **Equivalent Bitwidth Calculation**:
|
||||
- **Index**: log2(65536) = 16 bits, divided by the vector length 8 → 2 bits per weight
|
||||
- **Residual Index**: log2(256) = 8 bits, divided by the vector length 8 → 1 bit per weight
|
||||
- **Total Bitwidth**: 2 + 1 = 3 bits
|
||||
- **Model Size Estimation**: 70B * 3 bits / 8 bits per Byte = 26.25 GB (see the short sketch below)
|
||||
|
||||
- **Note**: This estimate does not include the size of the codebook (lookup table), other parameter overheads, and the padding overhead for storing indices. For the detailed calculation method, please refer to **Tech Report Appendix C.2**.
|
||||
|
||||
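The same estimate, written out as a small hedged calculation for the example name above:

```python
import math

vector_length = 8         # v8
centroids = 65536         # k65536 (main codebook)
res_centroids = 256       # 256 (residual codebook)

index_bits = math.log2(centroids) / vector_length          # 16 / 8 = 2.0 bits per weight
res_index_bits = math.log2(res_centroids) / vector_length  # 8 / 8 = 1.0 bit per weight
bits_per_weight = index_bits + res_index_bits               # 3.0 bits per weight

model_size_gb = 70e9 * bits_per_weight / 8 / 1e9             # ≈ 26.25 GB, codebooks excluded
print(bits_per_weight, model_size_gb)
```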
|
||||
| Model Series | Collections | (Estimated) Bit per weight |
|
||||
| :--------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Llama 3.1 Nemotron 70B Instruct HF | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-nemotron-70b-instruct-hf-without-finetune-671730b96f16208d0b3fe942) | [4 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-16384-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Llama-3.1-Nemotron-70B-Instruct-HF-v16-k65536-256-woft) |
|
||||
| Llama 3.1 8B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-8b-instruct-without-finetune-66f2b70b1d002ceedef02d2e) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-65536-woft) [3.5 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-4096-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v8-k65536-256-woft) [2.3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-8B-Instruct-v12-k65536-4096-woft) |
|
||||
| Llama 3.1 70B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-70b-instruct-without-finetune-66f2bf454d3dd78dfee2ff11) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-256-woft) [2.25 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k65536-0-woft) [1.93 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v16-k65536-32768-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k32768-0-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-70B-Instruct-v8-k16384-0-woft) |
|
||||
| Llama 3.1 405B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-llama-31-405b-instruct-without-finetune-66f4413f9ba55e1a9e52cfb0) | [4 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k65536-256-woft) [2 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-65536-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k32768-32768-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-1024-woft) [1.5 bits (1)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v8-k4096-0-woft) [1.5 bits (2)](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-256-woft) [1.43 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-128-woft) [1.375 bits](https://huggingface.co/VPTQ-community/Meta-Llama-3.1-405B-Instruct-v16-k65536-64-woft) |
|
||||
| Mistral Large Instruct 2407 (123B) | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-mistral-large-instruct-2407-without-finetune-6711ebfb7faf85eed9cceb16) | [4 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v8-k65536-0-woft) [1.875 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-16384-woft) [1.75 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-4096-woft) [1.625 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-1024-woft) [1.5 bits](https://huggingface.co/VPTQ-community/Mistral-Large-Instruct-2407-v16-k65536-256-woft) |
|
||||
| Qwen 2.5 7B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-7b-instruct-without-finetune-66f3e9866d3167cc05ce954a) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-7B-Instruct-v16-k65536-65536-woft) |
|
||||
| Qwen 2.5 14B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-14b-instruct-without-finetune-66f827f83c7ffa7931b8376c) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k256-256-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-14B-Instruct-v16-k65536-65536-woft) |
|
||||
| Qwen 2.5 32B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-32b-instruct-without-finetune-66fe77173bf7d64139f0f613) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-256-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v16-k65536-65536-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k65536-0-woft) [2 bits (3)](https://huggingface.co/VPTQ-community/Qwen2.5-32B-Instruct-v8-k256-256-woft) |
|
||||
| Qwen 2.5 72B Instruct | [HF 🤗](https://huggingface.co/collections/VPTQ-community/vptq-qwen-25-72b-instruct-without-finetune-66f3bf1b3757dfa1ecb481c0) | [4 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-65536-woft) [3 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-256-woft) [2.38 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k1024-512-woft) [2.25 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k512-512-woft) [2.25 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-4-woft) [2 bits (1)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v8-k65536-0-woft) [2 bits (2)](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-65536-woft) [1.94 bits](https://huggingface.co/VPTQ-community/Qwen2.5-72B-Instruct-v16-k65536-32768-woft) |
|
||||
| Reproduced from the tech report | [HF 🤗](https://huggingface.co/collections/VPTQ-community/reproduced-vptq-tech-report-baseline-66fbf1dffe741cc9e93ecf04) | Results from the open source community for reference only, please use them responsibly. |
|
||||
| Hessian and Inverse Hessian Matrix | [HF 🤗](https://huggingface.co/collections/VPTQ-community/hessian-and-invhessian-checkpoints-66fd249a104850d17b23fd8b) | Collected from RedPajama-Data-1T-Sample, following [Quip#](https://github.com/Cornell-RelaxML/quip-sharp/blob/main/quantize_llama/hessian_offline_llama.py)
|
@ -20,12 +20,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
<Youtube id="TksaY_FDgnk"/>
|
||||
|
||||
Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users everyday, and there are many other useful user-facing applications like live captioning and note-taking during meetings.
|
||||
Automatic speech recognition (ASR) converts a speech signal to text, mapping a sequence of audio inputs to text outputs. Virtual assistants like Siri and Alexa use ASR models to help users every day, and there are many other useful user-facing applications like live captioning and note-taking during meetings.
|
||||
|
||||
This guide will show you how to:
|
||||
|
||||
1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text.
|
||||
2. Use your finetuned model for inference.
|
||||
1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to transcribe audio to text.
|
||||
2. Use your fine-tuned model for inference.
|
||||
|
||||
<Tip>
|
||||
|
||||
@ -49,7 +49,7 @@ We encourage you to login to your Hugging Face account so you can upload and sha
|
||||
|
||||
## Load MInDS-14 dataset
|
||||
|
||||
Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
|
||||
Start by loading a smaller subset of the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset from the 🤗 Datasets library. This will give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset, Audio
|
||||
@ -79,13 +79,13 @@ DatasetDict({
|
||||
})
|
||||
```
|
||||
|
||||
While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `transcription` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
|
||||
While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, this guide focuses on the `audio` and `transcription`. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
|
||||
|
||||
```py
|
||||
>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])
|
||||
```
|
||||
|
||||
Take a look at the example again:
|
||||
Review the example again:
|
||||
|
||||
```py
|
||||
>>> minds["train"][0]
|
||||
@ -112,7 +112,7 @@ The next step is to load a Wav2Vec2 processor to process the audio signal:
|
||||
>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
|
||||
```
|
||||
|
||||
The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:
|
||||
The MInDS-14 dataset has a sampling rate of 8000Hz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000Hz to use the pretrained Wav2Vec2 model:
|
||||
|
||||
```py
|
||||
>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
|
||||
@ -125,7 +125,7 @@ The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this informati
|
||||
'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
|
||||
```
|
||||
|
||||
As you can see in the `transcription` above, the text contains a mix of upper and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary:
|
||||
As you can see in the `transcription` above, the text contains a mix of uppercase and lowercase characters. The Wav2Vec2 tokenizer is only trained on uppercase characters so you'll need to make sure the text matches the tokenizer's vocabulary:
|
||||
|
||||
```py
|
||||
>>> def uppercase(example):
|
||||
@ -196,7 +196,7 @@ Now instantiate your `DataCollatorForCTCWithPadding`:
|
||||
|
||||
## Evaluate
|
||||
|
||||
Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
|
||||
Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [word error rate](https://huggingface.co/spaces/evaluate-metric/wer) (WER) metric (refer to the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about loading and computing metrics):
|
||||
|
||||
```py
|
||||
>>> import evaluate
|
||||
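>>> wer = evaluate.load("wer")

>>> # a minimal compute_metrics sketch (assumes the `processor` created earlier in this guide)
>>> import numpy as np

>>> def compute_metrics(pred):
...     pred_ids = np.argmax(pred.predictions, axis=-1)
...     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
...     pred_str = processor.batch_decode(pred_ids)
...     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
...     return {"wer": wer.compute(predictions=pred_str, references=label_str)}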
@ -236,7 +236,7 @@ If you aren't familiar with finetuning a model with the [`Trainer`], take a look
|
||||
|
||||
</Tip>
|
||||
|
||||
You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation:
|
||||
You are now ready to start training your model! Load Wav2Vec2 with [`AutoModelForCTC`]. Specify the reduction to apply with the `ctc_loss_reduction` parameter. It is often better to use the average instead of the default summation:
|
||||
|
||||
```py
|
||||
>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer
|
||||
@ -252,7 +252,7 @@ At this point, only three steps remain:
|
||||
|
||||
1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the WER and save the training checkpoint.
|
||||
2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
|
||||
3. Call [`~Trainer.train`] to finetune your model.
|
||||
3. Call [`~Trainer.train`] to fine-tune your model.
|
||||
|
||||
```py
|
||||
>>> training_args = TrainingArguments(
|
||||
@ -289,7 +289,7 @@ At this point, only three steps remain:
|
||||
>>> trainer.train()
|
||||
```
|
||||
|
||||
Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model:
|
||||
Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so it can be accessible to everyone:
|
||||
|
||||
```py
|
||||
>>> trainer.push_to_hub()
|
||||
@ -299,13 +299,13 @@ Once training is completed, share your model to the Hub with the [`~transformers
|
||||
|
||||
<Tip>
|
||||
|
||||
For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR.
|
||||
For a more in-depth example of how to fine-tune a model for automatic speech recognition, take a look at this blog [post](https://huggingface.co/blog/fine-tune-wav2vec2-english) for English ASR and this [post](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) for multilingual ASR.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Inference
|
||||
|
||||
Great, now that you've finetuned a model, you can use it for inference!
|
||||
Great, now that you've fine-tuned a model, you can use it for inference!
|
||||
|
||||
Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to!
|
||||
|
||||
@ -318,7 +318,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp
|
||||
>>> audio_file = dataset[0]["audio"]["path"]
|
||||
```
|
||||
|
||||
The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it:
|
||||
The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for automatic speech recognition with your model, and pass your audio file to it:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
@ -20,12 +20,12 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
<Youtube id="KWwzcmG98Ds"/>
|
||||
|
||||
Audio classification - just like with text - assigns a class label output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds.
|
||||
Audio classification - just like with text - assigns a class label as output from the input data. The only difference is instead of text inputs, you have raw audio waveforms. Some practical applications of audio classification include identifying speaker intent, language classification, and even animal species by their sounds.
|
||||
|
||||
This guide will show you how to:
|
||||
|
||||
1. Finetune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent.
|
||||
2. Use your finetuned model for inference.
|
||||
1. Fine-tune [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) on the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset to classify speaker intent.
|
||||
2. Use your fine-tuned model for inference.
|
||||
|
||||
<Tip>
|
||||
|
||||
@ -57,7 +57,7 @@ Start by loading the MInDS-14 dataset from the 🤗 Datasets library:
|
||||
>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
|
||||
```
|
||||
|
||||
Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This'll give you a chance to experiment and make sure everything works before spending more time on the full dataset.
|
||||
Split the dataset's `train` split into a smaller train and test set with the [`~datasets.Dataset.train_test_split`] method. This will give you a chance to experiment and make sure everything works before spending more time on the full dataset.
|
||||
|
||||
```py
|
||||
>>> minds = minds.train_test_split(test_size=0.2)
|
||||
@ -79,13 +79,13 @@ DatasetDict({
|
||||
})
|
||||
```
|
||||
|
||||
While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you'll focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
|
||||
While the dataset contains a lot of useful information, like `lang_id` and `english_transcription`, you will focus on the `audio` and `intent_class` in this guide. Remove the other columns with the [`~datasets.Dataset.remove_columns`] method:
|
||||
|
||||
```py
|
||||
>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
|
||||
```
|
||||
|
||||
Take a look at an example now:
|
||||
Here's an example:
|
||||
|
||||
```py
|
||||
>>> minds["train"][0]
|
||||
@ -128,7 +128,7 @@ The next step is to load a Wav2Vec2 feature extractor to process the audio signa
|
||||
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
|
||||
```
|
||||
|
||||
The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in it's [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model:
|
||||
The MInDS-14 dataset has a sampling rate of 8kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16kHz to use the pretrained Wav2Vec2 model:
|
||||
|
||||
```py
|
||||
>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
|
||||
@ -155,7 +155,7 @@ Now create a preprocessing function that:
|
||||
... return inputs
|
||||
```
|
||||
|
||||
To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove the columns you don't need, and rename `intent_class` to `label` because that's the name the model expects:
|
||||
To apply the preprocessing function over the entire dataset, use 🤗 Datasets [`~datasets.Dataset.map`] function. You can speed up `map` by setting `batched=True` to process multiple elements of the dataset at once. Remove unnecessary columns and rename `intent_class` to `label`, as required by the model:
|
||||
|
||||
```py
|
||||
>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
|
||||
@ -208,9 +208,9 @@ You're ready to start training your model now! Load Wav2Vec2 with [`AutoModelFor
|
||||
|
||||
At this point, only three steps remain:
|
||||
|
||||
1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint.
|
||||
1. Define your training hyperparameters in [`TrainingArguments`]. The only required parameter is `output_dir`, which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [`Trainer`] will evaluate the accuracy and save the training checkpoint.
|
||||
2. Pass the training arguments to [`Trainer`] along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
|
||||
3. Call [`~Trainer.train`] to finetune your model.
|
||||
3. Call [`~Trainer.train`] to fine-tune your model.
|
||||
|
||||
|
||||
```py
|
||||
@ -252,15 +252,15 @@ Once training is completed, share your model to the Hub with the [`~transformers
|
||||
|
||||
<Tip>
|
||||
|
||||
For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
|
||||
For a more in-depth example of how to fine-tune a model for audio classification, take a look at the corresponding [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
|
||||
|
||||
</Tip>
|
||||
|
||||
## Inference
|
||||
|
||||
Great, now that you've finetuned a model, you can use it for inference!
|
||||
Great, now that you've fine-tuned a model, you can use it for inference!
|
||||
|
||||
Load an audio file you'd like to run inference on. Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to!
|
||||
Load an audio file for inference. Remember to resample the sampling rate of the audio file to match the model's sampling rate, if necessary.
|
||||
|
||||
```py
|
||||
>>> from datasets import load_dataset, Audio
|
||||
@ -271,7 +271,7 @@ Load an audio file you'd like to run inference on. Remember to resample the samp
|
||||
>>> audio_file = dataset[0]["audio"]["path"]
|
||||
```
|
||||
|
||||
The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it:
|
||||
The simplest way to try out your fine-tuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for audio classification with your model, and pass your audio file to it:
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
@ -419,7 +419,7 @@ Get the class with the highest probability:
|
||||
```py
|
||||
>>> predicted_class = logits.argmax().item()
|
||||
>>> predicted_class
|
||||
'0'
|
||||
0
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -448,7 +448,7 @@ Get the class with the highest probability:
|
||||
```py
|
||||
>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
|
||||
>>> predicted_class
|
||||
'0'
|
||||
0
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
@ -325,7 +325,7 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no
|
||||
|
||||
Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance.
|
||||
|
||||
If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course!
|
||||
If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course!
|
||||
|
||||
## Inference
|
||||
|
||||
@ -397,7 +397,7 @@ Tokenize the text and return TensorFlow tensors:
|
||||
>>> from transformers import AutoTokenizer
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
|
||||
>>> inputs = tokenizer(question, text, return_tensors="tf")
|
||||
>>> inputs = tokenizer(question, context, return_tensors="tf")
|
||||
```
|
||||
|
||||
Pass your inputs to the model and return the `logits`:
|
||||
|
@ -283,7 +283,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
|
||||
```
|
||||
|
||||
Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
|
||||
|
@ -290,7 +290,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]:
|
||||
```py
|
||||
>>> from transformers.keras_callbacks import KerasMetricCallback
|
||||
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
|
||||
>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
|
||||
```
|
||||
|
||||
Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
|
||||
|
@ -144,4 +144,4 @@ print(processor.decode(output[0][2:], skip_special_tokens=True)[len(user_prompt)
|
||||
|
||||
And voila!
|
||||
|
||||
To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../image_text_to_text) task guide because these models work similarly.
|
||||
To learn more about chat templates and token streaming for video-text-to-text models, refer to the [image-text-to-text](../tasks/image_text_to_text) task guide because these models work similarly.
|
@ -385,8 +385,8 @@ Una característica particularmente interesante de 🤗 Transformers es la habil
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -394,8 +394,8 @@ Una característica particularmente interesante de 🤗 Transformers es la habil
|
||||
```py
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
@ -1,30 +1,36 @@
|
||||
- sections:
|
||||
- local: index
|
||||
title: 🤗 Transformers
|
||||
- local: quicktour
|
||||
title: Visite rapide
|
||||
- local: installation
|
||||
title: Installation
|
||||
- local: index
|
||||
title: 🤗 Transformers
|
||||
- local: quicktour
|
||||
title: Visite rapide
|
||||
- local: installation
|
||||
title: Installation
|
||||
title: Démarrer
|
||||
- sections:
|
||||
- local: tutoriel_pipeline
|
||||
title: Pipelines pour l'inférence
|
||||
- local: autoclass_tutorial
|
||||
title: Chargement d'instances pré-entraînées avec une AutoClass
|
||||
- local: in_translation
|
||||
title: Préparation des données
|
||||
- local: in_translation
|
||||
title: Fine-tune un modèle pré-entraîné
|
||||
- local: run_scripts_fr
|
||||
title: Entraînement avec un script
|
||||
- local: in_translation
|
||||
title: Entraînement distribué avec 🤗 Accelerate
|
||||
- local: in_translation
|
||||
title: Chargement et entraînement des adaptateurs avec 🤗 PEFT
|
||||
- local: in_translation
|
||||
title: Partager un modèle
|
||||
- local: in_translation
|
||||
title: Agents
|
||||
- local: in_translation
|
||||
title: Génération avec LLMs
|
||||
- local: tutoriel_pipeline
|
||||
title: Pipelines pour l'inférence
|
||||
- local: autoclass_tutorial
|
||||
title: Chargement d'instances pré-entraînées avec une AutoClass
|
||||
- local: in_translation
|
||||
title: Préparation des données
|
||||
- local: in_translation
|
||||
title: Fine-tune un modèle pré-entraîné
|
||||
- local: run_scripts_fr
|
||||
title: Entraînement avec un script
|
||||
- local: in_translation
|
||||
title: Entraînement distribué avec 🤗 Accelerate
|
||||
- local: in_translation
|
||||
title: Chargement et entraînement des adaptateurs avec 🤗 PEFT
|
||||
- local: in_translation
|
||||
title: Partager un modèle
|
||||
- local: in_translation
|
||||
title: Agents
|
||||
- local: in_translation
|
||||
title: Génération avec LLMs
|
||||
title: Tutoriels
|
||||
- sections:
|
||||
- local: task_summary
|
||||
title: Ce que 🤗 Transformers peut faire
|
||||
- local: tasks_explained
|
||||
title: Comment 🤗 Transformers résout ces tâches
|
||||
title: Guides conceptuels
|
@ -354,8 +354,8 @@ Une fonctionnalité particulièrement cool 🤗 Transformers est la possibilité
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -363,8 +363,8 @@ Une fonctionnalité particulièrement cool 🤗 Transformers est la possibilité
|
||||
```py
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
341
docs/source/fr/task_summary.md
Normal file
@ -0,0 +1,341 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Ce que 🤗 Transformers peut faire
|
||||
|
||||
🤗 Transformers est une bibliothèque de modèles préentraînés à la pointe de la technologie pour les tâches de traitement du langage naturel (NLP), de vision par ordinateur et de traitement audio et de la parole. Non seulement la bibliothèque contient des modèles Transformer, mais elle inclut également des modèles non-Transformer comme des réseaux convolutionnels modernes pour les tâches de vision par ordinateur. Si vous regardez certains des produits grand public les plus populaires aujourd'hui, comme les smartphones, les applications et les téléviseurs, il est probable qu'une technologie d'apprentissage profond soit derrière. Vous souhaitez supprimer un objet de fond d'une photo prise avec votre smartphone ? C'est un exemple de tâche de segmentation panoptique (ne vous inquiétez pas si vous ne savez pas encore ce que cela signifie, nous le décrirons dans les sections suivantes !).
|
||||
|
||||
Cette page fournit un aperçu des différentes tâches de traitement de la parole et de l'audio, de vision par ordinateur et de NLP qui peuvent être résolues avec la bibliothèque 🤗 Transformers en seulement trois lignes de code !
|
||||
|
||||
## Audio
|
||||
|
||||
Les tâches de traitement audio et de la parole sont légèrement différentes des autres modalités principalement parce que l'audio en tant que donnée d'entrée est un signal continu. Contrairement au texte, un signal audio brut ne peut pas être discrétisé de la manière dont une phrase peut être divisée en mots. Pour contourner cela, le signal audio brut est généralement échantillonné à intervalles réguliers. Si vous prenez plus d'échantillons dans un intervalle, le taux d'échantillonnage est plus élevé et l'audio ressemble davantage à la source audio originale.
|
||||
|
||||
Les approches précédentes prétraitaient l'audio pour en extraire des caractéristiques utiles. Il est maintenant plus courant de commencer les tâches de traitement audio et de la parole en donnant directement le signal audio brut à un encodeur de caractéristiques (*feature encoder* en anglais) pour extraire une représentation de l'audio. Cela correspond à l'étape de prétraitement et permet au modèle d'apprendre les caractéristiques les plus essentielles du signal.
|
||||
|
||||
### Classification audio
|
||||
|
||||
La classification audio est une tâche qui consiste à attribuer une classe, parmi un ensemble de classes prédéfini, à un audio. La classification audio englobe de nombreuses applications spécifiques, dont certaines incluent :
|
||||
|
||||
* la classification d'environnements sonores : attribuer une classe (catégorie) à l'audio pour indiquer l'environnement associé, tel que "bureau", "plage" ou "stade".
|
||||
* la détection d'événements sonores : étiqueter l'audio avec une étiquette d'événement sonore ("klaxon de voiture", "appel de baleine", "verre brisé")
|
||||
* l'identification d'éléments sonores : attribuer des tags (*étiquettes* en français) à l'audio pour marquer des sons spécifiques, comme "chant des oiseaux" ou "identification du locuteur lors d'une réunion".
|
||||
* la classification musicale : attribuer un genre à la musique, comme "metal", "hip-hop" ou "country".
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er")
|
||||
>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> preds
|
||||
[{'score': 0.4532, 'label': 'hap'},
|
||||
{'score': 0.3622, 'label': 'sad'},
|
||||
{'score': 0.0943, 'label': 'neu'},
|
||||
{'score': 0.0903, 'label': 'ang'}]
|
||||
```
|
||||
|
||||
### Reconnaissance vocale
|
||||
|
||||
La reconnaissance vocale (*Automatic Speech Recognition* ou ASR en anglais) transcrit la parole en texte. C'est l'une des tâches audio les plus courantes en partie parce que la parole est la forme de communication la plus naturelle pour nous, humains. Aujourd'hui, les systèmes ASR sont intégrés dans des produits technologiques "intelligents" comme les enceintes, les téléphones et les voitures. Il est désormais possible de demander à nos assistants virtuels de jouer de la musique, de définir des rappels et de nous indiquer la météo.
|
||||
|
||||
Mais l'un des principaux défis que les architectures Transformer contribuent à résoudre est celui des langues à faibles ressources, c'est-à-dire des langues pour lesquelles il existe peu de données étiquetées. En préentraînant sur de grandes quantités de données vocales d'une autre langue plus ou moins similaire, le réglage fin (*fine-tuning* en anglais) du modèle avec seulement une heure de données vocales étiquetées dans une langue à faibles ressources peut tout de même produire des résultats de haute qualité, comparés aux systèmes ASR précédents entraînés sur 100 fois plus de données étiquetées.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")
|
||||
>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
|
||||
{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
|
||||
```
|
||||
|
||||
## Vision par ordinateur
|
||||
|
||||
L'une des premières réussites en vision par ordinateur a été la reconnaissance des numéros de code postal à l'aide d'un [réseau de neurones convolutionnel (CNN)](glossary#convolution). Une image est composée de pixels, chacun ayant une valeur numérique, ce qui permet de représenter facilement une image sous forme de matrice de valeurs de pixels. Chaque combinaison de valeurs de pixels correspond aux couleurs d'une image.
|
||||
|
||||
Il existe deux approches principales pour résoudre les tâches de vision par ordinateur :
|
||||
|
||||
1. Utiliser des convolutions pour apprendre les caractéristiques hiérarchiques d'une image, des détails de bas niveau aux éléments abstraits de plus haut niveau.
|
||||
2. Diviser l'image en morceaux (*patches* en anglais) et utiliser un Transformer pour apprendre progressivement comment chaque morceau est lié aux autres pour former l'image complète. Contrairement à l'approche ascendante des CNNs, cette méthode ressemble à un processus où l'on démarre avec une image floue pour ensuite la mettre au point petit à petit.
|
||||
|
||||
### Classification d'images
|
||||
|
||||
La classification d'images consiste à attribuer une classe, parmi un ensemble de classes prédéfini, à toute une image. Comme pour la plupart des tâches de classification, les cas d'utilisation pratiques sont nombreux, notamment :
|
||||
|
||||
- Santé : classification d'images médicales pour détecter des maladies ou surveiller l'état de santé des patients.
|
||||
- Environnement : classification d'images satellites pour suivre la déforestation, aider à la gestion des terres ou détecter les incendies de forêt.
|
||||
- Agriculture : classification d'images de cultures pour surveiller la santé des plantes ou des images satellites pour analyser l'utilisation des terres.
|
||||
- Écologie : classification d'images d'espèces animales ou végétales pour suivre les populations fauniques ou les espèces menacées.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> classifier = pipeline(task="image-classification")
|
||||
>>> preds = classifier(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
... )
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> print(*preds, sep="\n")
|
||||
{'score': 0.4335, 'label': 'lynx, catamount'}
|
||||
{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}
|
||||
{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}
|
||||
{'score': 0.0239, 'label': 'Egyptian cat'}
|
||||
{'score': 0.0229, 'label': 'tiger cat'}
|
||||
```
|
||||
|
||||
### Détection d'objets
|
||||
|
||||
La détection d'objets, à la différence de la classification d'images, identifie plusieurs objets dans une image ainsi que leurs positions, généralement définies par des boîtes englobantes (*bounding boxes* en anglais). Voici quelques exemples d'applications :
|
||||
|
||||
- Véhicules autonomes : détection des objets de la circulation, tels que les véhicules, piétons et feux de signalisation.
|
||||
- Télédétection : surveillance des catastrophes, planification urbaine et prévisions météorologiques.
|
||||
- Détection de défauts : identification des fissures ou dommages structurels dans les bâtiments, ainsi que des défauts de fabrication.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> detector = pipeline(task="object-detection")
|
||||
>>> preds = detector(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
... )
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds]
|
||||
>>> preds
|
||||
[{'score': 0.9865,
|
||||
'label': 'cat',
|
||||
'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}]
|
||||
```
|
||||
|
||||
### Segmentation d'images
|
||||
|
||||
La segmentation d'images est une tâche qui consiste à attribuer une classe à chaque pixel d'une image, ce qui la rend plus précise que la détection d'objets, qui se limite aux boîtes englobantes (*bounding boxes* en anglais). Elle permet ainsi de détecter les objets à la précision du pixel. Il existe plusieurs types de segmentation d'images :
|
||||
|
||||
- Segmentation d'instances : en plus de classifier un objet, elle identifie chaque instance distincte d'un même objet (par exemple, "chien-1", "chien-2").
|
||||
- Segmentation panoptique : combine segmentation sémantique et segmentation d'instances, attribuant à chaque pixel une classe sémantique **et** une instance spécifique.
|
||||
|
||||
Ces techniques sont utiles pour les véhicules autonomes, qui doivent cartographier leur environnement pixel par pixel pour naviguer en toute sécurité autour des piétons et des véhicules. Elles sont également précieuses en imagerie médicale, où la précision au niveau des pixels permet de détecter des anomalies cellulaires ou des caractéristiques d'organes. Dans le commerce en ligne, la segmentation est utilisée pour des essayages virtuels de vêtements ou des expériences de réalité augmentée, en superposant des objets virtuels sur des images du monde réel via la caméra.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> segmenter = pipeline(task="image-segmentation")
|
||||
>>> preds = segmenter(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
... )
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> print(*preds, sep="\n")
|
||||
{'score': 0.9879, 'label': 'LABEL_184'}
|
||||
{'score': 0.9973, 'label': 'snow'}
|
||||
{'score': 0.9972, 'label': 'cat'}
|
||||
```
|
||||
|
||||
### Estimation de la profondeur
|
||||
|
||||
L'estimation de la profondeur consiste à prédire la distance de chaque pixel d'une image par rapport à la caméra. Cette tâche est cruciale pour comprendre et reconstruire des scènes réelles. Par exemple, pour les voitures autonomes, il est essentiel de déterminer la distance des objets tels que les piétons, les panneaux de signalisation et les autres véhicules pour éviter les collisions. L'estimation de la profondeur permet également de créer des modèles 3D à partir d'images 2D, ce qui est utile pour générer des représentations détaillées de structures biologiques ou de bâtiments.
|
||||
|
||||
Il existe deux principales approches pour estimer la profondeur :
|
||||
|
||||
- Stéréo : la profondeur est estimée en comparant deux images d'une même scène prises sous des angles légèrement différents.
|
||||
- Monoculaire : la profondeur est estimée à partir d'une seule image.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> depth_estimator = pipeline(task="depth-estimation")
|
||||
>>> preds = depth_estimator(
|
||||
... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
|
||||
... )
|
||||
```
|
||||
|
||||
## Traitement du langage naturel
|
||||
|
||||
Les tâches de traitement du langage naturel (*Natural Language Processing* ou *NLP* en anglais) sont courantes car le texte est une forme naturelle de communication pour nous. Pour qu'un modèle puisse traiter le texte, celui-ci doit être *tokenisé*, c'est-à-dire divisé en mots ou sous-mots appelés "*tokens*", puis converti en nombres. Ainsi, une séquence de texte peut être représentée comme une séquence de nombres, qui peut ensuite être utilisée comme données d'entrée pour un modèle afin de résoudre diverses tâches de traitement du langage naturel.
|
||||
|
||||
### Classification de texte
|
||||
|
||||
La classification de texte attribue une classe à une séquence de texte (au niveau d'une phrase, d'un paragraphe ou d'un document) à partir d'un ensemble de classes prédéfini. Voici quelques applications pratiques :
|
||||
|
||||
- **Analyse des sentiments** : étiqueter le texte avec une polarité telle que `positive` ou `négative`, ce qui aide à la prise de décision dans des domaines comme la politique, la finance et le marketing.
|
||||
- **Classification de contenu** : organiser et filtrer les informations en attribuant des *tags* sur des sujets spécifiques, comme `météo`, `sports` ou `finance`, dans les flux d'actualités et les réseaux sociaux.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> classifier = pipeline(task="sentiment-analysis")
|
||||
>>> preds = classifier("Hugging Face is the best thing since sliced bread!")
|
||||
>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
|
||||
>>> preds
|
||||
[{'score': 0.9991, 'label': 'POSITIVE'}]
|
||||
```
|
||||
|
||||
### Classification des tokens
|
||||
|
||||
Dans les tâches de traitement du langage naturel, le texte est d'abord prétraité en le séparant en mots ou sous-mots individuels, appelés *[tokens](glossary#token)*. La classification des tokens attribue une classe à chaque token à partir d'un ensemble de classes prédéfini.
|
||||
|
||||
Voici deux types courants de classification des tokens :
|
||||
|
||||
- **Reconnaissance d'entités nommées (*Named Entity Recognition* ou *NER* en anglais)** : étiqueter un token selon une catégorie d'entité, telle qu'organisation, personne, lieu ou date. La NER est particulièrement utilisée dans les contextes biomédicaux pour identifier des gènes, des protéines et des noms de médicaments.
|
||||
- **Étiquetage des parties du discours (*Part of Speech* ou *POS* en anglais)** : étiqueter un token en fonction de sa partie du discours, comme nom, verbe ou adjectif. Le POS est utile pour les systèmes de traduction afin de comprendre comment deux mots identiques peuvent avoir des rôles grammaticaux différents (par exemple, "porte" comme nom versus "porte" comme forme conjuguée du verbe "porter").
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> classifier = pipeline(task="ner")
|
||||
>>> preds = classifier("Hugging Face is a French company based in New York City.")
|
||||
>>> preds = [
|
||||
... {
|
||||
... "entity": pred["entity"],
|
||||
... "score": round(pred["score"], 4),
|
||||
... "index": pred["index"],
|
||||
... "word": pred["word"],
|
||||
... "start": pred["start"],
|
||||
... "end": pred["end"],
|
||||
... }
|
||||
... for pred in preds
|
||||
... ]
|
||||
>>> print(*preds, sep="\n")
|
||||
{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
|
||||
{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
|
||||
{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
|
||||
{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24}
|
||||
{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45}
|
||||
{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50}
|
||||
{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55}
|
||||
```
|
||||
|
||||
### Réponse à des questions - (*Question Answering*)
|
||||
|
||||
La réponse à des questions (*Question Answering* ou *QA* en anglais) est une tâche de traitement du langage naturel qui consiste à fournir une réponse à une question, parfois avec l'aide d'un contexte (domaine ouvert) et d'autres fois sans contexte (domaine fermé). Cette tâche intervient lorsqu'on interroge un assistant virtuel, par exemple pour savoir si un restaurant est ouvert. Elle est également utilisée pour le support client ou technique, et pour aider les moteurs de recherche à fournir des informations pertinentes.
|
||||
|
||||
Il existe deux types courants de réponse à des questions :
|
||||
|
||||
- **Extractive** : pour une question donnée et un contexte fourni, la réponse est extraite directement du texte du contexte par le modèle.
|
||||
- **Abstractive** : pour une question donnée et un contexte, la réponse est générée à partir du contexte. Cette approche utilise le [`Text2TextGenerationPipeline`] plutôt que le [`QuestionAnsweringPipeline`] montré ci-dessous.
|
||||
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> question_answerer = pipeline(task="question-answering")
|
||||
>>> preds = question_answerer(
|
||||
... question="What is the name of the repository?",
|
||||
... context="The name of the repository is huggingface/transformers",
|
||||
... )
|
||||
>>> print(
|
||||
... f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
|
||||
... )
|
||||
score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
|
||||
```
|
||||
|
||||
### Résumé de texte - (*Summarization*)
|
||||
|
||||
Le résumé de texte consiste à créer une version plus courte d'un texte tout en conservant l'essentiel du sens du document original. C'est une tâche de séquence à séquence qui produit un texte plus condensé à partir du texte initial. Cette technique est utile pour aider les lecteurs à saisir rapidement les points clés de longs documents, comme les projets de loi, les documents juridiques et financiers, les brevets, et les articles scientifiques.
|
||||
|
||||
Il existe deux types courants de résumé :
|
||||
|
||||
- **Extractive** : identifier et extraire les phrases les plus importantes du texte original.
|
||||
- **Abstractive** : générer un résumé qui peut inclure des mots nouveaux non présents dans le texte d'origine. Le [`SummarizationPipeline`] utilise l'approche abstractive.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> summarizer = pipeline(task="summarization")
|
||||
>>> summarizer(
|
||||
... "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles."
|
||||
... )
|
||||
[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}]
|
||||
```
|
||||
|
||||
### Traduction
|
||||
|
||||
La traduction convertit un texte d'une langue à une autre. Elle facilite la communication entre personnes de différentes langues, permet de toucher des audiences plus larges et peut aussi servir d'outil d'apprentissage pour ceux qui apprennent une nouvelle langue. Comme le résumé de texte, la traduction est une tâche de séquence à séquence, où le modèle reçoit une séquence d'entrée (un texte est ici vu comme une séquence de mots, ou plus précisément de tokens) et produit une séquence de sortie dans la langue cible.
|
||||
|
||||
Initialement, les modèles de traduction étaient principalement monolingues, mais il y a eu récemment un intérêt croissant pour les modèles multilingues capables de traduire entre plusieurs paires de langues.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
|
||||
>>> translator = pipeline(task="translation", model="google-t5/t5-small")
|
||||
>>> translator(text)
|
||||
[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
|
||||
```
|
||||
|
||||
### Modélisation du langage
|
||||
|
||||
La modélisation du langage consiste à prédire un mot dans un texte. Cette tâche est devenue très populaire en traitement du langage naturel, car un modèle de langage préentraîné sur cette tâche peut ensuite être ajusté (*fine-tuning* en anglais) pour accomplir de nombreuses autres tâches. Récemment, les grands modèles de langage (LLMs) ont suscité beaucoup d'intérêt pour leur capacité à apprendre avec peu ou pas de données spécifiques à une tâche, ce qui leur permet de résoudre des problèmes pour lesquels ils n'ont pas été explicitement entraînés. Ces modèles peuvent générer du texte fluide et convaincant, bien qu'il soit important de vérifier leur précision.
|
||||
|
||||
Il existe deux types de modélisation du langage :
|
||||
|
||||
- **Causale** : le modèle prédit le token suivant dans une séquence, avec les tokens futurs masqués.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
|
||||
>>> prompt = "Hugging Face is a community-based open-source platform for machine learning."
|
||||
>>> generator = pipeline(task="text-generation")
|
||||
>>> generator(prompt) # doctest: +SKIP
|
||||
```
|
||||
|
||||
- **Masquée** : le modèle prédit un token masqué dans une séquence en ayant accès à tous les autres tokens de la séquence (passé et futur).
|
||||
|
||||
```py
|
||||
>>> text = "Hugging Face is a community-based open-source <mask> for machine learning."
|
||||
>>> fill_mask = pipeline(task="fill-mask")
|
||||
>>> preds = fill_mask(text, top_k=1)
|
||||
>>> preds = [
|
||||
... {
|
||||
... "score": round(pred["score"], 4),
|
||||
... "token": pred["token"],
|
||||
... "token_str": pred["token_str"],
|
||||
... "sequence": pred["sequence"],
|
||||
... }
|
||||
... for pred in preds
|
||||
... ]
|
||||
>>> preds
|
||||
[{'score': 0.2236,
|
||||
'token': 1761,
|
||||
'token_str': ' platform',
|
||||
'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}]
|
||||
```
|
||||
|
||||
## Multimodal
|
||||
|
||||
Les tâches multimodales nécessitent qu'un modèle traite plusieurs types de données (texte, image, audio, vidéo) pour résoudre un problème spécifique. Par exemple, la génération de légendes pour les images est une tâche multimodale où le modèle prend une image en entrée et produit une séquence de texte décrivant l'image ou ses propriétés.
|
||||
|
||||
Bien que les modèles multimodaux traitent divers types de données, ils convertissent toutes ces données en *embeddings* (vecteurs ou listes de nombres contenant des informations significatives). Pour des tâches comme la génération de légendes pour les images, le modèle apprend les relations entre les *embeddings* d'images et ceux de texte.
|
||||
|
||||
### Réponse à des questions sur des documents - (*Document Question Answering*)
|
||||
|
||||
La réponse à des questions sur des documents consiste à répondre à des questions en langage naturel en utilisant un document comme référence. Contrairement à la réponse à des questions au niveau des tokens, qui prend du texte en entrée, cette tâche prend une image d'un document ainsi qu'une question concernant ce document, et fournit une réponse. Elle est utile pour analyser des données structurées et extraire des informations clés. Par exemple, à partir d'un reçu, on peut extraire des informations telles que le montant total et la monnaie à rendre.
|
||||
|
||||
```py
|
||||
>>> from transformers import pipeline
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
|
||||
>>> url = "https://huggingface.co/datasets/hf-internal-testing/example-documents/resolve/main/jpeg_images/2.jpg"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices")
|
||||
>>> preds = doc_question_answerer(
|
||||
... question="What is the total amount?",
|
||||
... image=image,
|
||||
... )
|
||||
>>> preds
|
||||
[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}]
|
||||
```
|
||||
|
||||
En espérant que cette page vous ait donné plus d'informations sur les différents types de tâches dans chaque modalité et l'importance pratique de chacune d'elles. Dans la [section suivante](tasks_explained), vous découvrirez **comment** 🤗 Transformers fonctionne pour résoudre ces tâches.
|
294
docs/source/fr/tasks_explained.md
Normal file
@ -0,0 +1,294 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Comment 🤗 Transformers résout ces tâches
|
||||
|
||||
Dans [Ce que 🤗 Transformers peut faire](task_summary), vous avez découvert les tâches de traitement du langage naturel (NLP), de traitement de la parole et de l'audio, de vision par ordinateur, ainsi que certaines de leurs applications importantes. Cette page se penche sur la manière dont les modèles résolvent ces tâches et explique les processus en arrière-plan. Bien que différents modèles puissent utiliser diverses techniques ou approches innovantes, les modèles Transformer suivent généralement une idée commune. Grâce à leur architecture flexible, la plupart des modèles sont basés sur un encodeur, un décodeur ou une combinaison encodeur-décodeur. En plus des modèles Transformer, notre bibliothèque comprend également des réseaux de neurones convolutifs (CNN), qui restent utilisés pour les tâches de vision par ordinateur. Nous expliquerons aussi le fonctionnement d'un CNN moderne.
|
||||
|
||||
Voici comment différents modèles résolvent des tâches spécifiques :
|
||||
|
||||
- [Wav2Vec2](model_doc/wav2vec2) pour la classification audio et la reconnaissance vocale (*ASR* en anglais)
|
||||
- [Vision Transformer (ViT)](model_doc/vit) et [ConvNeXT](model_doc/convnext) pour la classification d'images
|
||||
- [DETR](model_doc/detr) pour la détection d'objets
|
||||
- [Mask2Former](model_doc/mask2former) pour la segmentation d'images
|
||||
- [GLPN](model_doc/glpn) pour l'estimation de la profondeur
|
||||
- [BERT](model_doc/bert) pour les tâches de traitement du langage naturel telles que la classification de texte, la classification des tokens et la réponse à des questions, qui reposent sur un encodeur
|
||||
- [GPT2](model_doc/gpt2) pour les tâches de traitement du langage naturel telles que la génération de texte, qui reposent sur un décodeur
|
||||
- [BART](model_doc/bart) pour les tâches de traitement du langage naturel telles que le résumé de texte et la traduction, qui reposent sur un encodeur-décodeur
|
||||
|
||||
<Tip>
|
||||
|
||||
Avant de poursuivre, il est utile d'avoir quelques connaissances de base sur l'architecture des Transformers. Comprendre le fonctionnement des encodeurs, des décodeurs et du mécanisme d'attention vous aidera à saisir comment les différents modèles Transformer fonctionnent. Si vous débutez ou avez besoin d'un rappel, consultez notre [cours](https://huggingface.co/course/chapter1/4?fw=pt) pour plus d'informations !
|
||||
|
||||
</Tip>
|
||||
|
||||
## Parole et audio
|
||||
|
||||
[Wav2Vec2](model_doc/wav2vec2) est un modèle auto-supervisé qui est préentraîné sur des données de parole non étiquetées et ajusté sur des données étiquetées pour des tâches telles que la classification audio et la reconnaissance vocale (ASR).
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/wav2vec2_architecture.png"/>
|
||||
</div>
|
||||
|
||||
Ce modèle comporte quatre composants principaux :
|
||||
|
||||
1. **Encodeur de caractéristiques** (*feature encoder*): Il prend le signal audio brut, le normalise pour avoir une moyenne nulle et une variance unitaire, et le convertit en une séquence de vecteurs de caractéristiques, chacun représentant une durée de 20 ms.
|
||||
|
||||
2. **Module de quantification** (*quantization module*): Les vecteurs de caractéristiques sont passés à ce module pour apprendre des unités de parole discrètes. Chaque vecteur est associé à un *codebook* (une collection de mots de code), et l'unité de parole la plus représentative est sélectionnée parmi celles du codebook et transmise au modèle.
|
||||
|
||||
3. **Réseau de contexte** (*context network*): Environ la moitié des vecteurs de caractéristiques sont masqués aléatoirement. Les vecteurs masqués sont ensuite envoyés à un *réseau de contexte*, qui est un encodeur qui ajoute des embeddings positionnels relatifs.
|
||||
|
||||
4. **Tâche contrastive** (*contrastive task*): Le réseau de contexte est préentraîné avec une tâche contrastive. Pour chaque prédiction masquée, le modèle doit retrouver la véritable unité de parole quantifiée parmi un ensemble de fausses propositions, ce qui le pousse à rapprocher le vecteur de contexte de l'unité de parole quantifiée la plus similaire.
|
||||
|
||||
Une fois préentraîné, Wav2Vec2 peut être ajusté sur vos propres données pour des tâches comme la classification audio ou la reconnaissance automatique de la parole !
|
||||
|
||||
### Classification audio
|
||||
|
||||
Pour utiliser le modèle préentraîné pour la classification audio, ajoutez une tête de classification de séquence au-dessus du modèle Wav2Vec2 de base. Cette tête de classification est une couche linéaire qui reçoit les états cachés (*hidden states*) de l'encodeur. Ces états cachés, qui représentent les caractéristiques apprises de chaque trame audio, peuvent avoir des longueurs variables. Pour obtenir un vecteur de longueur fixe, les états cachés sont d'abord regroupés, puis transformés en logits correspondant aux étiquettes de classe. La perte d'entropie croisée est calculée entre les logits et la cible pour déterminer la classe la plus probable.
|
||||
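À titre d'illustration, voici une esquisse minimale de cette approche avec 🤗 Transformers. Le checkpoint "superb/wav2vec2-base-superb-ks" et la variable `audio` (un signal sous forme de tableau NumPy échantillonné à 16 kHz) sont ici de simples hypothèses d'exemple :

```py
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2ForSequenceClassification

# Hypothèses : un checkpoint déjà affiné pour la classification audio et un signal `audio` à 16 kHz
feature_extractor = AutoFeatureExtractor.from_pretrained("superb/wav2vec2-base-superb-ks")
model = Wav2Vec2ForSequenceClassification.from_pretrained("superb/wav2vec2-base-superb-ks")

inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # états cachés regroupés puis projetés : un logit par classe
predicted_label = model.config.id2label[logits.argmax(-1).item()]
```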
|
||||
Prêt à vous lancer dans la classification audio ? Consultez notre [guide complet de classification audio](tasks/audio_classification) pour apprendre à ajuster Wav2Vec2 et à l'utiliser pour l'inférence !
|
||||
|
||||
### Reconnaissance vocale
|
||||
|
||||
Pour utiliser le modèle préentraîné pour la reconnaissance vocale, ajoutez une tête de modélisation du langage au-dessus du modèle Wav2Vec2 de base pour la [classification temporelle connexionniste (CTC)](glossary#connectionist-temporal-classification-ctc). Cette tête de modélisation du langage est une couche linéaire qui prend les états cachés (*hidden states*) de l'encodeur et les convertit en logits. Chaque logit correspond à une classe de token (le nombre de tokens provient du vocabulaire de la tâche). La perte CTC est calculée entre les logits et les cibles (*targets*) pour identifier la séquence de tokens la plus probable, qui est ensuite décodée en transcription.
|
||||
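Voici, à titre indicatif, une esquisse de ce mécanisme (le checkpoint "facebook/wav2vec2-base-960h" et la variable `audio`, un signal NumPy à 16 kHz, sont des hypothèses d'exemple) :

```py
import torch
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

# Hypothèses : checkpoint déjà affiné pour l'ASR et signal `audio` échantillonné à 16 kHz
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits          # [batch, trames, taille du vocabulaire]

ids = torch.argmax(logits, dim=-1)           # décodage glouton : le token le plus probable par trame
transcription = processor.batch_decode(ids)  # le décodage CTC fusionne les répétitions et retire les "blanks"
```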
|
||||
Prêt à vous lancer dans la reconnaissance automatique de la parole ? Consultez notre [guide complet de reconnaissance automatique de la parole](tasks/asr) pour apprendre à ajuster Wav2Vec2 et à l'utiliser pour l'inférence !
|
||||
|
||||
## Vision par ordinateur
|
||||
|
||||
Il existe deux façons d'aborder les tâches de vision par ordinateur :
|
||||
|
||||
1. **Diviser une image en une séquence de patches** et les traiter en parallèle avec un Transformer.
|
||||
2. **Utiliser un CNN moderne**, comme [ConvNeXT](model_doc/convnext), qui repose sur des couches convolutionnelles mais adopte des conceptions de réseau modernes.
|
||||
|
||||
<Tip>
|
||||
|
||||
Une troisième approche combine les Transformers avec des convolutions (par exemple, [Convolutional Vision Transformer](model_doc/cvt) ou [LeViT](model_doc/levit)). Nous ne discuterons pas de ces approches ici, car elles mélangent simplement les deux approches que nous examinons.
|
||||
|
||||
</Tip>
|
||||
|
||||
ViT et ConvNeXT sont couramment utilisés pour la classification d'images. Pour d'autres tâches de vision par ordinateur comme la détection d'objets, la segmentation et l'estimation de la profondeur, nous examinerons respectivement DETR, Mask2Former et GLPN, qui sont mieux adaptés à ces tâches.
|
||||
|
||||
### Classification d'images
|
||||
|
||||
ViT et ConvNeXT peuvent tous deux être utilisés pour la classification d'images ; la principale différence réside dans leurs approches : ViT utilise un mécanisme d'attention tandis que ConvNeXT repose sur des convolutions.
|
||||
|
||||
#### Transformer
|
||||
|
||||
[ViT](model_doc/vit) remplace entièrement les convolutions par une architecture Transformer pure. Si vous êtes déjà familiarisé avec le Transformer original, vous trouverez que ViT suit des principes similaires, mais adaptés pour traiter les images comme des séquences de patches.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/vit_architecture.jpg"/>
|
||||
</div>
|
||||
|
||||
Le principal changement introduit par ViT concerne la façon dont les images sont fournies à un Transformer :
|
||||
|
||||
1. **Tokenisation des images** : L'image est divisée en patches carrés non chevauchants, chacun étant transformé en un vecteur ou *embedding de patch*. Ces embeddings de patch sont générés à partir d'une couche convolutionnelle 2D pour adapter les dimensions d'entrée (par exemple, 768 valeurs pour chaque embedding de patch). Si vous avez une image de 224x224 pixels, elle peut être divisée en 196 patches de 16x16 pixels. Ainsi, une image est "tokenisée" en une séquence de patches.
|
||||
|
||||
2. **Token `[CLS]`** : Un *embedding apprenable* spécial, appelé token `[CLS]`, est ajouté au début des embeddings de patch, similaire à BERT. L'état caché final du token `[CLS]` est utilisé comme entrée pour la tête de classification attachée, tandis que les autres sorties sont ignorées. Ce token aide le modèle à encoder une représentation globale de l'image.
|
||||
|
||||
3. **Embeddings de position** : Pour que le modèle comprenne l'ordre des patches, des *embeddings de position* sont ajoutés aux embeddings de patch. Ces embeddings de position, également apprenables et de la même taille que les embeddings de patch, permettent au modèle de saisir la structure spatiale de l'image.
|
||||
|
||||
4. **Classification** : Les embeddings, enrichis des embeddings de position, sont ensuite traités par l'encodeur Transformer. La sortie associée au token `[CLS]` est passée à une tête de perceptron multicouche (MLP) pour la classification. La tête MLP convertit cette sortie en logits pour chaque étiquette de classe, et la perte d'entropie croisée est calculée pour déterminer la classe la plus probable.
|
||||
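Pour concrétiser ces étapes, voici une esquisse minimale d'inférence avec ViT (le checkpoint "google/vit-base-patch16-224" et la variable `image`, une image PIL, sont des hypothèses d'exemple) :

```py
import torch
from transformers import AutoImageProcessor, ViTForImageClassification

# Hypothèses : un checkpoint ViT déjà affiné sur ImageNet et une image PIL `image`
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

inputs = image_processor(image, return_tensors="pt")  # redimensionne et normalise l'image avant le découpage en patches
with torch.no_grad():
    logits = model(**inputs).logits                    # logits de classe issus de la sortie du token [CLS]
print(model.config.id2label[logits.argmax(-1).item()])
```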
|
||||
Prêt à vous essayer à la classification d'images ? Consultez notre [guide complet de classification d'images](tasks/image_classification) pour apprendre à ajuster ViT et à l'utiliser pour l'inférence !
|
||||
|
||||
#### CNN
|
||||
|
||||
<Tip>
|
||||
|
||||
Cette section explique brièvement les convolutions, mais il serait utile d'avoir une compréhension préalable de la façon dont elles modifient la forme et la taille d'une image. Si vous n'êtes pas familier avec les convolutions, consultez le [chapitre sur les réseaux de neurones convolutionnels](https://github.com/fastai/fastbook/blob/master/13_convolutions.ipynb) du livre fastai !
|
||||
|
||||
</Tip>
|
||||
|
||||
[ConvNeXT](model_doc/convnext) est une architecture CNN qui adopte des conceptions de réseau modernes pour améliorer les performances. Cependant, les convolutions restent au cœur du modèle. D'un point de vue général, une [convolution](glossary#convolution) est une opération où une matrice plus petite (*noyau*) est multipliée par une petite fenêtre de pixels de l'image. Elle calcule certaines caractéristiques à partir de cette fenêtre, comme une texture particulière ou la courbure d'une ligne. Ensuite, elle se déplace vers la fenêtre suivante de pixels ; la distance parcourue par la convolution est appelée le *stride*.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/convolution.gif"/>
|
||||
</div>
|
||||
|
||||
<small>Une convolution de base sans padding ni stride, tirée de <a href="https://arxiv.org/abs/1603.07285">Un guide des calculs de convolution pour l'apprentissage profond.</a></small>
|
||||
|
||||
Vous pouvez alimenter la sortie d'une couche convolutionnelle à une autre couche convolutionnelle. À chaque couche successive, le réseau apprend des caractéristiques de plus en plus complexes et abstraites, telles que des objets spécifiques comme des hot-dogs ou des fusées. Entre les couches convolutionnelles, il est courant d'ajouter des couches de pooling pour réduire la dimensionnalité et rendre le modèle plus robuste aux variations de position des caractéristiques.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/convnext_architecture.png"/>
|
||||
</div>
|
||||
|
||||
ConvNeXT modernise un CNN de cinq manières :
|
||||
|
||||
1. **Modification du nombre de blocs** : ConvNeXT utilise une approche similaire à ViT en "patchifiant" l'image avec un stride plus grand et une taille de noyau correspondante, divisant ainsi l'image en patches non chevauchants.
|
||||
|
||||
2. **Couche de goulot d'étranglement** (*bottleneck layer*) : Cette couche réduit puis restaure le nombre de canaux pour accélérer les convolutions 1x1, permettant une plus grande profondeur du réseau. Un goulot d'étranglement inversé augmente d'abord le nombre de canaux avant de les réduire, optimisant ainsi l'utilisation de la mémoire.
|
||||
|
||||
3. **Convolution en profondeur** (*depthwise convolution*): Remplace la convolution 3x3 traditionnelle par une convolution appliquée à chaque canal d'entrée séparément, améliorant ainsi la largeur du réseau et ses performances.
|
||||
|
||||
4. **Augmentation de la taille du noyau** : ConvNeXT utilise un noyau de 7x7 pour imiter le champ réceptif global de ViT, ce qui permet de capturer des informations sur une plus grande partie de l'image.
|
||||
|
||||
5. **Changements de conception des couches** : Le modèle adopte des modifications inspirées des Transformers, telles que moins de couches d'activation et de normalisation, l'utilisation de GELU au lieu de ReLU, et LayerNorm plutôt que BatchNorm.
|
||||
|
||||
La sortie des blocs de convolution est ensuite passée à une tête de classification, qui convertit les sorties en logits et calcule la perte d'entropie croisée pour déterminer l'étiquette la plus probable.
|
||||
|
||||
### Détection d'objets
|
||||
|
||||
[DETR](model_doc/detr), *DEtection TRansformer*, est un modèle de détection d'objets de bout en bout qui combine un CNN avec un encodeur-décodeur Transformer.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/detr_architecture.png"/>
|
||||
</div>
|
||||
|
||||
Décomposons le fonctionnement de DETR (DEtection TRansformer) pour la détection d'objets :
|
||||
|
||||
1. **Extraction des caractéristiques avec le CNN** : Un CNN préentraîné, appelé *backbone*, prend une image et génère une carte de caractéristiques (*feature map*) à basse résolution. Une convolution 1x1 est ensuite appliquée pour réduire la dimensionnalité et créer une nouvelle carte de caractéristiques qui représente des abstractions de plus haut niveau de l'image. Cette dernière est ensuite aplatie en une séquence de vecteurs de caractéristiques, qui sont combinés avec des embeddings positionnels.
|
||||
|
||||
2. **Traitement avec l'encodeur et le décodeur** : Les vecteurs de caractéristiques sont passés à l'encodeur, qui apprend les représentations de l'image avec ses couches d'attention. Les états cachés de l'encodeur sont ensuite combinés avec des *object queries* dans le décodeur. Ces *object queries* sont des embeddings appris qui se concentrent sur différentes régions de l'image et sont mis à jour à chaque couche d'attention. Les états cachés du décodeur sont utilisés pour prédire les coordonnées de la boîte englobante (*bounding box*) et le label de la classe pour chaque *object query*, ou `pas d'objet` si aucun objet n'est détecté.
|
||||
|
||||
3. **Perte de correspondance bipartite** : Lors de l'entraînement, DETR utilise une *perte de correspondance bipartite* pour comparer un nombre fixe de prédictions avec un ensemble fixe de labels de vérité terrain. S'il y a moins de labels de vérité terrain que de prédictions (*N* au total), les labels manquants sont complétés avec une classe `pas d'objet`. Cette fonction de perte encourage DETR à trouver une correspondance un à un entre les prédictions et les labels de vérité terrain. Si les boîtes englobantes ou les labels de classe ne sont pas corrects, une perte est encourue. De même, si DETR prédit un objet inexistant, il est pénalisé. Cela encourage DETR à trouver d'autres objets dans l'image au lieu de se concentrer sur un seul objet très proéminent.
|
||||
|
||||
Une tête de détection d'objets est ajoutée au-dessus de DETR pour trouver le label de la classe et les coordonnées de la boîte englobante. Cette tête de détection d'objets comprend deux composants : une couche linéaire pour transformer les états cachés du décodeur en logits sur les labels de classe, et un MLP pour prédire la boîte englobante.
|
||||
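Voici une esquisse indicative de l'inférence avec DETR, y compris le post-traitement qui convertit les prédictions des *object queries* en boîtes englobantes (le checkpoint "facebook/detr-resnet-50" et la variable `image` sont des hypothèses d'exemple) :

```py
import torch
from transformers import AutoImageProcessor, DetrForObjectDetection

# Hypothèses : checkpoint "facebook/detr-resnet-50" et une image PIL `image`
image_processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # logits de classe et boîtes pour chaque object query

# Conversion des prédictions en boîtes (xmin, ymin, xmax, ymax) à l'échelle de l'image originale
target_sizes = torch.tensor([image.size[::-1]])
results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```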
|
||||
Prêt à essayer la détection d'objets ? Consultez notre guide complet sur la [détection d'objets](tasks/object_detection) pour apprendre à affiner DETR et à l'utiliser pour l'inférence !
|
||||
|
||||
### Segmentation d'image
|
||||
|
||||
[Mask2Former](model_doc/mask2former) est une architecture polyvalente conçue pour traiter tous les types de tâches de segmentation d'image. Contrairement aux modèles de segmentation traditionnels, qui sont généralement spécialisés dans des sous-tâches spécifiques comme la segmentation d'instances, sémantique ou panoptique, Mask2Former aborde chaque tâche comme un problème de *classification de masques*. Cette approche regroupe les pixels en *N* segments et prédit pour chaque image *N* masques ainsi que leur étiquette de classe correspondante. Dans cette section, nous vous expliquerons le fonctionnement de Mask2Former et vous aurez la possibilité d'effectuer un réglage fin (*fine-tuning*) de SegFormer à la fin.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/mask2former_architecture.png"/>
|
||||
</div>
|
||||
|
||||
Les principaux composants de Mask2Former sont les suivants :
|
||||
|
||||
1. Un [backbone Swin](model_doc/swin) qui prend une image en entrée et génère une carte de caractéristiques (*feature map*) à basse résolution après trois convolutions successives de 3x3.
|
||||
|
||||
2. Cette carte de caractéristiques est ensuite envoyée à un *décodeur de pixels*, qui augmente progressivement la résolution des caractéristiques pour obtenir des embeddings par pixel en haute résolution. Le décodeur de pixels produit des caractéristiques multi-échelles, comprenant des résolutions de 1/32, 1/16, et 1/8 de l'image originale.
|
||||
|
||||
3. Les cartes de caractéristiques à différentes échelles sont successivement traitées par une couche de décodeur Transformer, permettant de capturer les petits objets à partir des caractéristiques haute résolution. Le point central de Mask2Former est le mécanisme de *masquage d'attention* dans le décodeur. Contrairement à l'attention croisée, qui peut se concentrer sur l'ensemble de l'image, l'attention masquée se focalise uniquement sur certaines zones spécifiques. Cette approche est plus rapide et améliore les performances en permettant au modèle de se concentrer sur les détails locaux de l'image.
|
||||
|
||||
4. À l'instar de [DETR](tasks_explained#object-detection), Mask2Former utilise également des requêtes d'objet apprises, qu'il combine avec les caractéristiques de l'image du décodeur de pixels pour faire des prédictions globales (c'est-à-dire, `étiquette de classe`, `prédiction de masque`). Les états cachés du décodeur sont passés dans une couche linéaire pour être transformés en logits correspondant aux étiquettes de classe. La perte d'entropie croisée est alors calculée entre les logits et l'étiquette de classe pour déterminer la plus probable.
|
||||
|
||||
Les prédictions de masque sont générées en combinant les embeddings de pixels avec les états cachés finaux du décodeur. La perte d'entropie croisée sigmoïde et la perte de Dice sont calculées entre les logits et le masque de vérité terrain pour déterminer le masque le plus probable.
|
||||
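À titre indicatif seulement, voici une esquisse d'inférence panoptique avec Mask2Former (le checkpoint "facebook/mask2former-swin-base-coco-panoptic", la variable `image` et les clés du résultat post-traité sont des hypothèses d'exemple) :

```py
import torch
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation

# Hypothèses : checkpoint panoptique entraîné sur COCO et une image PIL `image`
image_processor = AutoImageProcessor.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")
model = Mask2FormerForUniversalSegmentation.from_pretrained("facebook/mask2former-swin-base-coco-panoptic")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)  # logits de classe et masques prédits pour chaque requête

# Combine les masques et leurs étiquettes en une carte de segmentation panoptique à la taille de l'image
result = image_processor.post_process_panoptic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
segmentation_map = result["segmentation"]   # un identifiant de segment par pixel
segments_info = result["segments_info"]     # classe et score de chaque segment
```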
|
||||
Prêt à vous lancer dans la segmentation d'image ? Consultez notre [guide complet sur la segmentation d'image](tasks/semantic_segmentation) pour apprendre à affiner SegFormer et l'utiliser pour l'inférence !
|
||||
|
||||
### Estimation de la profondeur
|
||||
|
||||
[GLPN](model_doc/glpn), *Global-Local Path Network*, est un Transformer pour l'estimation de profondeur qui combine un encodeur [SegFormer](model_doc/segformer) avec un décodeur léger.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/glpn_architecture.jpg"/>
|
||||
</div>
|
||||
1. Comme avec ViT, une image est divisée en une séquence de patches, mais ces patches sont plus petits. Cette approche est particulièrement adaptée aux tâches de prédiction dense telles que la segmentation ou l'estimation de profondeur. Les patches d'image sont transformés en embeddings (voir la section [classification d'image](#image-classification) pour plus de détails sur la création des embeddings), puis envoyés à l'encodeur.
|
||||
|
||||
2. L'encodeur traite les embeddings de patches à travers plusieurs blocs d'encodeur. Chaque bloc comprend des couches d'attention et de Mix-FFN, conçues pour fournir des informations positionnelles. À la fin de chaque bloc, une couche de *fusion de patches* crée des représentations hiérarchiques. Les caractéristiques des groupes de patches voisins sont concaténées, et une couche linéaire est appliquée pour réduire le nombre de patches à une résolution de 1/4. Ce processus est répété dans les blocs suivants jusqu'à obtenir des caractéristiques d'image avec des résolutions de 1/8, 1/16, et 1/32.
|
||||
|
||||
3. Un décodeur léger prend la dernière carte de caractéristiques (à l'échelle 1/32) de l'encodeur et l'agrandit à l'échelle 1/16. Ensuite, cette caractéristique passe par un module de *Fusion de Caractéristiques Sélective (SFF)*, qui sélectionne et combine les caractéristiques locales et globales à partir d'une carte d'attention pour chaque caractéristique, puis l'agrandit à 1/8. Ce processus est répété jusqu'à ce que les caractéristiques décodées aient la même taille que l'image originale. La sortie est ensuite traitée par deux couches de convolution, suivies d'une activation sigmoïde pour prédire la profondeur de chaque pixel.
|
||||
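Voici une esquisse minimale d'inférence avec GLPN (le checkpoint "vinvino02/glpn-kitti" et la variable `image`, une image PIL, sont des hypothèses d'exemple) :

```py
import torch
from transformers import AutoImageProcessor, GLPNForDepthEstimation

# Hypothèses : checkpoint GLPN entraîné sur KITTI et une image PIL `image`
image_processor = AutoImageProcessor.from_pretrained("vinvino02/glpn-kitti")
model = GLPNForDepthEstimation.from_pretrained("vinvino02/glpn-kitti")

inputs = image_processor(image, return_tensors="pt")
with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth  # [batch, hauteur, largeur] : une valeur de profondeur par pixel
```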
|
||||
## Traitement du langage naturel
|
||||
|
||||
Le Transformer a été initialement conçu pour la traduction automatique, et depuis, il est devenu pratiquement l'architecture par défaut pour résoudre toutes les tâches de traitement du langage naturel (NLP). Certaines tâches se prêtent bien à la structure d'encodeur du Transformer, tandis que d'autres sont mieux adaptées au décodeur. D'autres tâches encore s'appuient sur la structure encodeur-décodeur complète du Transformer.
|
||||
|
||||
### Classification de texte
|
||||
|
||||
[BERT](model_doc/bert) est un modèle basé uniquement sur l'encodeur, qui a été le premier à intégrer efficacement la bidirectionnalité profonde pour obtenir des représentations plus riches du texte en tenant compte des mots en amont et en aval.
|
||||
|
||||
1. BERT utilise la tokenisation [WordPiece](tokenizer_summary#wordpiece) pour générer des embeddings de tokens à partir du texte. Pour différencier une seule phrase d'une paire de phrases, un token spécial `[SEP]` est ajouté. De plus, un token spécial `[CLS]` est placé au début de chaque séquence de texte. La sortie finale associée au token `[CLS]` est utilisée comme entrée pour la tête de classification des tâches. BERT ajoute également un embedding de segment pour indiquer si un token appartient à la première ou à la deuxième phrase dans une paire.
|
||||
|
||||
2. BERT est préentraîné avec deux objectifs : le masquage de mots (*masked language modeling*) et la prédiction de la phrase suivante. Pour le masquage de mots, un pourcentage des tokens d'entrée est masqué aléatoirement, et le modèle doit prédire ces mots. Cela permet de surmonter le problème de la bidirectionnalité, où le modèle pourrait autrement tricher en voyant tous les mots et en "prédisant" le mot suivant. Les états cachés finaux des tokens masqués sont passés à un réseau feedforward avec une fonction softmax sur le vocabulaire pour prédire le mot masqué.
|
||||
|
||||
Le deuxième objectif de préentraînement est la prédiction de la phrase suivante. Le modèle doit déterminer si la phrase B suit la phrase A. Dans la moitié des cas, la phrase B est la phrase suivante, et dans l'autre moitié, elle est aléatoire. Cette prédiction (phrase suivante ou non) est envoyée à un réseau feedforward avec une softmax sur les deux classes (`IsNext` et `NotNext`).
|
||||
|
||||
3. Les embeddings d'entrée sont traités par plusieurs couches d'encodeur pour produire des états cachés finaux.
|
||||
|
||||
Pour utiliser le modèle préentraîné pour la classification de texte, ajoutez une tête de classification de séquence au-dessus du modèle BERT de base. Cette tête est une couche linéaire qui prend les états cachés finaux et les transforme en logits. La perte d'entropie croisée est ensuite calculée entre les logits et les cibles pour déterminer l'étiquette la plus probable.
|
||||
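À titre d'illustration, voici une esquisse de ce schéma avec un checkpoint déjà affiné (le nom "distilbert-base-uncased-finetuned-sst-2-english" est une hypothèse d'exemple) :

```py
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hypothèse : un checkpoint déjà affiné pour l'analyse de sentiments
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

inputs = tokenizer("Hugging Face is the best thing since sliced bread!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # la tête linéaire transforme l'état caché final en un logit par classe
print(model.config.id2label[logits.argmax(-1).item()])
```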
|
||||
Prêt à essayer la classification de texte ? Consultez notre [guide complet sur la classification de texte](tasks/sequence_classification) pour apprendre à effectuer un réglage fin (*fine-tuning*) de DistilBERT et l'utiliser pour l'inférence !
|
||||
|
||||
### Classification de tokens
|
||||
|
||||
Pour utiliser BERT dans des tâches de classification de tokens, comme la reconnaissance d'entités nommées (NER), ajoutez une tête de classification de tokens au-dessus du modèle BERT de base. Cette tête est une couche linéaire qui prend les états cachés finaux et les transforme en logits. La perte d'entropie croisée est ensuite calculée entre les logits et les labels de chaque token pour déterminer l'étiquette la plus probable.
|
||||
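Voici une esquisse indicative de ce fonctionnement (le checkpoint "dslim/bert-base-NER" est une hypothèse d'exemple) :

```py
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Hypothèse : un checkpoint BERT déjà affiné pour la reconnaissance d'entités nommées
checkpoint = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint)

inputs = tokenizer("Hugging Face is a French company based in New York City.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # [batch, nombre de tokens, nombre d'étiquettes] : un logit par token et par classe

tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
labels = [model.config.id2label[i.item()] for i in logits.argmax(-1)[0]]
print(list(zip(tokens, labels)))
```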
|
||||
Prêt à essayer la classification de tokens ? Consultez notre [guide complet sur la classification de tokens](tasks/token_classification) pour découvrir comment effectuer un réglage fin (*fine-tuning*) de DistilBERT et l'utiliser pour l'inférence !
|
||||
|
||||
### Réponse aux questions - (*Question Answering*)
|
||||
|
||||
Pour utiliser BERT pour la réponse aux questions, ajoutez une tête de classification de span au-dessus du modèle BERT de base. Cette tête est une couche linéaire qui transforme les états cachés finaux en logits pour les positions de début et de fin du `span` correspondant à la réponse. La perte d'entropie croisée est calculée entre les logits et les positions réelles pour déterminer le span de texte le plus probable en tant que réponse.
|
||||
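Voici une esquisse de ce mécanisme de prédiction de span (le checkpoint "distilbert-base-cased-distilled-squad" est une hypothèse d'exemple) :

```py
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# Hypothèse : un checkpoint déjà affiné sur SQuAD pour la réponse aux questions
checkpoint = "distilbert-base-cased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)

question = "What is the name of the repository?"
context = "The name of the repository is huggingface/transformers"
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Les logits de début et de fin indiquent les positions du span de réponse dans le contexte
start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
print(tokenizer.decode(inputs["input_ids"][0][start : end + 1]))
```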
|
||||
Prêt à essayer la réponse aux questions ? Consultez notre [guide complet sur la réponse aux questions](tasks/question_answering) pour découvrir comment effectuer un réglage fin (*fine-tuning*) de DistilBERT et l'utiliser pour l'inférence !
|
||||
|
||||
<Tip>
|
||||
|
||||
💡 Une fois BERT préentraîné, il est incroyablement facile de l’adapter à diverses tâches ! Il vous suffit d’ajouter une tête spécifique au modèle préentraîné pour transformer les états cachés en la sortie souhaitée.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Génération de texte
|
||||
|
||||
[GPT-2](model_doc/gpt2) est un modèle basé uniquement sur le décodeur, préentraîné sur une grande quantité de texte. Il peut générer du texte convaincant (bien que parfois inexact !) à partir d'une invite et accomplir d'autres tâches de NLP, comme la réponse aux questions, même s'il n'a pas été spécifiquement entraîné pour ces tâches.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/gpt2_architecture.png"/>
|
||||
</div>
|
||||
|
||||
1. GPT-2 utilise le [byte pair encoding (BPE)](tokenizer_summary#bytepair-encoding-bpe) pour tokeniser les mots et générer des embeddings de tokens. Des encodages positionnels sont ajoutés pour indiquer la position de chaque token dans la séquence. Les embeddings d'entrée passent à travers plusieurs blocs de décodeur pour produire des états cachés finaux. Chaque bloc de décodeur utilise une couche d'*attention masquée*, ce qui signifie que GPT-2 ne peut pas se concentrer sur les tokens futurs et est uniquement autorisé à se focaliser sur les tokens à gauche dans le texte. Cela diffère du token [`mask`] de BERT, car ici, dans l'attention masquée, un masque d'attention est utilisé pour attribuer un score de `0` aux tokens futurs.
|
||||
|
||||
2. La sortie du décodeur est ensuite envoyée à une tête de modélisation du langage, qui effectue une transformation linéaire pour convertir les états cachés en logits. L'étiquette est le token suivant dans la séquence, obtenue en décalant les logits vers la droite d'une position. La perte d'entropie croisée est calculée entre les logits décalés et les étiquettes pour déterminer le token suivant le plus probable.
|
||||
|
||||
L'objectif de préentraînement de GPT-2 est basé sur la [modélisation du langage causale](glossary#causal-language-modeling), qui consiste à prédire le mot suivant dans une séquence. Cette approche rend GPT-2 particulièrement efficace pour les tâches de génération de texte.
|
||||
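Voici une esquisse minimale de génération auto-régressive avec le checkpoint "gpt2" (les paramètres de génération sont des choix d'exemple) :

```py
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Hugging Face is a community-based open-source platform for", return_tensors="pt")
# Le modèle prédit le token suivant, l'ajoute à la séquence, puis recommence (décodage auto-régressif)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```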
|
||||
Prêt à essayer la génération de texte ? Consultez notre [guide complet sur la modélisation du langage causale](tasks/language_modeling#causal-language-modeling) pour découvrir comment effectuer un réglage fin (*fine-tuning*) de DistilGPT-2 et l'utiliser pour l'inférence !
|
||||
|
||||
<Tip>
|
||||
|
||||
Pour plus d'informations sur la génération de texte, consultez le guide sur les [stratégies de génération de texte](generation_strategies) !
|
||||
|
||||
</Tip>
|
||||
|
||||
### Résumé de texte
|
||||
|
||||
Les modèles encodeur-décodeur tels que [BART](model_doc/bart) et [T5](model_doc/t5) sont conçus pour les tâches de résumé en mode séquence-à-séquence. Dans cette section, nous expliquerons le fonctionnement de BART, puis vous aurez l'occasion de découvrir comment réaliser un réglage fin (*fine-tuning*) de T5.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bart_architecture.png"/>
|
||||
</div>
|
||||
|
||||
1. L'architecture de l'encodeur de BART est très similaire à celle de BERT, acceptant des embeddings de tokens et des embeddings positionnels du texte. BART est préentraîné en corrompant l'entrée et en la reconstruisant avec le décodeur. Contrairement à d'autres encodeurs utilisant des stratégies de corruption spécifiques, BART peut appliquer divers types de corruption, parmi lesquelles la stratégie de *text infilling* est la plus efficace. Dans le text infilling, plusieurs segments de texte sont remplacés par un **seul** token [`mask`]. Cette approche est cruciale car elle force le modèle à prédire les tokens masqués et à estimer le nombre de tokens manquants. Les embeddings d'entrée et les spans masqués sont passés à l'encodeur pour produire des états cachés finaux. Contrairement à BERT, BART ne comporte pas de réseau feedforward final pour prédire un mot.
|
||||
|
||||
2. La sortie de l'encodeur est transmise au décodeur, qui doit prédire à la fois les tokens masqués et les tokens non corrompus. Ce contexte supplémentaire aide le décodeur à restaurer le texte original. La sortie du décodeur est ensuite envoyée à une tête de modélisation du langage, qui transforme les états cachés en logits. La perte d'entropie croisée est calculée entre les logits et l'étiquette, qui est simplement le token décalé vers la droite.
|
||||
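À titre d'illustration, voici une esquisse de génération de résumé avec un checkpoint BART déjà affiné (le nom "facebook/bart-large-cnn" est une hypothèse d'exemple) :

```py
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hypothèse : un checkpoint BART déjà affiné pour le résumé d'articles de presse
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

article = (
    "In this work, we presented the Transformer, the first sequence transduction model "
    "based entirely on attention, replacing the recurrent layers most commonly used in "
    "encoder-decoder architectures with multi-headed self-attention."
)
inputs = tokenizer(article, return_tensors="pt", truncation=True)
# L'encodeur lit l'article complet, le décodeur génère le résumé token par token
summary_ids = model.generate(**inputs, max_new_tokens=60)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```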
|
||||
Prêt à essayer le résumé ? Consultez notre [guide complet sur le résumé](tasks/summarization) pour apprendre à effectuer un réglage fin (*fine-tuning*) de T5 et l'utiliser pour l'inférence !
|
||||
|
||||
<Tip>
|
||||
|
||||
Pour plus d'informations sur la génération de texte, consultez le guide sur les [stratégies de génération de texte](generation_strategies) !
|
||||
|
||||
</Tip>
|
||||
|
||||
### Traduction
|
||||
|
||||
La traduction est un autre exemple de tâche séquence-à-séquence, ce qui signifie qu'un modèle encodeur-décodeur comme [BART](model_doc/bart) ou [T5](model_doc/t5) peut être utilisé pour cette tâche. Nous expliquerons ici comment BART fonctionne pour la traduction, puis vous pourrez découvrir comment affiner T5.
|
||||
|
||||
BART adapte le modèle à la traduction en ajoutant un encodeur séparé, initialisé aléatoirement, pour mapper la langue source en une entrée qui peut être décodée dans la langue cible. Les embeddings de cet encodeur sont ensuite passés à l'encodeur préentraîné au lieu des embeddings de mots originaux. L'encodeur source est entraîné en mettant à jour l'encodeur source, les embeddings positionnels et les embeddings d'entrée avec la perte d'entropie croisée provenant de la sortie du modèle. Les paramètres du modèle sont figés lors de cette première étape, et tous les paramètres du modèle sont entraînés ensemble lors de la deuxième étape.
|
||||
|
||||
BART a été suivi par une version multilingue, mBART, qui est spécifiquement conçue pour la traduction et préentraînée sur de nombreuses langues différentes.
|
||||
|
||||
Prêt à essayer la traduction ? Consultez notre [guide complet sur la traduction](tasks/translation) pour apprendre à affiner T5 et l'utiliser pour l'inférence !
|
||||
|
||||
<Tip>
|
||||
|
||||
Pour plus d'informations sur la génération de texte, consultez le guide sur les [stratégies de génération de texte](generation_strategies) !
|
||||
|
||||
</Tip>
|
@ -111,7 +111,7 @@ etichetta: negative, con punteggio: 0.9998
|
||||
La [`pipeline`] può anche iterare su un dataset intero. Inizia installando la libreria [🤗 Datasets](https://huggingface.co/docs/datasets/):
|
||||
|
||||
```bash
|
||||
pip install datasets
|
||||
pip install datasets
|
||||
```
|
||||
|
||||
Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuoi utilizzare.
|
||||
@ -385,8 +385,8 @@ Una caratteristica particolarmente interessante di 🤗 Transformers è la sua a
|
||||
```py
|
||||
>>> from transformers import AutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
```
|
||||
</pt>
|
||||
<tf>
|
||||
@ -394,8 +394,8 @@ Una caratteristica particolarmente interessante di 🤗 Transformers è la sua a
|
||||
```py
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
|
||||
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
|
||||
```
|
||||
</tf>
|
||||
</frameworkcontent>
|
||||
|
@ -34,7 +34,7 @@ BetterTransformerは、テキスト、画像、音声モデルの単一GPUおよ
|
||||
<Tip>
|
||||
|
||||
Flash Attentionは、fp16またはbf16 dtypeを使用しているモデルにのみ使用できます。BetterTransformerを使用する前に、モデルを適切なdtypeにキャストしてください。
|
||||
|
||||
|
||||
</Tip>
|
||||
|
||||
### Decoder models
|
||||
@ -53,11 +53,12 @@ model.to_bettertransformer()
# Use it for training or inference
```

SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels in certain settings, depending on the hardware and the problem size. To enable Flash Attention, or to check whether it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager.
SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels in certain settings, depending on the hardware and the problem size. To enable Flash Attention, or to check whether it is available in a given setting (hardware, problem size), use [`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) as a context manager.

```diff
import torch
+ from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@ -68,7 +69,7 @@ model.to_bettertransformer()
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -105,6 +106,7 @@ BetterTransformerのパフォーマンスの詳細については、この[ブ

```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
@ -118,7 +120,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_c
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -55,8 +55,8 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```
@ -112,7 +112,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_8bit=True,
    attn_implementation="flash_attention_2",
)
@ -130,7 +130,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    attn_implementation="flash_attention_2",
)
@ -149,7 +149,7 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    load_in_4bit=True,
    attn_implementation="flash_attention_2",
)
@ -173,7 +173,7 @@ BetterTransformerは、テキスト、画像、およびオーディオモデル
<Tip>

Flash Attention can only be used with models in fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer.

</Tip>

### Encoder models
@ -214,11 +214,12 @@ model.to_bettertransformer()
# Use it for training or inference
```

SDPA can also use [Flash Attention](https://arxiv.org/abs/2205.14135) kernels depending on the hardware and the problem size. To enable Flash Attention, or to check whether it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager.
SDPA can also use [Flash Attention](https://arxiv.org/abs/2205.14135) kernels depending on the hardware and the problem size. To enable Flash Attention, or to check whether it is available in a given setting (hardware, problem size), use [`torch.nn.attention.sdpa_kernel`](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) as a context manager.

```diff
import torch
+ from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
@ -229,7 +230,7 @@ model.to_bettertransformer()
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+ with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -421,6 +422,7 @@ In this example, the first GPU will use 1GB of memory and the second 2GB.

```py
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
@ -434,7 +436,7 @@ model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_c
input_text = "Hello my dog is cute and"
inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@ -386,8 +386,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```

</pt>
@ -396,8 +396,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</tf>
</frameworkcontent>
@ -128,7 +128,7 @@ DatasetDict({
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
```

The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model.
The MInDS-14 dataset has a sampling rate of 8 kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you need to resample the dataset to 16 kHz to use the pretrained Wav2Vec2 model.

```py
>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
@ -145,34 +145,14 @@
title: (번역중) Getting started
- local: quantization/bitsandbytes
title: bitsandbytes
- local: in_translation
title: (번역중) GPTQ
- local: quantization/gptq
title: GPTQ
- local: quantization/awq
title: AWQ
- local: in_translation
title: (번역중) AQLM
- local: in_translation
title: (번역중) Quanto
- local: in_translation
title: (번역중) EETQ
- local: in_translation
title: (번역중) HQQ
- local: in_translation
title: (번역중) Optimum
- local: in_translation
title: (번역중) Contribute new quantization method
title: (번역중) 경량화 메소드
- sections:
- local: in_translation
title: (번역중) Getting started
- local: in_translation
title: (번역중) bitsandbytes
- local: quantization/gptq
title: GPTQ
- local: in_translation
title: (번역중) AWQ
- local: in_translation
title: (번역중) AQLM
title: (번역중) VPTQ
- local: quantization/quanto
title: Quanto
- local: quantization/eetq
@ -688,8 +668,8 @@
sections:
- local: in_translation
title: (번역중) ALIGN
- local: in_translation
title: (번역중) AltCLIP
- local: model_doc/altclip
title: AltCLIP
- local: model_doc/blip-2
title: BLIP-2
- local: model_doc/blip
@ -375,7 +375,7 @@ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable
Quantization reduces the size of an LLM by storing its weights in a lower precision. This lowers memory usage and makes loading an LLM for inference more feasible when GPU memory is constrained. If you have enough GPU memory, you don't need to quantize the model, because the extra quantize and dequantize steps can add a small latency cost (except for AWQ and fused AWQ modules).

> [!TIP]
> There are many quantization libraries (see the [Quantization](./quantization) guide for more details), such as Quanto, AQLM, AWQ, and AutoGPTQ. Try them and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post, which compares AutoGPTQ and bitsandbytes.
> There are many quantization libraries (see the [Quantization](./quantization) guide for more details), such as Quanto, AQLM, VPTQ, AWQ, and AutoGPTQ. Try them and see which one works best for your use case. We also recommend reading the [Overview of natively supported quantization schemes in 🤗 Transformers](https://hf.co/blog/overview-quantization-transformers) blog post, which compares AutoGPTQ and bitsandbytes.
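As a rough sketch of that memory saving, the snippet below loads a model in 4-bit with bitsandbytes; it assumes the `bitsandbytes` and `accelerate` packages are installed, and the model id is only an example.

```py
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit weights with bf16 compute; roughly quarters the weight memory footprint.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",  # illustrative model id
    quantization_config=quantization_config,
    device_map="auto",
)
```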

Use the Model Memory Calculator below to estimate and compare how much memory is required to load a model. For example, try estimating the memory needed to load [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1).
@ -35,6 +35,10 @@ Transformers에서 지원되지 않는 양자화 기법들은 [`HfQuantizer`]
[[autodoc]] AqlmConfig

## VptqConfig[[transformers.VptqConfig]]

[[autodoc]] VptqConfig

## AwqConfig[[transformers.AwqConfig]]

[[autodoc]] AwqConfig
78 docs/source/ko/model_doc/altclip.md Normal file
@ -0,0 +1,78 @@
# AltCLIP

## Overview[[overview]]

The AltCLIP model was proposed in [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679v2) by Zhongzhi Chen, Guang Liu, Bo-Wen Zhang, Fulong Ye, Qinghong Yang, and Ledell Wu. AltCLIP (Altering the Language Encoder in CLIP for Extended Language Capabilities) is a neural network trained on a variety of image-text and text-text pairs. By replacing CLIP's text encoder with the pretrained multilingual text encoder XLM-R, it achieves performance very close to CLIP on almost all tasks while extending the original CLIP with capabilities such as multilingual understanding.

The abstract from the paper is the following:

*In this work, we present a conceptually simple and effective method to train a strong bilingual multimodal representation model. Starting from the pretrained multimodal representation model CLIP released by OpenAI, we swapped its text encoder with the pretrained multilingual text encoder XLM-R, and aligned the language and image representations through a two-stage training schema consisting of teacher learning and contrastive learning. We validate our method through evaluations on a wide range of tasks. We set new state-of-the-art performance on a number of tasks including ImageNet-CN, Flicker30k-CN, and COCO-CN, and obtain performance very close to CLIP on almost all tasks, suggesting that simply altering the text encoder in CLIP can yield extended capabilities such as multilingual understanding.*

This model was contributed by [jongjyh](https://huggingface.co/jongjyh).

## Usage tips and example[[usage-tips-and-example]]

The usage of AltCLIP is very similar to CLIP; the difference lies in the text encoder. It uses bidirectional attention instead of the usual attention, and it takes the [CLS] token in XLM-R to represent a text embedding.

AltCLIP is a multimodal vision and language model. It can be used for computing image-text similarity and for zero-shot image classification. AltCLIP uses a ViT-like Transformer to obtain visual features and a bidirectional language model to obtain text features. Both the text and visual features are then projected into a latent space with the same dimension. The dot product between the projected image and text features is then used as the similarity score.

To feed the images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, which are then linearly embedded. A [CLS] token is added to serve as a representation of the entire image. The authors also add absolute position embeddings and feed the resulting sequence of vectors to a standard Transformer encoder. [`CLIPImageProcessor`] can be used to resize and normalize images for the model.

[`AltCLIPProcessor`] wraps a [`CLIPImageProcessor`] and an [`XLMRobertaTokenizer`] into a single instance to encode the text and prepare the images. The following example shows how to obtain image-text similarity scores using [`AltCLIPProcessor`] and [`AltCLIPModel`].

```python
>>> from PIL import Image
>>> import requests

>>> from transformers import AltCLIPModel, AltCLIPProcessor

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the probability per label
```

<Tip>

This model is based on `CLIPModel`, so you can use it like the original CLIP.

</Tip>

## AltCLIPConfig

[[autodoc]] AltCLIPConfig
- from_text_vision_configs

## AltCLIPTextConfig

[[autodoc]] AltCLIPTextConfig

## AltCLIPVisionConfig

[[autodoc]] AltCLIPVisionConfig

## AltCLIPProcessor

[[autodoc]] AltCLIPProcessor

## AltCLIPModel

[[autodoc]] AltCLIPModel
- forward
- get_text_features
- get_image_features

## AltCLIPTextModel

[[autodoc]] AltCLIPTextModel
- forward

## AltCLIPVisionModel

[[autodoc]] AltCLIPVisionModel
- forward
@ -361,8 +361,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```
</pt>
<tf>
@ -370,8 +370,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</tf>
</frameworkcontent>
@ -128,7 +128,7 @@ DatasetDict({
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
```

The MInDS-14 dataset has a sampling rate of 8000khz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), so to use the pretrained Wav2Vec2 model you need to resample the dataset to 16000kHz:
The MInDS-14 dataset has a sampling rate of 8 kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), so to use the pretrained Wav2Vec2 model you need to resample the dataset to 16 kHz:

```py
>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
@ -37,7 +37,7 @@ A [`pipeline`] apoia diversas tarefas fora da caixa:
**Text**:
* Sentiment analysis: classifies the polarity of a given text.
* Text generation (in English): generates text from a given input.
* Named entity recognition: labels each word with the entity class it represents (person, date, location, etc.)
* Question answering: extracts an answer given some context and a question.
* Fill-mask: fills in the blank given a text with masked words.
* Summarization: generates a summary of a long text or document.
@ -87,7 +87,7 @@ Importe [`pipeline`] e especifique a tarefa que deseja completar:
>>> classifier = pipeline("sentiment-analysis")
```

The pipeline downloads and caches a default [pretrained model](https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text:

```py
>>> classifier("We are very happy to show you the 🤗 Transformers library.")
@ -107,7 +107,7 @@ label: NEGATIVE, with score: 0.5309
The [`pipeline`] can also iterate over an entire dataset. Start by installing the [🤗 Datasets](https://huggingface.co/docs/datasets/) library:

```bash
pip install datasets
```

Create a [`pipeline`] with the task you want to solve and the model you want to use.
@ -133,7 +133,7 @@ Precisamos garantir que a taxa de amostragem do conjunto de dados corresponda à
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
```

The audio files are automatically loaded and resampled when calling the `"audio"` column.
Let's extract the raw waveform arrays of the first 4 samples and pass them as a list to the pipeline:

```py
@ -176,7 +176,7 @@ Use o [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] para carreg
</tf>
</frameworkcontent>

Then you can specify the model and tokenizer in the [`pipeline`] and apply the `classifier` to your target text:

```py
>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
@ -190,7 +190,7 @@ Se você não conseguir achar um modelo para o seu caso de uso, precisará usar

<Youtube id="AhChOFRegn4"/>

Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`]. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You just need to select the appropriate `AutoClass` for your task and its associated tokenizer with [`AutoTokenizer`].

Let's go back to our example and see how you can use the `AutoClass` to replicate the results of the [`pipeline`].
@ -383,8 +383,8 @@ Um recurso particularmente interessante dos 🤗 Transformers é a capacidade de
```py
>>> from transformers import AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```
</pt>
<tf>
@ -392,8 +392,8 @@ Um recurso particularmente interessante dos 🤗 Transformers é a capacidade de
```py
>>> from transformers import TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</tf>
</frameworkcontent>
@ -366,8 +366,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```
</pt>
<tf>
@ -375,8 +375,8 @@ tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
```py
>>> from transformers import TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</tf>
</frameworkcontent>
Some files were not shown because too many files have changed in this diff.